diff mbox series

[v4,3/3] csu: Implement and use _dl_early_allocate during static startup

Message ID 1b8e10100702d11449eb4ec02dbfbaa1db5d4c9f.1651762968.git.fweimer@redhat.com
State New
Headers show
Series [v4,1/3] ia64: Always define IA64_USE_NEW_STUB as a flag macro | expand

Commit Message

Florian Weimer May 5, 2022, 3:08 p.m. UTC
This implements mmap fallback for a brk failure during TLS
allocation.

scripts/tst-elf-edit.py is updated to support the new patching method.
The script no longer requires that the input object is of ET_DYN
type.
---
v4: Do not call __libc_fatal from generic _dl_early_allocate.
v3: Remove spurious empty line change.
 csu/libc-tls.c                              | 11 ++-
 elf/Makefile                                | 19 +++++
 elf/dl-early_allocate.c                     | 30 ++++++++
 elf/tst-tls-allocation-failure-static.c     | 31 ++++++++
 scripts/tst-elf-edit.py                     | 34 +++++++--
 sysdeps/generic/ldsodefs.h                  |  5 ++
 sysdeps/unix/sysv/linux/dl-early_allocate.c | 81 +++++++++++++++++++++
 7 files changed, 201 insertions(+), 10 deletions(-)
 create mode 100644 elf/dl-early_allocate.c
 create mode 100644 elf/tst-tls-allocation-failure-static.c
 create mode 100644 sysdeps/unix/sysv/linux/dl-early_allocate.c

Comments

Adhemerval Zanella Netto May 5, 2022, 5:48 p.m. UTC | #1
On 05/05/2022 12:08, Florian Weimer via Libc-alpha wrote:
> This implements mmap fallback for a brk failure during TLS
> allocation.
> 
> scripts/tls-elf-edit.py is updated to support the new patching method.
> The script no longer requires that in the input object is of ET_DYN
> type.
> ---
> v4: Do not call __libc_fatal from generic _dl_early_allocate.
> v3: Remove spurious empty line change.
>  csu/libc-tls.c                              | 11 ++-
>  elf/Makefile                                | 19 +++++
>  elf/dl-early_allocate.c                     | 30 ++++++++
>  elf/tst-tls-allocation-failure-static.c     | 31 ++++++++
>  scripts/tst-elf-edit.py                     | 34 +++++++--
>  sysdeps/generic/ldsodefs.h                  |  5 ++
>  sysdeps/unix/sysv/linux/dl-early_allocate.c | 81 +++++++++++++++++++++
>  7 files changed, 201 insertions(+), 10 deletions(-)
>  create mode 100644 elf/dl-early_allocate.c
>  create mode 100644 elf/tst-tls-allocation-failure-static.c
>  create mode 100644 sysdeps/unix/sysv/linux/dl-early_allocate.c
> 
> diff --git a/csu/libc-tls.c b/csu/libc-tls.c
> index bef92a7568..0a216c5502 100644
> --- a/csu/libc-tls.c
> +++ b/csu/libc-tls.c
> @@ -145,11 +145,16 @@ __libc_setup_tls (void)
>       _dl_allocate_tls_storage (in elf/dl-tls.c) does using __libc_memalign
>       and dl_tls_static_align.  */
>    tcb_offset = roundup (memsz + GLRO(dl_tls_static_surplus), max_align);
> -  tlsblock = __sbrk (tcb_offset + TLS_INIT_TCB_SIZE + max_align);
> +  tlsblock = _dl_early_allocate (tcb_offset + TLS_INIT_TCB_SIZE + max_align);
> +  if (tlsblock == NULL)
> +    _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n");
>  #elif TLS_DTV_AT_TP
>    tcb_offset = roundup (TLS_INIT_TCB_SIZE, align ?: 1);
> -  tlsblock = __sbrk (tcb_offset + memsz + max_align
> -		     + TLS_PRE_TCB_SIZE + GLRO(dl_tls_static_surplus));
> +  tlsblock = _dl_early_allocate (tcb_offset + memsz + max_align
> +				 + TLS_PRE_TCB_SIZE
> +				 + GLRO(dl_tls_static_surplus));
> +  if (tlsblock == NULL)
> +    _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n");
>    tlsblock += TLS_PRE_TCB_SIZE;
>  #else
>    /* In case a model with a different layout for the TCB and DTV

Ok.

> diff --git a/elf/Makefile b/elf/Makefile
> index fc9860edee..ce3345ed92 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -33,6 +33,7 @@ routines = \
>    $(all-dl-routines) \
>    dl-addr \
>    dl-addr-obj \
> +  dl-early_allocate \
>    dl-error \
>    dl-iteratephdr \
>    dl-libc \
> @@ -108,6 +109,7 @@ all-dl-routines = $(dl-routines) $(sysdep-dl-routines)
>  # But they are absent from the shared libc, because that code is in ld.so.
>  elide-routines.os = \
>    $(all-dl-routines) \
> +  dl-early_allocate \
>    dl-exception \
>    dl-origin \
>    dl-reloc-static-pie \
> @@ -276,6 +278,7 @@ tests-static-normal := \
>    tst-linkall-static \
>    tst-single_threaded-pthread-static \
>    tst-single_threaded-static \
> +  tst-tls-allocation-failure-static \
>    tst-tlsalign-extern-static \
>    tst-tlsalign-static \
>    # tests-static-normal
> @@ -1213,6 +1216,10 @@ $(objpfx)tst-glibcelf.out: tst-glibcelf.py elf.h $(..)/scripts/glibcelf.py \
>            --cc="$(CC) $(patsubst -DMODULE_NAME=%,-DMODULE_NAME=testsuite,$(CPPFLAGS))" \
>  	  < /dev/null > $@ 2>&1; $(evaluate-test)
>  
> +ifeq ($(run-built-tests),yes)
> +tests-special += $(objpfx)tst-tls-allocation-failure-static-patched.out
> +endif
> +
>  # The test requires shared _and_ PIE because the executable
>  # unit test driver must be able to link with the shared object
>  # that is going to eventually go into an installed DSO.
> @@ -2937,3 +2944,15 @@ $(eval $(call tst-trace-skeleton,4,\
>  	$(objpfx)libtracemod2:$(objpfx)libtracemod3:$(objpfx)libtracemod4))
>  $(eval $(call tst-trace-skeleton,5,\
>  	$(objpfx)libtracemod2:$(objpfx)libtracemod3:$(objpfx)libtracemod4:$(objpfx)libtracemod5))
> +
> +$(objpfx)tst-tls-allocation-failure-static-patched: \
> +  $(objpfx)tst-tls-allocation-failure-static $(..)scripts/tst-elf-edit.py
> +	cp $< $@
> +	$(PYTHON) $(..)scripts/tst-elf-edit.py --maximize-tls-size $@
> +
> +$(objpfx)tst-tls-allocation-failure-static-patched.out: \
> +  $(objpfx)tst-tls-allocation-failure-static-patched
> +	$< > $@ 2>&1; echo "status: $$?" >> $@
> +	grep -q '^Fatal glibc error: Cannot allocate TLS block$$' $@ \
> +	  && grep -q '^status: 127$$' $@; \
> +	  $(evaluate-test)

Ok.

> diff --git a/elf/dl-early_allocate.c b/elf/dl-early_allocate.c
> new file mode 100644
> index 0000000000..61677aaa03
> --- /dev/null
> +++ b/elf/dl-early_allocate.c
> @@ -0,0 +1,30 @@
> +/* Early memory allocation for the dynamic loader.  Generic version.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <ldsodefs.h>
> +#include <stddef.h>
> +#include <unistd.h>
> +
> +void *
> +_dl_early_allocate (size_t size)
> +{
> +  void *result = __sbrk (size);
> +  if (result == (void *) -1)
> +    result = NULL;
> +  return result;
> +}

I still think it would be way simpler to just use mmap here.  It would allow
us to use a generic version without the need to fiddle with a Linux-specific
one (it would require just a wrapper around MMAP_CALL_INTERNAL, instead of an
implementation that needs to know both sbrk and mmap internals).

I think that, with an allocator that only uses mmap, you could even clean up
all the dl-brk.c and similar files required to make brk/sbrk work in the loader.

> diff --git a/elf/tst-tls-allocation-failure-static.c b/elf/tst-tls-allocation-failure-static.c
> new file mode 100644
> index 0000000000..8de831b246
> --- /dev/null
> +++ b/elf/tst-tls-allocation-failure-static.c
> @@ -0,0 +1,31 @@
> +/* Base for test program with impossibly large PT_TLS segment.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* The actual test binary is patched using scripts/tst-elf-edit.py
> +   --maximize-tls-size, and this introduces the expected test
> +   allocation failure due to an excessive PT_TLS p_memsz value.
> +
> +   Patching the binary is required because on some 64-bit targets, TLS
> +   relocations can only cover a 32-bit range, and glibc-internal TLS
> +   variables such as errno end up outside that range.  */
> +
> +int
> +main (void)
> +{
> +  return 0;
> +}
> diff --git a/scripts/tst-elf-edit.py b/scripts/tst-elf-edit.py
> index a514179bbf..0e19ce1e73 100644
> --- a/scripts/tst-elf-edit.py
> +++ b/scripts/tst-elf-edit.py
> @@ -43,9 +43,11 @@ EI_DATA=5
>  ELFDATA2LSB=b'\x01'
>  ELFDATA2MSB=b'\x02'
>  
> +ET_EXEC=2
>  ET_DYN=3
>  
>  PT_LOAD=1
> +PT_TLS=7
>  
>  def elf_types_fmts(e_ident):
>      endian = '<' if e_ident[EI_DATA] == ELFDATA2LSB else '>'
> @@ -146,8 +148,15 @@ def elf_edit_align(phdr, align):
>      else:
>          phdr.p_align = int(align)
>  
> +def elf_edit_maximize_tls_size(phdr, elfclass):
> +    if elfclass == ELFCLASS32:
> +        # It is possible that the kernel can allocate half of the
> +        # address space, so use something larger.
> +        phdr.p_memsz = 0xfff00000
> +    else:
> +        phdr.p_memsz = 1 << 63
>  
> -def elf_edit(f, align):
> +def elf_edit(f, opts):
>      ei_nident_fmt = 'c' * EI_NIDENT
>      ei_nident_len = struct.calcsize(ei_nident_fmt)
>  
> @@ -172,24 +181,35 @@ def elf_edit(f, align):
>  
>      ehdr = Elf_Ehdr(e_ident)
>      ehdr.read(f)
> -    if ehdr.e_type != ET_DYN:
> -       error('{}: not a shared library'.format(f.name))
> +    if ehdr.e_type not in (ET_EXEC, ET_DYN):
> +       error('{}: not an executable or shared library'.format(f.name))
>  
>      phdr = Elf_Phdr(e_ident)
> +    maximize_tls_size_done = False
>      for i in range(0, ehdr.e_phnum):
>          f.seek(ehdr.e_phoff + i * phdr.len)
>          phdr.read(f)
> -        if phdr.p_type == PT_LOAD:
> -            elf_edit_align(phdr, align)
> +        if phdr.p_type == PT_LOAD and opts.align is not None:
> +            elf_edit_align(phdr, opts.align)
> +            f.seek(ehdr.e_phoff + i * phdr.len)
> +            phdr.write(f)
> +            break
> +        if phdr.p_type == PT_TLS and opts.maximize_tls_size:
> +            elf_edit_maximize_tls_size(phdr, e_ident[EI_CLASS])
>              f.seek(ehdr.e_phoff + i * phdr.len)
>              phdr.write(f)
> +            maximize_tls_size_done = True
>              break
>  
> +    if opts.maximize_tls_size and not maximize_tls_size_done:
> +        error('{}: TLS maximum size was not updated'.format(f.name))
>  
>  def get_parser():
>      parser = argparse.ArgumentParser(description=__doc__)
> -    parser.add_argument('-a', dest='align', required=True,
> +    parser.add_argument('-a', dest='align',
>                          help='How to set the LOAD alignment')
> +    parser.add_argument('--maximize-tls-size', action='store_true',
> +                        help='Set maximum PT_TLS size')
>      parser.add_argument('output',
>                          help='ELF file to edit')
>      return parser
> @@ -199,7 +219,7 @@ def main(argv):
>      parser = get_parser()
>      opts = parser.parse_args(argv)
>      with open(opts.output, 'r+b') as fout:
> -       elf_edit(fout, opts.align)
> +       elf_edit(fout, opts)
>  
>  
>  if __name__ == '__main__':
> diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
> index 4a5e698db2..5d0369358d 100644
> --- a/sysdeps/generic/ldsodefs.h
> +++ b/sysdeps/generic/ldsodefs.h
> @@ -1211,6 +1211,11 @@ extern struct link_map * _dl_get_dl_main_map (void)
>  # endif
>  #endif
>  
> +/* Perform early memory allocation, avoiding a TCB dependency.
> +   Returns NULL if allocation fails.  May attempt to use
> +   brk.  */
> +void *_dl_early_allocate (size_t size) attribute_hidden;
> +
>  /* Initialize the DSO sort algorithm to use.  */
>  #if !HAVE_TUNABLES
>  static inline void
> diff --git a/sysdeps/unix/sysv/linux/dl-early_allocate.c b/sysdeps/unix/sysv/linux/dl-early_allocate.c
> new file mode 100644
> index 0000000000..2c77b6e3ff
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/dl-early_allocate.c
> @@ -0,0 +1,81 @@
> +/* Early memory allocation for the dynamic loader.  Linux version.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Mark symbols hidden in static PIE for early self relocation to work.  */
> +#if BUILD_PIE_DEFAULT
> +# pragma GCC visibility push(hidden)
> +#endif
> +#include <startup.h>
> +
> +#include <ldsodefs.h>
> +#include <stddef.h>
> +#include <string.h>
> +#include <sysdep.h>
> +#include <unistd.h>
> +
> +#include <mmap_call.h>
> +
> +/* Defined in brk.c.  */
> +extern void *__curbrk;
> +
> +void *
> +_dl_early_allocate (size_t size)
> +{
> +  void *result;
> +
> +  if (__curbrk != NULL)
> +    /* If the break has been initialized, brk must have run before,
> +       so just call it once more.  */
> +    {
> +      result = __sbrk (size);
> +      if (result == (void *) -1)
> +        result = NULL;
> +    }
> +  else
> +    {
> +      /* If brk has not been invoked, there is no need to update
> +         __curbrk.  The first call to brk will take care of that.  */
> +      void *previous = (void *) INTERNAL_SYSCALL_CALL (brk, 0);
> +      result = (void *) INTERNAL_SYSCALL_CALL (brk, previous + size);
> +      if (result == previous)
> +        result = NULL;
> +      else
> +        result = previous;
> +    }
> +
> +  /* If brk fails, fall back to mmap.  This can happen due to
> +     unfortunate ASLR layout decisions and kernel bugs, particularly
> +     for static PIE.  */
> +  if (result == NULL)
> +    {
> +      long int ret;
> +      int prot = PROT_READ | PROT_WRITE;
> +      int flags = MAP_PRIVATE | MAP_ANONYMOUS;
> +#ifdef __NR_mmap2
> +      ret = MMAP_CALL_INTERNAL (mmap2, 0, size, prot, flags, -1, 0);
> +#else
> +      ret = MMAP_CALL_INTERNAL (mmap, 0, size, prot, flags, -1, 0);
> +#endif
> +      if (INTERNAL_SYSCALL_ERROR_P (ret))
> +        result = NULL;
> +      else
> +        result = (void *) ret;
> +    }
> +
> +  return result;
> +}
Florian Weimer May 5, 2022, 6:03 p.m. UTC | #2
* Adhemerval Zanella:

> I still think there will be way simpler to just use mmap here, it will allow
> to use a generic version without the need to fiddle with a Linux specific 
> (it will be required just a wrapper to MMAP_CALL_INTERNAL, instead of an
> implementation that need to know both sbrk and mmap internals).

We need to wrap the difference in error checking as well.

> I think you even clean up all the dl-brk.c and similar files required to
> make brk/sbrk work on the loader with an allocator that only uses mmap.

At the cost of additional VMAs, sure.  We won't be able to reuse the
unused tail of the allocation easily because we do not know the page
size yet.  __minimal_malloc does not yet work at this point.  We also
lose some ASLR.  I'm not convinced it's worth it.

Thanks,
Florian
Adhemerval Zanella Netto May 5, 2022, 6:25 p.m. UTC | #3
On 05/05/2022 15:03, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> I still think there will be way simpler to just use mmap here, it will allow
>> to use a generic version without the need to fiddle with a Linux specific 
>> (it will be required just a wrapper to MMAP_CALL_INTERNAL, instead of an
>> implementation that need to know both sbrk and mmap internals).
> 
> We need to wrap the difference in error checking as well.
> 
>> I think you even clean up all the dl-brk.c and similar files required to
>> make brk/sbrk work on the loader with an allocator that only uses mmap.
> 
> At the cost of additional VMAs, sure.  We won't be able to reuse the
> unused tail of the allocation easily because we do not know the page
> size yet.  __minimal_malloc does not yet work at this point.  We also
> lose some ASLR.  I'm not convinced it's worth it.

It is another piece of code that does something similar to __minimal_malloc
for a specific usage, so now we have another allocation scheme that is used
solely for static initialization.  And it does something different from
__minimal_malloc, so it has different semantics.

I am not really against it, but the initialization code is already
quite complex, with a lot of different paths for the dynamic/static cases,
so I think it would be good to have a single loader/static allocation
code path.
Florian Weimer May 6, 2022, 10 a.m. UTC | #4
* Adhemerval Zanella:

> On 05/05/2022 15:03, Florian Weimer wrote:
>> * Adhemerval Zanella:
>> 
>>> I still think there will be way simpler to just use mmap here, it will allow
>>> to use a generic version without the need to fiddle with a Linux specific 
>>> (it will be required just a wrapper to MMAP_CALL_INTERNAL, instead of an
>>> implementation that need to know both sbrk and mmap internals).
>> 
>> We need to wrap the difference in error checking as well.
>> 
>>> I think you even clean up all the dl-brk.c and similar files required to
>>> make brk/sbrk work on the loader with an allocator that only uses mmap.
>> 
>> At the cost of additional VMAs, sure.  We won't be able to reuse the
>> unused tail of the allocation easily because we do not know the page
>> size yet.  __minimal_malloc does not yet work at this point.  We also
>> lose some ASLR.  I'm not convinced it's worth it.
>
> It is another code that does something similar to __minimal_malloc to
> a specific usage, so now we have another allocation scheme that is used
> solely for static initialization.  And it does something different than
> __minimal_malloc, so has different semantic.
>
> I am not really against it, but initialization code is already 
> quite complex with a lot of different paths for dynamic/static case,
> so I think it would be good to have single loader/static allocation 
> code.  

As I said, we don't have the page size yet at this point, and
__minimal_malloc needs to know that.  And there are likely other
obstacles to consolidation.

How can we move this forward?

Thanks,
Florian
Adhemerval Zanella Netto May 6, 2022, 1:04 p.m. UTC | #5
On 06/05/2022 07:00, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> On 05/05/2022 15:03, Florian Weimer wrote:
>>> * Adhemerval Zanella:
>>>
>>>> I still think there will be way simpler to just use mmap here, it will allow
>>>> to use a generic version without the need to fiddle with a Linux specific 
>>>> (it will be required just a wrapper to MMAP_CALL_INTERNAL, instead of an
>>>> implementation that need to know both sbrk and mmap internals).
>>>
>>> We need to wrap the difference in error checking as well.
>>>
>>>> I think you even clean up all the dl-brk.c and similar files required to
>>>> make brk/sbrk work on the loader with an allocator that only uses mmap.
>>>
>>> At the cost of additional VMAs, sure.  We won't be able to reuse the
>>> unused tail of the allocation easily because we do not know the page
>>> size yet.  __minimal_malloc does not yet work at this point.  We also
>>> lose some ASLR.  I'm not convinced it's worth it.
>>
>> It is another code that does something similar to __minimal_malloc to
>> a specific usage, so now we have another allocation scheme that is used
>> solely for static initialization.  And it does something different than
>> __minimal_malloc, so has different semantic.
>>
>> I am not really against it, but initialization code is already 
>> quite complex with a lot of different paths for dynamic/static case,
>> so I think it would be good to have single loader/static allocation 
>> code.  
> 
> As I said, we don't have the page size yet at this point, and
> __minimal_malloc needs to know that.  And there are likely other
> obstacles to consolidation.

But for !SHARED we do call _dl_aux_init before ARCH_SETUP_TLS, so
we can access GLRO(dl_pagesize).

> How can we move this forward?

I am still not very fond of bleeding misc/sbrk.c details into the Linux
dl-early_allocate.c, but this is not a blocker for the patch.
Adhemerval Zanella Netto May 6, 2022, 1:19 p.m. UTC | #6
On 05/05/2022 12:08, Florian Weimer via Libc-alpha wrote:
> This implements mmap fallback for a brk failure during TLS
> allocation.
> 
> scripts/tls-elf-edit.py is updated to support the new patching method.
> The script no longer requires that in the input object is of ET_DYN
> type.
> ---
> v4: Do not call __libc_fatal from generic _dl_early_allocate.
> v3: Remove spurious empty line change.
>  csu/libc-tls.c                              | 11 ++-
>  elf/Makefile                                | 19 +++++
>  elf/dl-early_allocate.c                     | 30 ++++++++
>  elf/tst-tls-allocation-failure-static.c     | 31 ++++++++
>  scripts/tst-elf-edit.py                     | 34 +++++++--
>  sysdeps/generic/ldsodefs.h                  |  5 ++
>  sysdeps/unix/sysv/linux/dl-early_allocate.c | 81 +++++++++++++++++++++
>  7 files changed, 201 insertions(+), 10 deletions(-)
>  create mode 100644 elf/dl-early_allocate.c
>  create mode 100644 elf/tst-tls-allocation-failure-static.c
>  create mode 100644 sysdeps/unix/sysv/linux/dl-early_allocate.c
> 
> diff --git a/csu/libc-tls.c b/csu/libc-tls.c
> index bef92a7568..0a216c5502 100644
> --- a/csu/libc-tls.c
> +++ b/csu/libc-tls.c
> @@ -145,11 +145,16 @@ __libc_setup_tls (void)
>       _dl_allocate_tls_storage (in elf/dl-tls.c) does using __libc_memalign
>       and dl_tls_static_align.  */
>    tcb_offset = roundup (memsz + GLRO(dl_tls_static_surplus), max_align);
> -  tlsblock = __sbrk (tcb_offset + TLS_INIT_TCB_SIZE + max_align);
> +  tlsblock = _dl_early_allocate (tcb_offset + TLS_INIT_TCB_SIZE + max_align);
> +  if (tlsblock == NULL)
> +    _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n");
>  #elif TLS_DTV_AT_TP
>    tcb_offset = roundup (TLS_INIT_TCB_SIZE, align ?: 1);
> -  tlsblock = __sbrk (tcb_offset + memsz + max_align
> -		     + TLS_PRE_TCB_SIZE + GLRO(dl_tls_static_surplus));
> +  tlsblock = _dl_early_allocate (tcb_offset + memsz + max_align
> +				 + TLS_PRE_TCB_SIZE
> +				 + GLRO(dl_tls_static_surplus));
> +  if (tlsblock == NULL)
> +    _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n");
>    tlsblock += TLS_PRE_TCB_SIZE;
>  #else
>    /* In case a model with a different layout for the TCB and DTV
> diff --git a/elf/Makefile b/elf/Makefile
> index fc9860edee..ce3345ed92 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -33,6 +33,7 @@ routines = \
>    $(all-dl-routines) \
>    dl-addr \
>    dl-addr-obj \
> +  dl-early_allocate \
>    dl-error \
>    dl-iteratephdr \
>    dl-libc \
> @@ -108,6 +109,7 @@ all-dl-routines = $(dl-routines) $(sysdep-dl-routines)
>  # But they are absent from the shared libc, because that code is in ld.so.
>  elide-routines.os = \
>    $(all-dl-routines) \
> +  dl-early_allocate \
>    dl-exception \
>    dl-origin \
>    dl-reloc-static-pie \
> @@ -276,6 +278,7 @@ tests-static-normal := \
>    tst-linkall-static \
>    tst-single_threaded-pthread-static \
>    tst-single_threaded-static \
> +  tst-tls-allocation-failure-static \
>    tst-tlsalign-extern-static \
>    tst-tlsalign-static \
>    # tests-static-normal
> @@ -1213,6 +1216,10 @@ $(objpfx)tst-glibcelf.out: tst-glibcelf.py elf.h $(..)/scripts/glibcelf.py \
>            --cc="$(CC) $(patsubst -DMODULE_NAME=%,-DMODULE_NAME=testsuite,$(CPPFLAGS))" \
>  	  < /dev/null > $@ 2>&1; $(evaluate-test)
>  
> +ifeq ($(run-built-tests),yes)
> +tests-special += $(objpfx)tst-tls-allocation-failure-static-patched.out
> +endif
> +
>  # The test requires shared _and_ PIE because the executable
>  # unit test driver must be able to link with the shared object
>  # that is going to eventually go into an installed DSO.
> @@ -2937,3 +2944,15 @@ $(eval $(call tst-trace-skeleton,4,\
>  	$(objpfx)libtracemod2:$(objpfx)libtracemod3:$(objpfx)libtracemod4))
>  $(eval $(call tst-trace-skeleton,5,\
>  	$(objpfx)libtracemod2:$(objpfx)libtracemod3:$(objpfx)libtracemod4:$(objpfx)libtracemod5))
> +
> +$(objpfx)tst-tls-allocation-failure-static-patched: \
> +  $(objpfx)tst-tls-allocation-failure-static $(..)scripts/tst-elf-edit.py
> +	cp $< $@
> +	$(PYTHON) $(..)scripts/tst-elf-edit.py --maximize-tls-size $@
> +
> +$(objpfx)tst-tls-allocation-failure-static-patched.out: \
> +  $(objpfx)tst-tls-allocation-failure-static-patched
> +	$< > $@ 2>&1; echo "status: $$?" >> $@
> +	grep -q '^Fatal glibc error: Cannot allocate TLS block$$' $@ \
> +	  && grep -q '^status: 127$$' $@; \
> +	  $(evaluate-test)
> diff --git a/elf/dl-early_allocate.c b/elf/dl-early_allocate.c
> new file mode 100644
> index 0000000000..61677aaa03
> --- /dev/null
> +++ b/elf/dl-early_allocate.c
> @@ -0,0 +1,30 @@
> +/* Early memory allocation for the dynamic loader.  Generic version.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <ldsodefs.h>
> +#include <stddef.h>
> +#include <unistd.h>
> +
> +void *
> +_dl_early_allocate (size_t size)
> +{
> +  void *result = __sbrk (size);
> +  if (result == (void *) -1)
> +    result = NULL;
> +  return result;
> +}

Ok.

> diff --git a/elf/tst-tls-allocation-failure-static.c b/elf/tst-tls-allocation-failure-static.c
> new file mode 100644
> index 0000000000..8de831b246
> --- /dev/null
> +++ b/elf/tst-tls-allocation-failure-static.c
> @@ -0,0 +1,31 @@
> +/* Base for test program with impossibly large PT_TLS segment.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* The actual test binary is patched using scripts/tst-elf-edit.py
> +   --maximize-tls-size, and this introduces the expected test
> +   allocation failure due to an excessive PT_TLS p_memsz value.
> +
> +   Patching the binary is required because on some 64-bit targets, TLS
> +   relocations can only cover a 32-bit range, and glibc-internal TLS
> +   variables such as errno end up outside that range.  */
> +
> +int
> +main (void)
> +{
> +  return 0;
> +}
> diff --git a/scripts/tst-elf-edit.py b/scripts/tst-elf-edit.py
> index a514179bbf..0e19ce1e73 100644
> --- a/scripts/tst-elf-edit.py
> +++ b/scripts/tst-elf-edit.py
> @@ -43,9 +43,11 @@ EI_DATA=5
>  ELFDATA2LSB=b'\x01'
>  ELFDATA2MSB=b'\x02'
>  
> +ET_EXEC=2
>  ET_DYN=3
>  
>  PT_LOAD=1
> +PT_TLS=7
>  
>  def elf_types_fmts(e_ident):
>      endian = '<' if e_ident[EI_DATA] == ELFDATA2LSB else '>'
> @@ -146,8 +148,15 @@ def elf_edit_align(phdr, align):
>      else:
>          phdr.p_align = int(align)
>  
> +def elf_edit_maximize_tls_size(phdr, elfclass):
> +    if elfclass == ELFCLASS32:
> +        # It is possible that the kernel can allocate half of the
> +        # address space, so use something larger.
> +        phdr.p_memsz = 0xfff00000
> +    else:
> +        phdr.p_memsz = 1 << 63
>  
> -def elf_edit(f, align):
> +def elf_edit(f, opts):
>      ei_nident_fmt = 'c' * EI_NIDENT
>      ei_nident_len = struct.calcsize(ei_nident_fmt)
>  
> @@ -172,24 +181,35 @@ def elf_edit(f, align):
>  
>      ehdr = Elf_Ehdr(e_ident)
>      ehdr.read(f)
> -    if ehdr.e_type != ET_DYN:
> -       error('{}: not a shared library'.format(f.name))
> +    if ehdr.e_type not in (ET_EXEC, ET_DYN):
> +       error('{}: not an executable or shared library'.format(f.name))
>  
>      phdr = Elf_Phdr(e_ident)
> +    maximize_tls_size_done = False
>      for i in range(0, ehdr.e_phnum):
>          f.seek(ehdr.e_phoff + i * phdr.len)
>          phdr.read(f)
> -        if phdr.p_type == PT_LOAD:
> -            elf_edit_align(phdr, align)
> +        if phdr.p_type == PT_LOAD and opts.align is not None:

I think you can omit the None check.

> +            elf_edit_align(phdr, opts.align)
> +            f.seek(ehdr.e_phoff + i * phdr.len)
> +            phdr.write(f)
> +            break
> +        if phdr.p_type == PT_TLS and opts.maximize_tls_size:
> +            elf_edit_maximize_tls_size(phdr, e_ident[EI_CLASS])
>              f.seek(ehdr.e_phoff + i * phdr.len)
>              phdr.write(f)
> +            maximize_tls_size_done = True
>              break
>  
> +    if opts.maximize_tls_size and not maximize_tls_size_done:
> +        error('{}: TLS maximum size was not updated'.format(f.name))
>  
>  def get_parser():
>      parser = argparse.ArgumentParser(description=__doc__)
> -    parser.add_argument('-a', dest='align', required=True,
> +    parser.add_argument('-a', dest='align',
>                          help='How to set the LOAD alignment')
> +    parser.add_argument('--maximize-tls-size', action='store_true',
> +                        help='Set maximum PT_TLS size')
>      parser.add_argument('output',
>                          help='ELF file to edit')
>      return parser
> @@ -199,7 +219,7 @@ def main(argv):
>      parser = get_parser()
>      opts = parser.parse_args(argv)
>      with open(opts.output, 'r+b') as fout:
> -       elf_edit(fout, opts.align)
> +       elf_edit(fout, opts)
>  
>  
>  if __name__ == '__main__':

Ok.

> diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
> index 4a5e698db2..5d0369358d 100644
> --- a/sysdeps/generic/ldsodefs.h
> +++ b/sysdeps/generic/ldsodefs.h
> @@ -1211,6 +1211,11 @@ extern struct link_map * _dl_get_dl_main_map (void)
>  # endif
>  #endif
>  
> +/* Perform early memory allocation, avoiding a TCB dependency.
> +   Returns NULL if allocation fails.  May attempt to use
> +   brk.  */
> +void *_dl_early_allocate (size_t size) attribute_hidden;
> +
>  /* Initialize the DSO sort algorithm to use.  */
>  #if !HAVE_TUNABLES
>  static inline void
> diff --git a/sysdeps/unix/sysv/linux/dl-early_allocate.c b/sysdeps/unix/sysv/linux/dl-early_allocate.c
> new file mode 100644
> index 0000000000..2c77b6e3ff
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/dl-early_allocate.c
> @@ -0,0 +1,81 @@
> +/* Early memory allocation for the dynamic loader.  Linux version.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Mark symbols hidden in static PIE for early self relocation to work.  */
> +#if BUILD_PIE_DEFAULT
> +# pragma GCC visibility push(hidden)
> +#endif
> +#include <startup.h>
> +
> +#include <ldsodefs.h>
> +#include <stddef.h>
> +#include <string.h>
> +#include <sysdep.h>
> +#include <unistd.h>
> +
> +#include <mmap_call.h>
> +
> +/* Defined in brk.c.  */
> +extern void *__curbrk;
> +
> +void *
> +_dl_early_allocate (size_t size)
> +{
> +  void *result;
> +
> +  if (__curbrk != NULL)
> +    /* If the break has been initialized, brk must have run before,
> +       so just call it once more.  */
> +    {
> +      result = __sbrk (size);
> +      if (result == (void *) -1)
> +        result = NULL;
> +    }
> +  else
> +    {
> +      /* If brk has not been invoked, there is no need to update
> +         __curbrk.  The first call to brk will take care of that.  */
> +      void *previous = (void *) INTERNAL_SYSCALL_CALL (brk, 0);
> +      result = (void *) INTERNAL_SYSCALL_CALL (brk, previous + size);
> +      if (result == previous)
> +        result = NULL;
> +      else
> +        result = previous;

You will need to factor this out to be arch-specific, since alpha returns
-ENOMEM in case of failure and sparc has a different calling convention
(similar to what it does for clone).  Maybe add a

static inline void *
brk_call (void *addr)
{
  void *r = syscall;
  r = check_error (r) ? -1 : 0;
}

And then refactor Linux brk.c versions to use brk_call as well.

> +    }
> +
> +  /* If brk fails, fall back to mmap.  This can happen due to
> +     unfortunate ASLR layout decisions and kernel bugs, particularly
> +     for static PIE.  */
> +  if (result == NULL)
> +    {
> +      long int ret;
> +      int prot = PROT_READ | PROT_WRITE;
> +      int flags = MAP_PRIVATE | MAP_ANONYMOUS;
> +#ifdef __NR_mmap2
> +      ret = MMAP_CALL_INTERNAL (mmap2, 0, size, prot, flags, -1, 0);
> +#else
> +      ret = MMAP_CALL_INTERNAL (mmap, 0, size, prot, flags, -1, 0);
> +#endif
> +      if (INTERNAL_SYSCALL_ERROR_P (ret))

Maybe move it to mmap_call.h and make it a static inline:

static inline void *
mmap64_call (void *addr, size_t len, int prot, int flags, int fd, 
	   off64_t offset)
{
  long int ret;
#ifdef __NR_mmap2
  ret = MMAP_CALL_INTERNAL (mmap2, addr, len, prot, flags, fd,
			    (off_t) (offset / MMAP2_PAGE_UNIT));
#else
  ret = MMAP_CALL_INTERNAL (mmap, addr, len, prot, flags, fd,
			    offset);
#endif
  return INTERNAL_SYSCALL_ERROR_P (ret) ? NULL : (void *) ret;
}

static inline void *
mmap_call_internal (size_t len)
{
  int prot = PROT_READ | PROT_WRITE;
  int flags = MAP_PRIVATE | MAP_ANONYMOUS;
  return mmap64_call (0, len, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}

> +        result = NULL;
> +      else
> +        result = (void *) ret;
> +    }
> +
> +  return result;
> +}
Florian Weimer May 9, 2022, 4:10 p.m. UTC | #7
* Adhemerval Zanella:

>> diff --git a/scripts/tst-elf-edit.py b/scripts/tst-elf-edit.py
>> index a514179bbf..0e19ce1e73 100644
>> --- a/scripts/tst-elf-edit.py
>> +++ b/scripts/tst-elf-edit.py

>> @@ -172,24 +181,35 @@ def elf_edit(f, align):
>>  
>>      ehdr = Elf_Ehdr(e_ident)
>>      ehdr.read(f)
>> -    if ehdr.e_type != ET_DYN:
>> -       error('{}: not a shared library'.format(f.name))
>> +    if ehdr.e_type not in (ET_EXEC, ET_DYN):
>> +       error('{}: not an executable or shared library'.format(f.name))
>>  
>>      phdr = Elf_Phdr(e_ident)
>> +    maximize_tls_size_done = False
>>      for i in range(0, ehdr.e_phnum):
>>          f.seek(ehdr.e_phoff + i * phdr.len)
>>          phdr.read(f)
>> -        if phdr.p_type == PT_LOAD:
>> -            elf_edit_align(phdr, align)
>> +        if phdr.p_type == PT_LOAD and opts.align is not None:
>
> I think you can omit the None check.

I think this will raise an exception in elf_edit_align because the -a
argument is now optional.

>> +void *
>> +_dl_early_allocate (size_t size)
>> +{
>> +  void *result;
>> +
>> +  if (__curbrk != NULL)
>> +    /* If the break has been initialized, brk must have run before,
>> +       so just call it once more.  */
>> +    {
>> +      result = __sbrk (size);
>> +      if (result == (void *) -1)
>> +        result = NULL;
>> +    }
>> +  else
>> +    {
>> +      /* If brk has not been invoked, there is no need to update
>> +         __curbrk.  The first call to brk will take care of that.  */
>> +      void *previous = (void *) INTERNAL_SYSCALL_CALL (brk, 0);
>> +      result = (void *) INTERNAL_SYSCALL_CALL (brk, previous + size);
>> +      if (result == previous)
>> +        result = NULL;
>> +      else
>> +        result = previous;
>
> You will need to factor it be arch-specific since alpha return -ENOMEM
> in case of failure and sparc has different calling convention (similar 
> to what it does for clone).  Maybe add a
>
> static inline void *
> brk_call (void *addr)
> {
>   void *r = syscall;
>   r = check_error (r) ? -1 : 0;
> }
>
> And then refactor Linux brk.c versions to use brk_call as well.

Thanks.  I'm testing a new version with this change.

>> +  /* If brk fails, fall back to mmap.  This can happen due to
>> +     unfortunate ASLR layout decisions and kernel bugs, particularly
>> +     for static PIE.  */
>> +  if (result == NULL)
>> +    {
>> +      long int ret;
>> +      int prot = PROT_READ | PROT_WRITE;
>> +      int flags = MAP_PRIVATE | MAP_ANONYMOUS;
>> +#ifdef __NR_mmap2
>> +      ret = MMAP_CALL_INTERNAL (mmap2, 0, size, prot, flags, -1, 0);
>> +#else
>> +      ret = MMAP_CALL_INTERNAL (mmap, 0, size, prot, flags, -1, 0);
>> +#endif
>> +      if (INTERNAL_SYSCALL_ERROR_P (ret))
>
> Maybe move it to mmap_call.h and make it a static inline:
>
> static inline void *
> mmap64_call (void *addr, size_t len, int prot, int flags, int fd, 
> 	   off64_t offset)
> {
>   long int ret;
> #ifdef __NR_mmap2
>   ret = MMAP_CALL_INTERNAL (mmap2, addr, len, prot, flags, fd,
> 			    (off_t) (offset / MMAP2_PAGE_UNIT));
> #else
>   ret = MMAP_CALL_INTERNAL (mmap, addr, len, prot, flags, fd,
> 			    offset);
> #endif
>   return INTERNAL_SYSCALL_ERROR_P (ret) ? NULL : (void *) ret;
> }
>
> static inline void *
> mmap_call_internal (size_t len)
> {
>   int prot = PROT_READ | PROT_WRITE;
>   int flags = MAP_PRIVATE | MAP_ANONYMOUS;
>   return mmap64_call (0, len, PROT_READ | PROT_WRITE,
> 		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> }

We have several customization points for mmap and mmap2.
MMAP2_PAGE_UNIT can expand to a static global variable page_unit.  This
is difficult to encapsulate properly in an inline function.  Adding
__brk_call was simple enough, but this looks way more complex.

Thanks,
Florian
Adhemerval Zanella Netto May 9, 2022, 4:52 p.m. UTC | #8
On 09/05/2022 13:10, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>>> diff --git a/scripts/tst-elf-edit.py b/scripts/tst-elf-edit.py
>>> index a514179bbf..0e19ce1e73 100644
>>> --- a/scripts/tst-elf-edit.py
>>> +++ b/scripts/tst-elf-edit.py
> 
>>> @@ -172,24 +181,35 @@ def elf_edit(f, align):
>>>  
>>>      ehdr = Elf_Ehdr(e_ident)
>>>      ehdr.read(f)
>>> -    if ehdr.e_type != ET_DYN:
>>> -       error('{}: not a shared library'.format(f.name))
>>> +    if ehdr.e_type not in (ET_EXEC, ET_DYN):
>>> +       error('{}: not an executable or shared library'.format(f.name))
>>>  
>>>      phdr = Elf_Phdr(e_ident)
>>> +    maximize_tls_size_done = False
>>>      for i in range(0, ehdr.e_phnum):
>>>          f.seek(ehdr.e_phoff + i * phdr.len)
>>>          phdr.read(f)
>>> -        if phdr.p_type == PT_LOAD:
>>> -            elf_edit_align(phdr, align)
>>> +        if phdr.p_type == PT_LOAD and opts.align is not None:
>>
>> I think you can omit the None check.
> 
> I think this will raise an exception in elf_edit_align because the -a
> argument is now optional.
> 
>>> +void *
>>> +_dl_early_allocate (size_t size)
>>> +{
>>> +  void *result;
>>> +
>>> +  if (__curbrk != NULL)
>>> +    /* If the break has been initialized, brk must have run before,
>>> +       so just call it once more.  */
>>> +    {
>>> +      result = __sbrk (size);
>>> +      if (result == (void *) -1)
>>> +        result = NULL;
>>> +    }
>>> +  else
>>> +    {
>>> +      /* If brk has not been invoked, there is no need to update
>>> +         __curbrk.  The first call to brk will take care of that.  */
>>> +      void *previous = (void *) INTERNAL_SYSCALL_CALL (brk, 0);
>>> +      result = (void *) INTERNAL_SYSCALL_CALL (brk, previous + size);
>>> +      if (result == previous)
>>> +        result = NULL;
>>> +      else
>>> +        result = previous;
>>
>> You will need to factor it be arch-specific since alpha return -ENOMEM
>> in case of failure and sparc has different calling convention (similar 
>> to what it does for clone).  Maybe add a
>>
>> static inline void *
>> brk_call (void *addr)
>> {
>>   void *r = syscall;
>>   r = check_error (r) ? -1 : 0;
>> }
>>
>> And then refactor Linux brk.c versions to use brk_call as well.
> 
> Thanks.  I'm testing a new version with this change.
> 
>>> +  /* If brk fails, fall back to mmap.  This can happen due to
>>> +     unfortunate ASLR layout decisions and kernel bugs, particularly
>>> +     for static PIE.  */
>>> +  if (result == NULL)
>>> +    {
>>> +      long int ret;
>>> +      int prot = PROT_READ | PROT_WRITE;
>>> +      int flags = MAP_PRIVATE | MAP_ANONYMOUS;
>>> +#ifdef __NR_mmap2
>>> +      ret = MMAP_CALL_INTERNAL (mmap2, 0, size, prot, flags, -1, 0);
>>> +#else
>>> +      ret = MMAP_CALL_INTERNAL (mmap, 0, size, prot, flags, -1, 0);
>>> +#endif
>>> +      if (INTERNAL_SYSCALL_ERROR_P (ret))
>>
>> Maybe move it to mmap_call.h and make it a static inline:
>>
>> static inline void *
>> mmap64_call (void *addr, size_t len, int prot, int flags, int fd, 
>> 	   off64_t offset)
>> {
>>   long int ret;
>> #ifdef __NR_mmap2
>>   ret = MMAP_CALL_INTERNAL (mmap2, addr, len, prot, flags, fd,
>> 			    (off_t) (offset / MMAP2_PAGE_UNIT));
>> #else
>>   ret = MMAP_CALL_INTERNAL (mmap, addr, len, prot, flags, fd,
>> 			    offset);
>> #endif
>>   return INTERNAL_SYSCALL_ERROR_P (ret) ? NULL : (void *) ret;
>> }
>>
>> static inline void *
>> mmap_call_internal (size_t len)
>> {
>>   int prot = PROT_READ | PROT_WRITE;
>>   int flags = MAP_PRIVATE | MAP_ANONYMOUS;
>>   return mmap64_call (0, len, PROT_READ | PROT_WRITE,
>> 		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>> }
> 
> We have several customization points for mmap and mmap2.
> MMAP2_PAGE_UNIT can expand to a static global variable page_unit.  This
> is difficult to encapsulate properly in an inline function.  Adding
> __brk_call was simple enough, but this looks way more complex.

The idea is to allow __mmap64 use the mmap64_call and have the syscall
logic in one place.  Something like:

void *
__mmap64 (void *addr, size_t len, int prot, int flags, int fd, off64_t offset)
{
  MMAP_CHECK_PAGE_UNIT ();

  if (offset & MMAP_OFF_MASK)
    return (void *) INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);

  MMAP_PREPARE (addr, len, prot, flags, fd, offset);
  return mmap64_call (addr, len, prot, flags, fd, offset);
}
Florian Weimer May 9, 2022, 5:08 p.m. UTC | #9
* Adhemerval Zanella:

> The idea is to allow __mmap64 use the mmap64_call and have the syscall
> logic in one place.  Something like:
>
> void *
> __mmap64 (void *addr, size_t len, int prot, int flags, int fd, off64_t offset)
> {
>   MMAP_CHECK_PAGE_UNIT ();
>
>   if (offset & MMAP_OFF_MASK)
>     return (void *) INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
>
>   MMAP_PREPARE (addr, len, prot, flags, fd, offset);
>   return mmap64_call (addr, len, prot, flags, fd, offset);
> }

With this, a definition of mmap64_call still needs a working definition
of MMAP2_PAGE_UNIT for mmap2 targets, I think.

Thanks,
Florian
Adhemerval Zanella Netto May 9, 2022, 5:37 p.m. UTC | #10
On 09/05/2022 14:08, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> The idea is to allow __mmap64 use the mmap64_call and have the syscall
>> logic in one place.  Something like:
>>
>> void *
>> __mmap64 (void *addr, size_t len, int prot, int flags, int fd, off64_t offset)
>> {
>>   MMAP_CHECK_PAGE_UNIT ();
>>
>>   if (offset & MMAP_OFF_MASK)
>>     return (void *) INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
>>
>>   MMAP_PREPARE (addr, len, prot, flags, fd, offset);
>>   return mmap64_call (addr, len, prot, flags, fd, offset);
>> }
> 
> With this, a definition of mmap64_call still needs a working definition
> of MMAP2_PAGE_UNIT for mmap2 targets, I think.

We can add the MMAP2_PAGE_UNIT as an additional argument for mmap64_call
and use a valid but unused value.  I think the compiler would see that we
are using 0 as the offset argument and optimize the division away.
diff mbox series

Patch

diff --git a/csu/libc-tls.c b/csu/libc-tls.c
index bef92a7568..0a216c5502 100644
--- a/csu/libc-tls.c
+++ b/csu/libc-tls.c
@@ -145,11 +145,16 @@  __libc_setup_tls (void)
      _dl_allocate_tls_storage (in elf/dl-tls.c) does using __libc_memalign
      and dl_tls_static_align.  */
   tcb_offset = roundup (memsz + GLRO(dl_tls_static_surplus), max_align);
-  tlsblock = __sbrk (tcb_offset + TLS_INIT_TCB_SIZE + max_align);
+  tlsblock = _dl_early_allocate (tcb_offset + TLS_INIT_TCB_SIZE + max_align);
+  if (tlsblock == NULL)
+    _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n");
 #elif TLS_DTV_AT_TP
   tcb_offset = roundup (TLS_INIT_TCB_SIZE, align ?: 1);
-  tlsblock = __sbrk (tcb_offset + memsz + max_align
-		     + TLS_PRE_TCB_SIZE + GLRO(dl_tls_static_surplus));
+  tlsblock = _dl_early_allocate (tcb_offset + memsz + max_align
+				 + TLS_PRE_TCB_SIZE
+				 + GLRO(dl_tls_static_surplus));
+  if (tlsblock == NULL)
+    _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n");
   tlsblock += TLS_PRE_TCB_SIZE;
 #else
   /* In case a model with a different layout for the TCB and DTV
diff --git a/elf/Makefile b/elf/Makefile
index fc9860edee..ce3345ed92 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -33,6 +33,7 @@  routines = \
   $(all-dl-routines) \
   dl-addr \
   dl-addr-obj \
+  dl-early_allocate \
   dl-error \
   dl-iteratephdr \
   dl-libc \
@@ -108,6 +109,7 @@  all-dl-routines = $(dl-routines) $(sysdep-dl-routines)
 # But they are absent from the shared libc, because that code is in ld.so.
 elide-routines.os = \
   $(all-dl-routines) \
+  dl-early_allocate \
   dl-exception \
   dl-origin \
   dl-reloc-static-pie \
@@ -276,6 +278,7 @@  tests-static-normal := \
   tst-linkall-static \
   tst-single_threaded-pthread-static \
   tst-single_threaded-static \
+  tst-tls-allocation-failure-static \
   tst-tlsalign-extern-static \
   tst-tlsalign-static \
   # tests-static-normal
@@ -1213,6 +1216,10 @@  $(objpfx)tst-glibcelf.out: tst-glibcelf.py elf.h $(..)/scripts/glibcelf.py \
           --cc="$(CC) $(patsubst -DMODULE_NAME=%,-DMODULE_NAME=testsuite,$(CPPFLAGS))" \
 	  < /dev/null > $@ 2>&1; $(evaluate-test)
 
+ifeq ($(run-built-tests),yes)
+tests-special += $(objpfx)tst-tls-allocation-failure-static-patched.out
+endif
+
 # The test requires shared _and_ PIE because the executable
 # unit test driver must be able to link with the shared object
 # that is going to eventually go into an installed DSO.
@@ -2937,3 +2944,15 @@  $(eval $(call tst-trace-skeleton,4,\
 	$(objpfx)libtracemod2:$(objpfx)libtracemod3:$(objpfx)libtracemod4))
 $(eval $(call tst-trace-skeleton,5,\
 	$(objpfx)libtracemod2:$(objpfx)libtracemod3:$(objpfx)libtracemod4:$(objpfx)libtracemod5))
+
+$(objpfx)tst-tls-allocation-failure-static-patched: \
+  $(objpfx)tst-tls-allocation-failure-static $(..)scripts/tst-elf-edit.py
+	cp $< $@
+	$(PYTHON) $(..)scripts/tst-elf-edit.py --maximize-tls-size $@
+
+$(objpfx)tst-tls-allocation-failure-static-patched.out: \
+  $(objpfx)tst-tls-allocation-failure-static-patched
+	$< > $@ 2>&1; echo "status: $$?" >> $@
+	grep -q '^Fatal glibc error: Cannot allocate TLS block$$' $@ \
+	  && grep -q '^status: 127$$' $@; \
+	  $(evaluate-test)
diff --git a/elf/dl-early_allocate.c b/elf/dl-early_allocate.c
new file mode 100644
index 0000000000..61677aaa03
--- /dev/null
+++ b/elf/dl-early_allocate.c
@@ -0,0 +1,30 @@ 
+/* Early memory allocation for the dynamic loader.  Generic version.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <ldsodefs.h>
+#include <stddef.h>
+#include <unistd.h>
+
+void *
+_dl_early_allocate (size_t size)
+{
+  void *result = __sbrk (size);
+  if (result == (void *) -1)
+    result = NULL;
+  return result;
+}
diff --git a/elf/tst-tls-allocation-failure-static.c b/elf/tst-tls-allocation-failure-static.c
new file mode 100644
index 0000000000..8de831b246
--- /dev/null
+++ b/elf/tst-tls-allocation-failure-static.c
@@ -0,0 +1,31 @@ 
+/* Base for test program with impossibly large PT_TLS segment.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* The actual test binary is patched using scripts/tst-elf-edit.py
+   --maximize-tls-size, and this introduces the expected test
+   allocation failure due to an excessive PT_TLS p_memsz value.
+
+   Patching the binary is required because on some 64-bit targets, TLS
+   relocations can only cover a 32-bit range, and glibc-internal TLS
+   variables such as errno end up outside that range.  */
+
+int
+main (void)
+{
+  return 0;
+}
diff --git a/scripts/tst-elf-edit.py b/scripts/tst-elf-edit.py
index a514179bbf..0e19ce1e73 100644
--- a/scripts/tst-elf-edit.py
+++ b/scripts/tst-elf-edit.py
@@ -43,9 +43,11 @@  EI_DATA=5
 ELFDATA2LSB=b'\x01'
 ELFDATA2MSB=b'\x02'
 
+ET_EXEC=2
 ET_DYN=3
 
 PT_LOAD=1
+PT_TLS=7
 
 def elf_types_fmts(e_ident):
     endian = '<' if e_ident[EI_DATA] == ELFDATA2LSB else '>'
@@ -146,8 +148,15 @@  def elf_edit_align(phdr, align):
     else:
         phdr.p_align = int(align)
 
+def elf_edit_maximize_tls_size(phdr, elfclass):
+    if elfclass == ELFCLASS32:
+        # It is possible that the kernel can allocate half of the
+        # address space, so use something larger.
+        phdr.p_memsz = 0xfff00000
+    else:
+        phdr.p_memsz = 1 << 63
 
-def elf_edit(f, align):
+def elf_edit(f, opts):
     ei_nident_fmt = 'c' * EI_NIDENT
     ei_nident_len = struct.calcsize(ei_nident_fmt)
 
@@ -172,24 +181,35 @@  def elf_edit(f, align):
 
     ehdr = Elf_Ehdr(e_ident)
     ehdr.read(f)
-    if ehdr.e_type != ET_DYN:
-       error('{}: not a shared library'.format(f.name))
+    if ehdr.e_type not in (ET_EXEC, ET_DYN):
+       error('{}: not an executable or shared library'.format(f.name))
 
     phdr = Elf_Phdr(e_ident)
+    maximize_tls_size_done = False
     for i in range(0, ehdr.e_phnum):
         f.seek(ehdr.e_phoff + i * phdr.len)
         phdr.read(f)
-        if phdr.p_type == PT_LOAD:
-            elf_edit_align(phdr, align)
+        if phdr.p_type == PT_LOAD and opts.align is not None:
+            elf_edit_align(phdr, opts.align)
+            f.seek(ehdr.e_phoff + i * phdr.len)
+            phdr.write(f)
+            break
+        if phdr.p_type == PT_TLS and opts.maximize_tls_size:
+            elf_edit_maximize_tls_size(phdr, e_ident[EI_CLASS])
             f.seek(ehdr.e_phoff + i * phdr.len)
             phdr.write(f)
+            maximize_tls_size_done = True
             break
 
+    if opts.maximize_tls_size and not maximize_tls_size_done:
+        error('{}: TLS maximum size was not updated'.format(f.name))
 
 def get_parser():
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument('-a', dest='align', required=True,
+    parser.add_argument('-a', dest='align',
                         help='How to set the LOAD alignment')
+    parser.add_argument('--maximize-tls-size', action='store_true',
+                        help='Set maximum PT_TLS size')
     parser.add_argument('output',
                         help='ELF file to edit')
     return parser
@@ -199,7 +219,7 @@  def main(argv):
     parser = get_parser()
     opts = parser.parse_args(argv)
     with open(opts.output, 'r+b') as fout:
-       elf_edit(fout, opts.align)
+       elf_edit(fout, opts)
 
 
 if __name__ == '__main__':
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index 4a5e698db2..5d0369358d 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -1211,6 +1211,11 @@  extern struct link_map * _dl_get_dl_main_map (void)
 # endif
 #endif
 
+/* Perform early memory allocation, avoiding a TCB dependency.
+   Returns NULL if allocation fails.  May attempt to use
+   brk.  */
+void *_dl_early_allocate (size_t size) attribute_hidden;
+
 /* Initialize the DSO sort algorithm to use.  */
 #if !HAVE_TUNABLES
 static inline void
diff --git a/sysdeps/unix/sysv/linux/dl-early_allocate.c b/sysdeps/unix/sysv/linux/dl-early_allocate.c
new file mode 100644
index 0000000000..2c77b6e3ff
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/dl-early_allocate.c
@@ -0,0 +1,81 @@ 
+/* Early memory allocation for the dynamic loader.  Linux version.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Mark symbols hidden in static PIE for early self relocation to work.  */
+#if BUILD_PIE_DEFAULT
+# pragma GCC visibility push(hidden)
+#endif
+#include <startup.h>
+
+#include <ldsodefs.h>
+#include <stddef.h>
+#include <string.h>
+#include <sysdep.h>
+#include <unistd.h>
+
+#include <mmap_call.h>
+
+/* Defined in brk.c.  */
+extern void *__curbrk;
+
+void *
+_dl_early_allocate (size_t size)
+{
+  void *result;
+
+  if (__curbrk != NULL)
+    /* If the break has been initialized, brk must have run before,
+       so just call it once more.  */
+    {
+      result = __sbrk (size);
+      if (result == (void *) -1)
+        result = NULL;
+    }
+  else
+    {
+      /* If brk has not been invoked, there is no need to update
+         __curbrk.  The first call to brk will take care of that.  */
+      void *previous = (void *) INTERNAL_SYSCALL_CALL (brk, 0);
+      result = (void *) INTERNAL_SYSCALL_CALL (brk, previous + size);
+      if (result == previous)
+        result = NULL;
+      else
+        result = previous;
+    }
+
+  /* If brk fails, fall back to mmap.  This can happen due to
+     unfortunate ASLR layout decisions and kernel bugs, particularly
+     for static PIE.  */
+  if (result == NULL)
+    {
+      long int ret;
+      int prot = PROT_READ | PROT_WRITE;
+      int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+#ifdef __NR_mmap2
+      ret = MMAP_CALL_INTERNAL (mmap2, 0, size, prot, flags, -1, 0);
+#else
+      ret = MMAP_CALL_INTERNAL (mmap, 0, size, prot, flags, -1, 0);
+#endif
+      if (INTERNAL_SYSCALL_ERROR_P (ret))
+        result = NULL;
+      else
+        result = (void *) ret;
+    }
+
+  return result;
+}