Message ID | 20211113094525.96299-4-maxim.blinov@embecosm.com |
---|---|
State | New |
Headers | show |
Series | [1/4] Generate off-stack nested function trampolines | expand |
Hi all, apologies for forgetting to add the cover letter. The motivation of this work is to provide (limited) support for GCC nested function trampolines on targets that do not have an executable stack. This code has been (roughly) tested by creating several thousand nested functions (i.e. enough to force allocation of a new page), making sure all the nested functions execute correctly, and consequently returning back up and ensuring that the pages are freed when there are no more active trampolines in them. I was provided the initial design and prototype implementation by Andrew Burgess, and have since refactored the code to support allocation/deallocation of trampoline pages, and added AArch64 Linux/Darwin support. One of the limitations of the implementation in its current state is the inability to track longjmps. There has been some discussion about instrumenting calls to setjmp/longjmp so that the state of trampolines is correctly tracked and freed when necessary, however that hasn't been worked on yet. On Sat, 13 Nov 2021 at 09:45, Maxim Blinov <maxim.blinov@embecosm.com> wrote: > > Note: This patch is not yet ready for trunk as its dependent on some > patches that are not-yet-upstream, however it serves as motivation for > the previous patch(es) which are independent. > > ---- > > Implement the __builtin_nested_func_ptr_{created,deleted} functions > for the aarch64-darwin platform. For this platform > --enable-off-stack-trampolines is enabled by default, and > -foff-stack-trampolines is enabled by default if the host MacOS > operating system version is 11.x or greater. > > Co-authored-by: Andrew Burgess <andrew.burgess@embecosm.com> > > libgcc/ChangeLog: > > * config/aarch64/heap-trampoline.c (allocate_trampoline_page): > Request for MAP_JIT in the case of __APPLE__. > Provide __APPLE__ variant of aarch64_trampoline_insns that uses > x16 as the chain pointer. > (__builtin_nested_func_ptr_created): Call > pthread_jit_write_protect_np() to toggle read/write permission on > page. > * config.host (aarch64*-*darwin* | arm64*-*darwin*): Handle > --enable-off-stack-trampolines. > * configure.ac (--enable-off-stack-trampolines): Permit setting > for target aarch64*-*darwin* | arm64*-*darwin*, and set default to > enabled. > * configure: Regenerate. > --- > gcc/config.gcc | 7 +++++ > libgcc/config.host | 4 +++ > libgcc/config/aarch64/heap-trampoline.c | 36 +++++++++++++++++++++++++ > libgcc/configure | 6 +++++ > libgcc/configure.ac | 6 +++++ > 5 files changed, 59 insertions(+) > > diff --git a/gcc/config.gcc b/gcc/config.gcc > index 031be563c5d..c13f7629d44 100644 > --- a/gcc/config.gcc > +++ b/gcc/config.gcc > @@ -1072,6 +1072,13 @@ esac > > # Figure out if we need to enable -foff-stack-trampolines by default. > case ${target} in > +aarch64*-*darwin* | arm64*-*darwin*) > + if test ${macos_maj} = 11 || test ${macos_maj} = 12; then > + tm_defines="$tm_defines OFF_STACK_TRAMPOLINES_INIT=1" > + else > + tm_defines="$tm_defines OFF_STACK_TRAMPOLINES_INIT=0" > + fi > + ;; > *) > tm_defines="$tm_defines OFF_STACK_TRAMPOLINES_INIT=0" > ;; > diff --git a/libgcc/config.host b/libgcc/config.host > index d1a491d27e7..3c536b0928a 100644 > --- a/libgcc/config.host > +++ b/libgcc/config.host > @@ -414,6 +414,10 @@ aarch64*-*darwin* | arm64*-*darwin* ) > tmake_file="${tmake_file} t-crtfm" > # No soft float for now because our long double is DF not TF. > md_unwind_header=aarch64/aarch64-unwind.h > + if test x$off_stack_trampolines = xyes; then > + extra_parts="$extra_parts heap-trampoline.o" > + tmake_file="${tmake_file} ${cpu_type}/t-heap-trampoline" > + fi > ;; > aarch64*-*-freebsd*) > extra_parts="$extra_parts crtfastmath.o" > diff --git a/libgcc/config/aarch64/heap-trampoline.c b/libgcc/config/aarch64/heap-trampoline.c > index 721a2bed400..6994602beaf 100644 > --- a/libgcc/config/aarch64/heap-trampoline.c > +++ b/libgcc/config/aarch64/heap-trampoline.c > @@ -5,6 +5,9 @@ > #include <stdio.h> > #include <string.h> > > +/* For pthread_jit_write_protect_np */ > +#include <pthread.h> > + > void *allocate_trampoline_page (void); > int get_trampolines_per_page (void); > struct tramp_ctrl_data *allocate_tramp_ctrl (struct tramp_ctrl_data *parent); > @@ -43,8 +46,15 @@ allocate_trampoline_page (void) > { > void *page; > > +#if defined(__gnu_linux__) > page = mmap (0, getpagesize (), PROT_WRITE | PROT_EXEC, > MAP_ANON | MAP_PRIVATE, 0, 0); > +#elif defined(__APPLE__) > + page = mmap (0, getpagesize (), PROT_WRITE | PROT_EXEC, > + MAP_ANON | MAP_PRIVATE | MAP_JIT, 0, 0); > +#else > + page = MAP_FAILED; > +#endif > > return page; > } > @@ -67,6 +77,7 @@ allocate_tramp_ctrl (struct tramp_ctrl_data *parent) > return p; > } > > +#if defined(__gnu_linux__) > static const uint32_t aarch64_trampoline_insns[] = { > 0xd503245f, /* hint 34 */ > 0x580000b1, /* ldr x17, .+20 */ > @@ -76,6 +87,20 @@ static const uint32_t aarch64_trampoline_insns[] = { > 0xd5033fdf /* isb */ > }; > > +#elif defined(__APPLE__) > +static const uint32_t aarch64_trampoline_insns[] = { > + 0xd503245f, /* hint 34 */ > + 0x580000b1, /* ldr x17, .+20 */ > + 0x580000d0, /* ldr x16, .+24 */ > + 0xd61f0220, /* br x17 */ > + 0xd5033f9f, /* dsb sy */ > + 0xd5033fdf /* isb */ > +}; > + > +#else > +#error "Unsupported AArch64 platform for heap trampolines" > +#endif > + > void > __builtin_nested_func_ptr_created (void *chain, void *func, void **dst) > { > @@ -99,11 +124,22 @@ __builtin_nested_func_ptr_created (void *chain, void *func, void **dst) > = &tramp_ctrl_curr->trampolines[get_trampolines_per_page () > - tramp_ctrl_curr->free_trampolines]; > > +#if defined(__APPLE__) > + /* Disable write protection for the MAP_JIT regions in this thread (see > + https://developer.apple.com/documentation/apple-silicon/porting-just-in-time-compilers-to-apple-silicon) */ > + pthread_jit_write_protect_np (0); > +#endif > + > memcpy (trampoline->insns, aarch64_trampoline_insns, > sizeof(aarch64_trampoline_insns)); > trampoline->func_ptr = func; > trampoline->chain_ptr = chain; > > +#if defined(__APPLE__) > + /* Re-enable write protection. */ > + pthread_jit_write_protect_np (1); > +#endif > + > tramp_ctrl_curr->free_trampolines -= 1; > > __builtin___clear_cache ((void *)trampoline->insns, > diff --git a/libgcc/configure b/libgcc/configure > index 5abcea8bed3..35cdf8f8c05 100755 > --- a/libgcc/configure > +++ b/libgcc/configure > @@ -2267,6 +2267,9 @@ case "$target" in > aarch64*-*-linux* ) > off_stack_trampolines=$enableval > ;; > + aarch64*-*darwin* | arm64*-*darwin* ) > + off_stack_trampolines=$enableval > + ;; > *) > as_fn_error $? "Configure option --enable-off-stack-trampolines is not supported \ > for this platform" "$LINENO" 5 > @@ -2276,6 +2279,9 @@ esac > else > > case "$target" in > + aarch64*-*darwin* | arm64*-*darwin* ) > + off_stack_trampolines=yes > + ;; > *) > off_stack_trampolines=no > ;; > diff --git a/libgcc/configure.ac b/libgcc/configure.ac > index c6eaceec957..3b129f1a4b8 100644 > --- a/libgcc/configure.ac > +++ b/libgcc/configure.ac > @@ -78,6 +78,9 @@ case "$target" in > aarch64*-*-linux* ) > off_stack_trampolines=$enableval > ;; > + aarch64*-*darwin* | arm64*-*darwin* ) > + off_stack_trampolines=$enableval > + ;; > *) > AC_MSG_ERROR([Configure option --enable-off-stack-trampolines is not supported \ > for this platform]) > @@ -85,6 +88,9 @@ for this platform]) > ;; > esac],[ > case "$target" in > + aarch64*-*darwin* | arm64*-*darwin* ) > + off_stack_trampolines=yes > + ;; > *) > off_stack_trampolines=no > ;; > -- > 2.30.1 (Apple Git-130) >
On 11/22/2021 7:49 AM, Maxim Blinov wrote: > Hi all, apologies for forgetting to add the cover letter. No worries. I'd already assumed this was to support aarch64 trampolines on darwin by having them live elsewere as managed entities. > > The motivation of this work is to provide (limited) support for GCC > nested function trampolines on targets that do not have an executable > stack. This code has been (roughly) tested by creating several > thousand nested functions (i.e. enough to force allocation of a new > page), making sure all the nested functions execute correctly, and > consequently returning back up and ensuring that the pages are freed > when there are no more active trampolines in them. Right. I'm looking at this wondering if we should do something similar for our new architecture. Avoiding executable stacks is a good thing :-) > One of the limitations of the implementation in its current state is > the inability to track longjmps. There has been some discussion about > instrumenting calls to setjmp/longjmp so that the state of trampolines > is correctly tracked and freed when necessary, however that hasn't > been worked on yet. So in the longjmp case, we just leak trampolines, right? I'd think that should be quite uncommon. It'd be nice to fix, but the benefits of non-executable stacks may ultimately be enough to overcome the limitation. The other question is why not do a scheme similar to what Ada does with function descriptors? Is that not feasible for some reason? I realize that hasn't been plumbed into the C/C++ compilers, but it may be another viable option. Jeff
> On 3 Dec 2021, at 03:12, Jeff Law <jeffreyalaw@gmail.com> wrote: > > > > On 11/22/2021 7:49 AM, Maxim Blinov wrote: >> Hi all, apologies for forgetting to add the cover letter. > No worries. I'd already assumed this was to support aarch64 trampolines on darwin by having them live elsewere as managed entities. > >> >> The motivation of this work is to provide (limited) support for GCC >> nested function trampolines on targets that do not have an executable >> stack. This code has been (roughly) tested by creating several >> thousand nested functions (i.e. enough to force allocation of a new >> page), making sure all the nested functions execute correctly, and >> consequently returning back up and ensuring that the pages are freed >> when there are no more active trampolines in them. > Right. I'm looking at this wondering if we should do something similar for our new architecture. Avoiding executable stacks is a good thing :-) >> One of the limitations of the implementation in its current state is >> the inability to track longjmps. There has been some discussion about >> instrumenting calls to setjmp/longjmp so that the state of trampolines >> is correctly tracked and freed when necessary, however that hasn't >> been worked on yet. > So in the longjmp case, we just leak trampolines, right? I'd think that should be quite uncommon. It'd be nice to fix, but the benefits of non-executable stacks may ultimately be enough to overcome the limitation. > > The other question is why not do a scheme similar to what Ada does with function descriptors? Is that not feasible for some reason? I realize that hasn't been plumbed into the C/C++ compilers, but it may be another viable option. The problem is that it breaks ABI ;) [in a function ptr] we need an address bit to test to determine if we are handling a case which has the descriptor, or if we have a regular indirect call. Unfortunately, although aarch64 aligns functions to 4 bytes, the two lower bits are reserved by Arm and therefore we’d have to force function alignment to 8bytes and that’s an ABI break (that cannot reasonably be expected to happen) for aarch64-darwin (or any other arch that has a release in the wild, I’d imagine). (FWIW, This is what I’ve currently implemented on my development branch [not for C++, since that has no nested functions]. I implemented the change for Fortran and re-used Martin Uecker’s proposed C impl. - but I used one of the reserved address bits [as a work-around to get people going with the port]) Here’s the thread discussing the situation when Martin proposed the change for C. https://gcc.gnu.org/legacy-ml/gcc-patches/2018-12/msg01532.html Iain
On 12/3/2021 12:53 AM, Iain Sandoe wrote: > >> On 3 Dec 2021, at 03:12, Jeff Law <jeffreyalaw@gmail.com> wrote: >> >> >> >> On 11/22/2021 7:49 AM, Maxim Blinov wrote: >>> Hi all, apologies for forgetting to add the cover letter. >> No worries. I'd already assumed this was to support aarch64 trampolines on darwin by having them live elsewere as managed entities. >> >>> The motivation of this work is to provide (limited) support for GCC >>> nested function trampolines on targets that do not have an executable >>> stack. This code has been (roughly) tested by creating several >>> thousand nested functions (i.e. enough to force allocation of a new >>> page), making sure all the nested functions execute correctly, and >>> consequently returning back up and ensuring that the pages are freed >>> when there are no more active trampolines in them. >> Right. I'm looking at this wondering if we should do something similar for our new architecture. Avoiding executable stacks is a good thing :-) >>> One of the limitations of the implementation in its current state is >>> the inability to track longjmps. There has been some discussion about >>> instrumenting calls to setjmp/longjmp so that the state of trampolines >>> is correctly tracked and freed when necessary, however that hasn't >>> been worked on yet. >> So in the longjmp case, we just leak trampolines, right? I'd think that should be quite uncommon. It'd be nice to fix, but the benefits of non-executable stacks may ultimately be enough to overcome the limitation. >> >> The other question is why not do a scheme similar to what Ada does with function descriptors? Is that not feasible for some reason? I realize that hasn't been plumbed into the C/C++ compilers, but it may be another viable option. > The problem is that it breaks ABI ;) It was worth asking :-) > > [in a function ptr] we need an address bit to test to determine if we are handling a case which has the descriptor, or if we have a regular indirect call. > > Unfortunately, although aarch64 aligns functions to 4 bytes, the two lower bits are reserved by Arm and therefore we’d have to force function alignment to 8bytes and that’s an ABI break (that cannot reasonably be expected to happen) for aarch64-darwin (or any other arch that has a release in the wild, I’d imagine). > > (FWIW, This is what I’ve currently implemented on my development branch [not for C++, since that has no nested functions]. I implemented the change for Fortran and re-used Martin Uecker’s proposed C impl. - but I used one of the reserved address bits [as a work-around to get people going with the port]) Good to know. I keep thinking I should revamp how our port handles nested functions. Right now we're using the tried and true trampolines, but we've got low bits available, so we could go with function descriptor approach. Or we could go with trampolines in mmap'd space approach. I just don't want to use old fashioned trampolines :-) > > Here’s the thread discussing the situation when Martin proposed the change for C. > > https://gcc.gnu.org/legacy-ml/gcc-patches/2018-12/msg01532.html Yea, I vaguely remember the thread. ABI stability is a pain :( jeff
diff --git a/gcc/config.gcc b/gcc/config.gcc index 031be563c5d..c13f7629d44 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -1072,6 +1072,13 @@ esac # Figure out if we need to enable -foff-stack-trampolines by default. case ${target} in +aarch64*-*darwin* | arm64*-*darwin*) + if test ${macos_maj} = 11 || test ${macos_maj} = 12; then + tm_defines="$tm_defines OFF_STACK_TRAMPOLINES_INIT=1" + else + tm_defines="$tm_defines OFF_STACK_TRAMPOLINES_INIT=0" + fi + ;; *) tm_defines="$tm_defines OFF_STACK_TRAMPOLINES_INIT=0" ;; diff --git a/libgcc/config.host b/libgcc/config.host index d1a491d27e7..3c536b0928a 100644 --- a/libgcc/config.host +++ b/libgcc/config.host @@ -414,6 +414,10 @@ aarch64*-*darwin* | arm64*-*darwin* ) tmake_file="${tmake_file} t-crtfm" # No soft float for now because our long double is DF not TF. md_unwind_header=aarch64/aarch64-unwind.h + if test x$off_stack_trampolines = xyes; then + extra_parts="$extra_parts heap-trampoline.o" + tmake_file="${tmake_file} ${cpu_type}/t-heap-trampoline" + fi ;; aarch64*-*-freebsd*) extra_parts="$extra_parts crtfastmath.o" diff --git a/libgcc/config/aarch64/heap-trampoline.c b/libgcc/config/aarch64/heap-trampoline.c index 721a2bed400..6994602beaf 100644 --- a/libgcc/config/aarch64/heap-trampoline.c +++ b/libgcc/config/aarch64/heap-trampoline.c @@ -5,6 +5,9 @@ #include <stdio.h> #include <string.h> +/* For pthread_jit_write_protect_np */ +#include <pthread.h> + void *allocate_trampoline_page (void); int get_trampolines_per_page (void); struct tramp_ctrl_data *allocate_tramp_ctrl (struct tramp_ctrl_data *parent); @@ -43,8 +46,15 @@ allocate_trampoline_page (void) { void *page; +#if defined(__gnu_linux__) page = mmap (0, getpagesize (), PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE, 0, 0); +#elif defined(__APPLE__) + page = mmap (0, getpagesize (), PROT_WRITE | PROT_EXEC, + MAP_ANON | MAP_PRIVATE | MAP_JIT, 0, 0); +#else + page = MAP_FAILED; +#endif return page; } @@ -67,6 +77,7 @@ allocate_tramp_ctrl (struct tramp_ctrl_data *parent) return p; } +#if defined(__gnu_linux__) static const uint32_t aarch64_trampoline_insns[] = { 0xd503245f, /* hint 34 */ 0x580000b1, /* ldr x17, .+20 */ @@ -76,6 +87,20 @@ static const uint32_t aarch64_trampoline_insns[] = { 0xd5033fdf /* isb */ }; +#elif defined(__APPLE__) +static const uint32_t aarch64_trampoline_insns[] = { + 0xd503245f, /* hint 34 */ + 0x580000b1, /* ldr x17, .+20 */ + 0x580000d0, /* ldr x16, .+24 */ + 0xd61f0220, /* br x17 */ + 0xd5033f9f, /* dsb sy */ + 0xd5033fdf /* isb */ +}; + +#else +#error "Unsupported AArch64 platform for heap trampolines" +#endif + void __builtin_nested_func_ptr_created (void *chain, void *func, void **dst) { @@ -99,11 +124,22 @@ __builtin_nested_func_ptr_created (void *chain, void *func, void **dst) = &tramp_ctrl_curr->trampolines[get_trampolines_per_page () - tramp_ctrl_curr->free_trampolines]; +#if defined(__APPLE__) + /* Disable write protection for the MAP_JIT regions in this thread (see + https://developer.apple.com/documentation/apple-silicon/porting-just-in-time-compilers-to-apple-silicon) */ + pthread_jit_write_protect_np (0); +#endif + memcpy (trampoline->insns, aarch64_trampoline_insns, sizeof(aarch64_trampoline_insns)); trampoline->func_ptr = func; trampoline->chain_ptr = chain; +#if defined(__APPLE__) + /* Re-enable write protection. */ + pthread_jit_write_protect_np (1); +#endif + tramp_ctrl_curr->free_trampolines -= 1; __builtin___clear_cache ((void *)trampoline->insns, diff --git a/libgcc/configure b/libgcc/configure index 5abcea8bed3..35cdf8f8c05 100755 --- a/libgcc/configure +++ b/libgcc/configure @@ -2267,6 +2267,9 @@ case "$target" in aarch64*-*-linux* ) off_stack_trampolines=$enableval ;; + aarch64*-*darwin* | arm64*-*darwin* ) + off_stack_trampolines=$enableval + ;; *) as_fn_error $? "Configure option --enable-off-stack-trampolines is not supported \ for this platform" "$LINENO" 5 @@ -2276,6 +2279,9 @@ esac else case "$target" in + aarch64*-*darwin* | arm64*-*darwin* ) + off_stack_trampolines=yes + ;; *) off_stack_trampolines=no ;; diff --git a/libgcc/configure.ac b/libgcc/configure.ac index c6eaceec957..3b129f1a4b8 100644 --- a/libgcc/configure.ac +++ b/libgcc/configure.ac @@ -78,6 +78,9 @@ case "$target" in aarch64*-*-linux* ) off_stack_trampolines=$enableval ;; + aarch64*-*darwin* | arm64*-*darwin* ) + off_stack_trampolines=$enableval + ;; *) AC_MSG_ERROR([Configure option --enable-off-stack-trampolines is not supported \ for this platform]) @@ -85,6 +88,9 @@ for this platform]) ;; esac],[ case "$target" in + aarch64*-*darwin* | arm64*-*darwin* ) + off_stack_trampolines=yes + ;; *) off_stack_trampolines=no ;;