diff mbox series

tcg: Fix execution on Apple Silicon

Message ID 20210102122101.39617-1-r.bolshakov@yadro.com
State New
Headers show
Series tcg: Fix execution on Apple Silicon | expand

Commit Message

Roman Bolshakov Jan. 2, 2021, 12:21 p.m. UTC
Pages can't be both write and executable at the same time on Apple
Silicon. macOS provides public API to switch write protection [1] for
JIT applications, like TCG.

1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon

Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
---

Happy holidays, everyone.

This is somewhat similar to https://patchwork.kernel.org/project/qemu-devel/patch/20201108232425.1705-7-j@getutm.app/
but I couldn't apply the series so I started from scratch.

The primary difference from the patch above is that public API is used.
Other differences:
  * TB pages are mostly kept write-locked except around tcg_qemu_tb_exec()
  * x86_64 macOS doesn't use MAP_JIT and W^X switches

Regards,
Roman

 accel/tcg/cpu-exec.c      | 10 ++++++++++
 accel/tcg/translate-all.c | 26 ++++++++++++++++++++++++++
 include/exec/exec-all.h   |  2 ++
 tcg/tcg.c                 |  1 +
 4 files changed, 39 insertions(+)

Comments

no-reply@patchew.org Jan. 2, 2021, 12:39 p.m. UTC | #1
Patchew URL: https://patchew.org/QEMU/20210102122101.39617-1-r.bolshakov@yadro.com/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Type: series
Message-id: 20210102122101.39617-1-r.bolshakov@yadro.com
Subject: [PATCH] tcg: Fix execution on Apple Silicon

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git rev-parse base > /dev/null || exit 0
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 - [tag update]      patchew/20201215064200.28751-1-jiaxun.yang@flygoat.com -> patchew/20201215064200.28751-1-jiaxun.yang@flygoat.com
 - [tag update]      patchew/20201225201956.692861-1-richard.henderson@linaro.org -> patchew/20201225201956.692861-1-richard.henderson@linaro.org
 * [new tag]         patchew/20210102122101.39617-1-r.bolshakov@yadro.com -> patchew/20210102122101.39617-1-r.bolshakov@yadro.com
Switched to a new branch 'test'
3286c7e tcg: Fix execution on Apple Silicon

=== OUTPUT BEGIN ===
WARNING: architecture specific defines should be avoided
#80: FILE: accel/tcg/translate-all.c:1075:
+#if defined(__APPLE__) && defined(__aarch64__)

WARNING: architecture specific defines should be avoided
#101: FILE: accel/tcg/translate-all.c:2731:
+#if defined(__APPLE__) && defined(__aarch64__)

ERROR: space required before the open brace '{'
#104: FILE: accel/tcg/translate-all.c:2734:
+    if (pthread_jit_write_protect_supported_np()){

total: 1 errors, 2 warnings, 104 lines checked

Commit 3286c7e4aef9 (tcg: Fix execution on Apple Silicon) has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
=== OUTPUT END ===

Test command exited with code: 1


The full log is available at
http://patchew.org/logs/20210102122101.39617-1-r.bolshakov@yadro.com/testing.checkpatch/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-devel@redhat.com
Roman Bolshakov Jan. 2, 2021, 4:13 p.m. UTC | #2
On Sat, Jan 02, 2021 at 03:21:02PM +0300, Roman Bolshakov wrote:
> Pages can't be both write and executable at the same time on Apple
> Silicon. macOS provides public API to switch write protection [1] for
> JIT applications, like TCG.
> 
> 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
> 
> Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> ---
> 
> Happy holidays, everyone.
> 
> This is somewhat similar to https://patchwork.kernel.org/project/qemu-devel/patch/20201108232425.1705-7-j@getutm.app/
> but I couldn't apply the series so I started from scratch.
> 
> The primary difference from the patch above is that public API is used.
> Other differences:
>   * TB pages are mostly kept write-locked except around tcg_qemu_tb_exec()
>   * x86_64 macOS doesn't use MAP_JIT and W^X switches
> 
> Regards,
> Roman
> 
>  accel/tcg/cpu-exec.c      | 10 ++++++++++
>  accel/tcg/translate-all.c | 26 ++++++++++++++++++++++++++
>  include/exec/exec-all.h   |  2 ++
>  tcg/tcg.c                 |  1 +
>  4 files changed, 39 insertions(+)
> 
> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> index 8689c54499..0042fc9f2b 100644
> --- a/accel/tcg/cpu-exec.c
> +++ b/accel/tcg/cpu-exec.c
> @@ -175,7 +175,9 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
>      }
>  #endif /* DEBUG_DISAS */
>  
> +    tb_write_lock();
>      ret = tcg_qemu_tb_exec(env, tb_ptr);
> +    tb_write_unlock();
>      cpu->can_do_io = 1;
>      last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
>      tb_exit = ret & TB_EXIT_MASK;
> @@ -220,9 +222,11 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
>      cflags |= MIN(max_cycles, CF_COUNT_MASK);
>  
>      mmap_lock();
> +    tb_write_unlock();
>      tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base,
>                       orig_tb->flags, cflags);
>      tb->orig_tb = orig_tb;
> +    tb_write_lock();
>      mmap_unlock();
>  
>      /* execute the generated code */
> @@ -268,7 +272,9 @@ void cpu_exec_step_atomic(CPUState *cpu)
>          tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
>          if (tb == NULL) {
>              mmap_lock();
> +            tb_write_unlock();
>              tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
> +            tb_write_lock();
>              mmap_unlock();
>          }
>  
> @@ -428,7 +434,9 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
>      tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
>      if (tb == NULL) {
>          mmap_lock();
> +        tb_write_unlock();
>          tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
> +        tb_write_lock();
>          mmap_unlock();
>          /* We add the TB in the virtual pc hash table for the fast lookup */
>          qatomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
> @@ -444,7 +452,9 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
>  #endif
>      /* See if we can patch the calling TB. */
>      if (last_tb) {
> +        tb_write_unlock();
>          tb_add_jump(last_tb, tb_exit, tb);
> +        tb_write_lock();
>      }
>      return tb;
>  }
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index b7d50a73d4..1562076ffb 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
>      size_t size = tcg_ctx->code_gen_buffer_size;
>      void *buf;
>  
> +#if defined(__APPLE__) && defined(__aarch64__)
> +    flags |= MAP_JIT;
> +#endif
>      buf = mmap(NULL, size, prot, flags, -1, 0);
>      if (buf == MAP_FAILED) {
>          return NULL;
> @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
>  
>  static void tb_phys_invalidate__locked(TranslationBlock *tb)
>  {
> +    tb_write_unlock();
>      do_tb_phys_invalidate(tb, true);
> +    tb_write_lock();
>  }
>  
>  /* invalidate one TB
> @@ -2722,3 +2727,24 @@ void tcg_flush_softmmu_tlb(CPUState *cs)
>      tlb_flush(cs);
>  #endif
>  }
> +
> +#if defined(__APPLE__) && defined(__aarch64__)
> +static void tb_write_protect(bool locked)
> +{
> +    if (pthread_jit_write_protect_supported_np()){
> +        pthread_jit_write_protect_np(locked);
> +    }
> +}
> +#else
> +static void tb_write_protect(bool locked) {}
> +#endif
> +
> +void tb_write_lock(void)
> +{
> +    tb_write_protect(true);
> +}
> +
> +void tb_write_unlock(void)
> +{
> +    tb_write_protect(false);
> +}
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index fab573da06..962dca0975 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -549,6 +549,8 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
>                                     target_ulong cs_base, uint32_t flags,
>                                     uint32_t cf_mask);
>  void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr);
> +void tb_write_lock(void);
> +void tb_write_unlock(void);
>  
>  /* GETPC is the true target of the return instruction that we'll execute.  */
>  #if defined(CONFIG_TCG_INTERPRETER)
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index 43c6cf8f52..303bb436bd 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
>      s->pool_labels = NULL;
>  #endif
>  
> +    tb_write_unlock();
>      /* Generate the prologue.  */
>      tcg_target_qemu_prologue(s);
>  
> -- 
> 2.29.2
> 

I've also noticed that Apple doesn't worry about sticking to particular
W^X mode:

https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch

We might also drop lock/unlock symmetry from here. E.g. we can have two
functions that switch the mode (they might be moved to util/osdep.c):

  qemu_jit_write();
  qemu_jit_execute();

Then we use them just before writing or before executing like advised on
their documentation page.

-Roman
Joelle van Dyne Jan. 2, 2021, 7:55 p.m. UTC | #3
I see you didn't touch cpu_loop_exit() and I'm curious how async
interrupts are handled. Have you tested this and it works i.e. booting
Windows 7 or Ubuntu 20.04? Also I've seen do_tb_phys_invalidate()
called both from code generation context (write unlocked) and
execution context (write locked), how does this patch differentiate
the two?

-j

On Sat, Jan 2, 2021 at 8:13 AM Roman Bolshakov <r.bolshakov@yadro.com> wrote:
>
> On Sat, Jan 02, 2021 at 03:21:02PM +0300, Roman Bolshakov wrote:
> > Pages can't be both write and executable at the same time on Apple
> > Silicon. macOS provides public API to switch write protection [1] for
> > JIT applications, like TCG.
> >
> > 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
> >
> > Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> > ---
> >
> > Happy holidays, everyone.
> >
> > This is somewhat similar to https://patchwork.kernel.org/project/qemu-devel/patch/20201108232425.1705-7-j@getutm.app/
> > but I couldn't apply the series so I started from scratch.
> >
> > The primary difference from the patch above is that public API is used.
> > Other differences:
> >   * TB pages are mostly kept write-locked except around tcg_qemu_tb_exec()
> >   * x86_64 macOS doesn't use MAP_JIT and W^X switches
> >
> > Regards,
> > Roman
> >
> >  accel/tcg/cpu-exec.c      | 10 ++++++++++
> >  accel/tcg/translate-all.c | 26 ++++++++++++++++++++++++++
> >  include/exec/exec-all.h   |  2 ++
> >  tcg/tcg.c                 |  1 +
> >  4 files changed, 39 insertions(+)
> >
> > diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> > index 8689c54499..0042fc9f2b 100644
> > --- a/accel/tcg/cpu-exec.c
> > +++ b/accel/tcg/cpu-exec.c
> > @@ -175,7 +175,9 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
> >      }
> >  #endif /* DEBUG_DISAS */
> >
> > +    tb_write_lock();
> >      ret = tcg_qemu_tb_exec(env, tb_ptr);
> > +    tb_write_unlock();
> >      cpu->can_do_io = 1;
> >      last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
> >      tb_exit = ret & TB_EXIT_MASK;
> > @@ -220,9 +222,11 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
> >      cflags |= MIN(max_cycles, CF_COUNT_MASK);
> >
> >      mmap_lock();
> > +    tb_write_unlock();
> >      tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base,
> >                       orig_tb->flags, cflags);
> >      tb->orig_tb = orig_tb;
> > +    tb_write_lock();
> >      mmap_unlock();
> >
> >      /* execute the generated code */
> > @@ -268,7 +272,9 @@ void cpu_exec_step_atomic(CPUState *cpu)
> >          tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
> >          if (tb == NULL) {
> >              mmap_lock();
> > +            tb_write_unlock();
> >              tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
> > +            tb_write_lock();
> >              mmap_unlock();
> >          }
> >
> > @@ -428,7 +434,9 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
> >      tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
> >      if (tb == NULL) {
> >          mmap_lock();
> > +        tb_write_unlock();
> >          tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
> > +        tb_write_lock();
> >          mmap_unlock();
> >          /* We add the TB in the virtual pc hash table for the fast lookup */
> >          qatomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
> > @@ -444,7 +452,9 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
> >  #endif
> >      /* See if we can patch the calling TB. */
> >      if (last_tb) {
> > +        tb_write_unlock();
> >          tb_add_jump(last_tb, tb_exit, tb);
> > +        tb_write_lock();
> >      }
> >      return tb;
> >  }
> > diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> > index b7d50a73d4..1562076ffb 100644
> > --- a/accel/tcg/translate-all.c
> > +++ b/accel/tcg/translate-all.c
> > @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
> >      size_t size = tcg_ctx->code_gen_buffer_size;
> >      void *buf;
> >
> > +#if defined(__APPLE__) && defined(__aarch64__)
> > +    flags |= MAP_JIT;
> > +#endif
> >      buf = mmap(NULL, size, prot, flags, -1, 0);
> >      if (buf == MAP_FAILED) {
> >          return NULL;
> > @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
> >
> >  static void tb_phys_invalidate__locked(TranslationBlock *tb)
> >  {
> > +    tb_write_unlock();
> >      do_tb_phys_invalidate(tb, true);
> > +    tb_write_lock();
> >  }
> >
> >  /* invalidate one TB
> > @@ -2722,3 +2727,24 @@ void tcg_flush_softmmu_tlb(CPUState *cs)
> >      tlb_flush(cs);
> >  #endif
> >  }
> > +
> > +#if defined(__APPLE__) && defined(__aarch64__)
> > +static void tb_write_protect(bool locked)
> > +{
> > +    if (pthread_jit_write_protect_supported_np()){
> > +        pthread_jit_write_protect_np(locked);
> > +    }
> > +}
> > +#else
> > +static void tb_write_protect(bool locked) {}
> > +#endif
> > +
> > +void tb_write_lock(void)
> > +{
> > +    tb_write_protect(true);
> > +}
> > +
> > +void tb_write_unlock(void)
> > +{
> > +    tb_write_protect(false);
> > +}
> > diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> > index fab573da06..962dca0975 100644
> > --- a/include/exec/exec-all.h
> > +++ b/include/exec/exec-all.h
> > @@ -549,6 +549,8 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
> >                                     target_ulong cs_base, uint32_t flags,
> >                                     uint32_t cf_mask);
> >  void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr);
> > +void tb_write_lock(void);
> > +void tb_write_unlock(void);
> >
> >  /* GETPC is the true target of the return instruction that we'll execute.  */
> >  #if defined(CONFIG_TCG_INTERPRETER)
> > diff --git a/tcg/tcg.c b/tcg/tcg.c
> > index 43c6cf8f52..303bb436bd 100644
> > --- a/tcg/tcg.c
> > +++ b/tcg/tcg.c
> > @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
> >      s->pool_labels = NULL;
> >  #endif
> >
> > +    tb_write_unlock();
> >      /* Generate the prologue.  */
> >      tcg_target_qemu_prologue(s);
> >
> > --
> > 2.29.2
> >
>
> I've also noticed that Apple doesn't worry about sticking to particular
> W^X mode:
>
> https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch
>
> We might also drop lock/unlock symmetry from here. E.g. we can have two
> functions that switch the mode (they might be moved to util/osdep.c):
>
>   qemu_jit_write();
>   qemu_jit_execute();
>
> Then we use them just before writing or before executing like advised on
> their documentation page.
>
> -Roman
Roman Bolshakov Jan. 3, 2021, 2:20 p.m. UTC | #4
On Sat, Jan 02, 2021 at 11:55:29AM -0800, Joelle van Dyne wrote:
> I see you didn't touch cpu_loop_exit() and I'm curious how async
> interrupts are handled. Have you tested this and it works i.e. booting
> Windows 7 or Ubuntu 20.04? Also I've seen do_tb_phys_invalidate()
> called both from code generation context (write unlocked) and
> execution context (write locked), how does this patch differentiate
> the two?
> 

Hi Joelle,

I used the following rule of thumb when finding places where exec/write
protection has to be lifted. If it returns EXC_BAD_ACCESS under lldb and
stack backtrace is meaningful, then a write-protected region is
accessed. If the stack couldn't be unwinded and EXC_BAD_ACCESS is
returned then the region has exec restrictions.

With the patch I wasn't able to see any EXC_BAD_ACCESS.

I've tested x86_64 Ubuntu 20.04 Desktop. It boots but it's very slow
(but faster than TCG on x86). Windows XP is much faster and quite
responsive. I also tested Windows 95. I'll test Win 7/Win 10 a bit
later.

I'm going to update v2 shortly and going to introduce assymetric changes of
permissions akin to Apple's JavaScriptCore. In v2, I'm not changing
permission back and force unless it's needed to avoid EXC_BAD_ACCESS.

Regards,
Roman

> -j
> 
> On Sat, Jan 2, 2021 at 8:13 AM Roman Bolshakov <r.bolshakov@yadro.com> wrote:
> >
> > On Sat, Jan 02, 2021 at 03:21:02PM +0300, Roman Bolshakov wrote:
> > > Pages can't be both write and executable at the same time on Apple
> > > Silicon. macOS provides public API to switch write protection [1] for
> > > JIT applications, like TCG.
> > >
> > > 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
> > >
> > > Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> > > ---
> > >
> > > Happy holidays, everyone.
> > >
> > > This is somewhat similar to https://patchwork.kernel.org/project/qemu-devel/patch/20201108232425.1705-7-j@getutm.app/
> > > but I couldn't apply the series so I started from scratch.
> > >
> > > The primary difference from the patch above is that public API is used.
> > > Other differences:
> > >   * TB pages are mostly kept write-locked except around tcg_qemu_tb_exec()
> > >   * x86_64 macOS doesn't use MAP_JIT and W^X switches
> > >
> > > Regards,
> > > Roman
> > >
> > >  accel/tcg/cpu-exec.c      | 10 ++++++++++
> > >  accel/tcg/translate-all.c | 26 ++++++++++++++++++++++++++
> > >  include/exec/exec-all.h   |  2 ++
> > >  tcg/tcg.c                 |  1 +
> > >  4 files changed, 39 insertions(+)
> > >
> > > diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> > > index 8689c54499..0042fc9f2b 100644
> > > --- a/accel/tcg/cpu-exec.c
> > > +++ b/accel/tcg/cpu-exec.c
> > > @@ -175,7 +175,9 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
> > >      }
> > >  #endif /* DEBUG_DISAS */
> > >
> > > +    tb_write_lock();
> > >      ret = tcg_qemu_tb_exec(env, tb_ptr);
> > > +    tb_write_unlock();
> > >      cpu->can_do_io = 1;
> > >      last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
> > >      tb_exit = ret & TB_EXIT_MASK;
> > > @@ -220,9 +222,11 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
> > >      cflags |= MIN(max_cycles, CF_COUNT_MASK);
> > >
> > >      mmap_lock();
> > > +    tb_write_unlock();
> > >      tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base,
> > >                       orig_tb->flags, cflags);
> > >      tb->orig_tb = orig_tb;
> > > +    tb_write_lock();
> > >      mmap_unlock();
> > >
> > >      /* execute the generated code */
> > > @@ -268,7 +272,9 @@ void cpu_exec_step_atomic(CPUState *cpu)
> > >          tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
> > >          if (tb == NULL) {
> > >              mmap_lock();
> > > +            tb_write_unlock();
> > >              tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
> > > +            tb_write_lock();
> > >              mmap_unlock();
> > >          }
> > >
> > > @@ -428,7 +434,9 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
> > >      tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
> > >      if (tb == NULL) {
> > >          mmap_lock();
> > > +        tb_write_unlock();
> > >          tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
> > > +        tb_write_lock();
> > >          mmap_unlock();
> > >          /* We add the TB in the virtual pc hash table for the fast lookup */
> > >          qatomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
> > > @@ -444,7 +452,9 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
> > >  #endif
> > >      /* See if we can patch the calling TB. */
> > >      if (last_tb) {
> > > +        tb_write_unlock();
> > >          tb_add_jump(last_tb, tb_exit, tb);
> > > +        tb_write_lock();
> > >      }
> > >      return tb;
> > >  }
> > > diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> > > index b7d50a73d4..1562076ffb 100644
> > > --- a/accel/tcg/translate-all.c
> > > +++ b/accel/tcg/translate-all.c
> > > @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
> > >      size_t size = tcg_ctx->code_gen_buffer_size;
> > >      void *buf;
> > >
> > > +#if defined(__APPLE__) && defined(__aarch64__)
> > > +    flags |= MAP_JIT;
> > > +#endif
> > >      buf = mmap(NULL, size, prot, flags, -1, 0);
> > >      if (buf == MAP_FAILED) {
> > >          return NULL;
> > > @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
> > >
> > >  static void tb_phys_invalidate__locked(TranslationBlock *tb)
> > >  {
> > > +    tb_write_unlock();
> > >      do_tb_phys_invalidate(tb, true);
> > > +    tb_write_lock();
> > >  }
> > >
> > >  /* invalidate one TB
> > > @@ -2722,3 +2727,24 @@ void tcg_flush_softmmu_tlb(CPUState *cs)
> > >      tlb_flush(cs);
> > >  #endif
> > >  }
> > > +
> > > +#if defined(__APPLE__) && defined(__aarch64__)
> > > +static void tb_write_protect(bool locked)
> > > +{
> > > +    if (pthread_jit_write_protect_supported_np()){
> > > +        pthread_jit_write_protect_np(locked);
> > > +    }
> > > +}
> > > +#else
> > > +static void tb_write_protect(bool locked) {}
> > > +#endif
> > > +
> > > +void tb_write_lock(void)
> > > +{
> > > +    tb_write_protect(true);
> > > +}
> > > +
> > > +void tb_write_unlock(void)
> > > +{
> > > +    tb_write_protect(false);
> > > +}
> > > diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> > > index fab573da06..962dca0975 100644
> > > --- a/include/exec/exec-all.h
> > > +++ b/include/exec/exec-all.h
> > > @@ -549,6 +549,8 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
> > >                                     target_ulong cs_base, uint32_t flags,
> > >                                     uint32_t cf_mask);
> > >  void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr);
> > > +void tb_write_lock(void);
> > > +void tb_write_unlock(void);
> > >
> > >  /* GETPC is the true target of the return instruction that we'll execute.  */
> > >  #if defined(CONFIG_TCG_INTERPRETER)
> > > diff --git a/tcg/tcg.c b/tcg/tcg.c
> > > index 43c6cf8f52..303bb436bd 100644
> > > --- a/tcg/tcg.c
> > > +++ b/tcg/tcg.c
> > > @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
> > >      s->pool_labels = NULL;
> > >  #endif
> > >
> > > +    tb_write_unlock();
> > >      /* Generate the prologue.  */
> > >      tcg_target_qemu_prologue(s);
> > >
> > > --
> > > 2.29.2
> > >
> >
> > I've also noticed that Apple doesn't worry about sticking to particular
> > W^X mode:
> >
> > https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch
> >
> > We might also drop lock/unlock symmetry from here. E.g. we can have two
> > functions that switch the mode (they might be moved to util/osdep.c):
> >
> >   qemu_jit_write();
> >   qemu_jit_execute();
> >
> > Then we use them just before writing or before executing like advised on
> > their documentation page.
> >
> > -Roman
Joelle van Dyne Jan. 3, 2021, 4:46 p.m. UTC | #5
Can you test with a low memory (-m 512) and also with single threaded
SMP (-smp 4)? Wondering if you're hitting all the edge cases because
there's oddities with cache flushing (can be done both in code gen and
code exec) and interrupt handling that caused issues for me.

-j

On Sun, Jan 3, 2021 at 6:20 AM Roman Bolshakov <r.bolshakov@yadro.com> wrote:
>
> On Sat, Jan 02, 2021 at 11:55:29AM -0800, Joelle van Dyne wrote:
> > I see you didn't touch cpu_loop_exit() and I'm curious how async
> > interrupts are handled. Have you tested this and it works i.e. booting
> > Windows 7 or Ubuntu 20.04? Also I've seen do_tb_phys_invalidate()
> > called both from code generation context (write unlocked) and
> > execution context (write locked), how does this patch differentiate
> > the two?
> >
>
> Hi Joelle,
>
> I used the following rule of thumb when finding places where exec/write
> protection has to be lifted. If it returns EXC_BAD_ACCESS under lldb and
> stack backtrace is meaningful, then a write-protected region is
> accessed. If the stack couldn't be unwinded and EXC_BAD_ACCESS is
> returned then the region has exec restrictions.
>
> With the patch I wasn't able to see any EXC_BAD_ACCESS.
>
> I've tested x86_64 Ubuntu 20.04 Desktop. It boots but it's very slow
> (but faster than TCG on x86). Windows XP is much faster and quite
> responsive. I also tested Windows 95. I'll test Win 7/Win 10 a bit
> later.
>
> I'm going to update v2 shortly and going to introduce assymetric changes of
> permissions akin to Apple's JavaScriptCore. In v2, I'm not changing
> permission back and force unless it's needed to avoid EXC_BAD_ACCESS.
>
> Regards,
> Roman
>
> > -j
> >
> > On Sat, Jan 2, 2021 at 8:13 AM Roman Bolshakov <r.bolshakov@yadro.com> wrote:
> > >
> > > On Sat, Jan 02, 2021 at 03:21:02PM +0300, Roman Bolshakov wrote:
> > > > Pages can't be both write and executable at the same time on Apple
> > > > Silicon. macOS provides public API to switch write protection [1] for
> > > > JIT applications, like TCG.
> > > >
> > > > 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
> > > >
> > > > Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> > > > ---
> > > >
> > > > Happy holidays, everyone.
> > > >
> > > > This is somewhat similar to https://patchwork.kernel.org/project/qemu-devel/patch/20201108232425.1705-7-j@getutm.app/
> > > > but I couldn't apply the series so I started from scratch.
> > > >
> > > > The primary difference from the patch above is that public API is used.
> > > > Other differences:
> > > >   * TB pages are mostly kept write-locked except around tcg_qemu_tb_exec()
> > > >   * x86_64 macOS doesn't use MAP_JIT and W^X switches
> > > >
> > > > Regards,
> > > > Roman
> > > >
> > > >  accel/tcg/cpu-exec.c      | 10 ++++++++++
> > > >  accel/tcg/translate-all.c | 26 ++++++++++++++++++++++++++
> > > >  include/exec/exec-all.h   |  2 ++
> > > >  tcg/tcg.c                 |  1 +
> > > >  4 files changed, 39 insertions(+)
> > > >
> > > > diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> > > > index 8689c54499..0042fc9f2b 100644
> > > > --- a/accel/tcg/cpu-exec.c
> > > > +++ b/accel/tcg/cpu-exec.c
> > > > @@ -175,7 +175,9 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
> > > >      }
> > > >  #endif /* DEBUG_DISAS */
> > > >
> > > > +    tb_write_lock();
> > > >      ret = tcg_qemu_tb_exec(env, tb_ptr);
> > > > +    tb_write_unlock();
> > > >      cpu->can_do_io = 1;
> > > >      last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
> > > >      tb_exit = ret & TB_EXIT_MASK;
> > > > @@ -220,9 +222,11 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
> > > >      cflags |= MIN(max_cycles, CF_COUNT_MASK);
> > > >
> > > >      mmap_lock();
> > > > +    tb_write_unlock();
> > > >      tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base,
> > > >                       orig_tb->flags, cflags);
> > > >      tb->orig_tb = orig_tb;
> > > > +    tb_write_lock();
> > > >      mmap_unlock();
> > > >
> > > >      /* execute the generated code */
> > > > @@ -268,7 +272,9 @@ void cpu_exec_step_atomic(CPUState *cpu)
> > > >          tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
> > > >          if (tb == NULL) {
> > > >              mmap_lock();
> > > > +            tb_write_unlock();
> > > >              tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
> > > > +            tb_write_lock();
> > > >              mmap_unlock();
> > > >          }
> > > >
> > > > @@ -428,7 +434,9 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
> > > >      tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
> > > >      if (tb == NULL) {
> > > >          mmap_lock();
> > > > +        tb_write_unlock();
> > > >          tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
> > > > +        tb_write_lock();
> > > >          mmap_unlock();
> > > >          /* We add the TB in the virtual pc hash table for the fast lookup */
> > > >          qatomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
> > > > @@ -444,7 +452,9 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
> > > >  #endif
> > > >      /* See if we can patch the calling TB. */
> > > >      if (last_tb) {
> > > > +        tb_write_unlock();
> > > >          tb_add_jump(last_tb, tb_exit, tb);
> > > > +        tb_write_lock();
> > > >      }
> > > >      return tb;
> > > >  }
> > > > diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> > > > index b7d50a73d4..1562076ffb 100644
> > > > --- a/accel/tcg/translate-all.c
> > > > +++ b/accel/tcg/translate-all.c
> > > > @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
> > > >      size_t size = tcg_ctx->code_gen_buffer_size;
> > > >      void *buf;
> > > >
> > > > +#if defined(__APPLE__) && defined(__aarch64__)
> > > > +    flags |= MAP_JIT;
> > > > +#endif
> > > >      buf = mmap(NULL, size, prot, flags, -1, 0);
> > > >      if (buf == MAP_FAILED) {
> > > >          return NULL;
> > > > @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
> > > >
> > > >  static void tb_phys_invalidate__locked(TranslationBlock *tb)
> > > >  {
> > > > +    tb_write_unlock();
> > > >      do_tb_phys_invalidate(tb, true);
> > > > +    tb_write_lock();
> > > >  }
> > > >
> > > >  /* invalidate one TB
> > > > @@ -2722,3 +2727,24 @@ void tcg_flush_softmmu_tlb(CPUState *cs)
> > > >      tlb_flush(cs);
> > > >  #endif
> > > >  }
> > > > +
> > > > +#if defined(__APPLE__) && defined(__aarch64__)
> > > > +static void tb_write_protect(bool locked)
> > > > +{
> > > > +    if (pthread_jit_write_protect_supported_np()){
> > > > +        pthread_jit_write_protect_np(locked);
> > > > +    }
> > > > +}
> > > > +#else
> > > > +static void tb_write_protect(bool locked) {}
> > > > +#endif
> > > > +
> > > > +void tb_write_lock(void)
> > > > +{
> > > > +    tb_write_protect(true);
> > > > +}
> > > > +
> > > > +void tb_write_unlock(void)
> > > > +{
> > > > +    tb_write_protect(false);
> > > > +}
> > > > diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> > > > index fab573da06..962dca0975 100644
> > > > --- a/include/exec/exec-all.h
> > > > +++ b/include/exec/exec-all.h
> > > > @@ -549,6 +549,8 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
> > > >                                     target_ulong cs_base, uint32_t flags,
> > > >                                     uint32_t cf_mask);
> > > >  void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr);
> > > > +void tb_write_lock(void);
> > > > +void tb_write_unlock(void);
> > > >
> > > >  /* GETPC is the true target of the return instruction that we'll execute.  */
> > > >  #if defined(CONFIG_TCG_INTERPRETER)
> > > > diff --git a/tcg/tcg.c b/tcg/tcg.c
> > > > index 43c6cf8f52..303bb436bd 100644
> > > > --- a/tcg/tcg.c
> > > > +++ b/tcg/tcg.c
> > > > @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
> > > >      s->pool_labels = NULL;
> > > >  #endif
> > > >
> > > > +    tb_write_unlock();
> > > >      /* Generate the prologue.  */
> > > >      tcg_target_qemu_prologue(s);
> > > >
> > > > --
> > > > 2.29.2
> > > >
> > >
> > > I've also noticed that Apple doesn't worry about sticking to particular
> > > W^X mode:
> > >
> > > https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch
> > >
> > > We might also drop lock/unlock symmetry from here. E.g. we can have two
> > > functions that switch the mode (they might be moved to util/osdep.c):
> > >
> > >   qemu_jit_write();
> > >   qemu_jit_execute();
> > >
> > > Then we use them just before writing or before executing like advised on
> > > their documentation page.
> > >
> > > -Roman
Roman Bolshakov Jan. 3, 2021, 10:54 p.m. UTC | #6
On Sun, Jan 03, 2021 at 08:46:27AM -0800, Joelle van Dyne wrote:
> Can you test with a low memory (-m 512) and also with single threaded
> SMP (-smp 4)? Wondering if you're hitting all the edge cases because
> there's oddities with cache flushing (can be done both in code gen and
> code exec) and interrupt handling that caused issues for me.
> 

I tested XP with default memory (128m) and -m 512. I did run Ubuntu with with
-smp 1/2/4 and multiple variants of memory (2048,4096). I've just
installed Windows 10 and it somehow works noticably faster than Ubuntu
20.04 (what makes me wonder why Ubuntu 20.04 peforms worse).

But you know, I've noticed that MTTCG is disabled by default on arm
hosts, so -smp 4 has no effect (it should print a warning IMO that smp
is noop, or even quit from qemu to disallow single-threaded TCG and -smp
flag with a value beyond 1).

If I try to enable MTTCG, I get a warning from QEMU and only one CPU
inside VM (according to Windows Task Manager).

$ build/qemu-system-x86_64 -cpu Nehalem -accel tcg,thread=multi -smp 4 -m 4096 -hda ~/vms/win10.qcow2

qemu-system-x86_64: -accel tcg,thread=multi: warning: Guest expects a stronger memory ordering than the host provides
This may cause strange/hard to debug errors

As far as I understand from the ticket below this is intentional:
https://bugs.launchpad.net/qemu/+bug/1824768

> There aren't many people overall who want to try to run emulation on
> anything other than x86 host.

Perhaps we could enable MTTCG by enabling TSO in M1 like it's done in
Rosetta to avoid performance overheads of implicit barriers?

BTW, I wonder if you tried my patch? Do you hit the mentioned issues?

With regards to do_tb_phys_invalidate(), the function doesn't care
about whether it was write or exec locked. It needs write permissions
at least for TB spin lock. And something after return from
tb_phys_invalidate() needs exec permssions. I can try to find "that
something" and move change of permissions to rx closer to the place that
needs exec permissions. And then, move change of permissions to rw
inside do_tb_phys_invalidate() just before TB spin lock is acquired.

Regards,
Roman

> -j
> 
> On Sun, Jan 3, 2021 at 6:20 AM Roman Bolshakov <r.bolshakov@yadro.com> wrote:
> >
> > On Sat, Jan 02, 2021 at 11:55:29AM -0800, Joelle van Dyne wrote:
> > > I see you didn't touch cpu_loop_exit() and I'm curious how async
> > > interrupts are handled. Have you tested this and it works i.e. booting
> > > Windows 7 or Ubuntu 20.04? Also I've seen do_tb_phys_invalidate()
> > > called both from code generation context (write unlocked) and
> > > execution context (write locked), how does this patch differentiate
> > > the two?
> > >
> >
> > Hi Joelle,
> >
> > I used the following rule of thumb when finding places where exec/write
> > protection has to be lifted. If it returns EXC_BAD_ACCESS under lldb and
> > stack backtrace is meaningful, then a write-protected region is
> > accessed. If the stack couldn't be unwinded and EXC_BAD_ACCESS is
> > returned then the region has exec restrictions.
> >
> > With the patch I wasn't able to see any EXC_BAD_ACCESS.
> >
> > I've tested x86_64 Ubuntu 20.04 Desktop. It boots but it's very slow
> > (but faster than TCG on x86). Windows XP is much faster and quite
> > responsive. I also tested Windows 95. I'll test Win 7/Win 10 a bit
> > later.
> >
> > I'm going to update v2 shortly and going to introduce assymetric changes of
> > permissions akin to Apple's JavaScriptCore. In v2, I'm not changing
> > permission back and force unless it's needed to avoid EXC_BAD_ACCESS.
> >
> > Regards,
> > Roman
> >
> > > -j
> > >
> > > On Sat, Jan 2, 2021 at 8:13 AM Roman Bolshakov <r.bolshakov@yadro.com> wrote:
> > > >
> > > > On Sat, Jan 02, 2021 at 03:21:02PM +0300, Roman Bolshakov wrote:
> > > > > Pages can't be both write and executable at the same time on Apple
> > > > > Silicon. macOS provides public API to switch write protection [1] for
> > > > > JIT applications, like TCG.
> > > > >
> > > > > 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
> > > > >
> > > > > Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> > > > > ---
> > > > >
> > > > > Happy holidays, everyone.
> > > > >
> > > > > This is somewhat similar to https://patchwork.kernel.org/project/qemu-devel/patch/20201108232425.1705-7-j@getutm.app/
> > > > > but I couldn't apply the series so I started from scratch.
> > > > >
> > > > > The primary difference from the patch above is that public API is used.
> > > > > Other differences:
> > > > >   * TB pages are mostly kept write-locked except around tcg_qemu_tb_exec()
> > > > >   * x86_64 macOS doesn't use MAP_JIT and W^X switches
> > > > >
> > > > > Regards,
> > > > > Roman
> > > > >
> > > > >  accel/tcg/cpu-exec.c      | 10 ++++++++++
> > > > >  accel/tcg/translate-all.c | 26 ++++++++++++++++++++++++++
> > > > >  include/exec/exec-all.h   |  2 ++
> > > > >  tcg/tcg.c                 |  1 +
> > > > >  4 files changed, 39 insertions(+)
> > > > >
> > > > > diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> > > > > index 8689c54499..0042fc9f2b 100644
> > > > > --- a/accel/tcg/cpu-exec.c
> > > > > +++ b/accel/tcg/cpu-exec.c
> > > > > @@ -175,7 +175,9 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
> > > > >      }
> > > > >  #endif /* DEBUG_DISAS */
> > > > >
> > > > > +    tb_write_lock();
> > > > >      ret = tcg_qemu_tb_exec(env, tb_ptr);
> > > > > +    tb_write_unlock();
> > > > >      cpu->can_do_io = 1;
> > > > >      last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
> > > > >      tb_exit = ret & TB_EXIT_MASK;
> > > > > @@ -220,9 +222,11 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
> > > > >      cflags |= MIN(max_cycles, CF_COUNT_MASK);
> > > > >
> > > > >      mmap_lock();
> > > > > +    tb_write_unlock();
> > > > >      tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base,
> > > > >                       orig_tb->flags, cflags);
> > > > >      tb->orig_tb = orig_tb;
> > > > > +    tb_write_lock();
> > > > >      mmap_unlock();
> > > > >
> > > > >      /* execute the generated code */
> > > > > @@ -268,7 +272,9 @@ void cpu_exec_step_atomic(CPUState *cpu)
> > > > >          tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
> > > > >          if (tb == NULL) {
> > > > >              mmap_lock();
> > > > > +            tb_write_unlock();
> > > > >              tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
> > > > > +            tb_write_lock();
> > > > >              mmap_unlock();
> > > > >          }
> > > > >
> > > > > @@ -428,7 +434,9 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
> > > > >      tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
> > > > >      if (tb == NULL) {
> > > > >          mmap_lock();
> > > > > +        tb_write_unlock();
> > > > >          tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
> > > > > +        tb_write_lock();
> > > > >          mmap_unlock();
> > > > >          /* We add the TB in the virtual pc hash table for the fast lookup */
> > > > >          qatomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
> > > > > @@ -444,7 +452,9 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
> > > > >  #endif
> > > > >      /* See if we can patch the calling TB. */
> > > > >      if (last_tb) {
> > > > > +        tb_write_unlock();
> > > > >          tb_add_jump(last_tb, tb_exit, tb);
> > > > > +        tb_write_lock();
> > > > >      }
> > > > >      return tb;
> > > > >  }
> > > > > diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> > > > > index b7d50a73d4..1562076ffb 100644
> > > > > --- a/accel/tcg/translate-all.c
> > > > > +++ b/accel/tcg/translate-all.c
> > > > > @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
> > > > >      size_t size = tcg_ctx->code_gen_buffer_size;
> > > > >      void *buf;
> > > > >
> > > > > +#if defined(__APPLE__) && defined(__aarch64__)
> > > > > +    flags |= MAP_JIT;
> > > > > +#endif
> > > > >      buf = mmap(NULL, size, prot, flags, -1, 0);
> > > > >      if (buf == MAP_FAILED) {
> > > > >          return NULL;
> > > > > @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
> > > > >
> > > > >  static void tb_phys_invalidate__locked(TranslationBlock *tb)
> > > > >  {
> > > > > +    tb_write_unlock();
> > > > >      do_tb_phys_invalidate(tb, true);
> > > > > +    tb_write_lock();
> > > > >  }
> > > > >
> > > > >  /* invalidate one TB
> > > > > @@ -2722,3 +2727,24 @@ void tcg_flush_softmmu_tlb(CPUState *cs)
> > > > >      tlb_flush(cs);
> > > > >  #endif
> > > > >  }
> > > > > +
> > > > > +#if defined(__APPLE__) && defined(__aarch64__)
> > > > > +static void tb_write_protect(bool locked)
> > > > > +{
> > > > > +    if (pthread_jit_write_protect_supported_np()){
> > > > > +        pthread_jit_write_protect_np(locked);
> > > > > +    }
> > > > > +}
> > > > > +#else
> > > > > +static void tb_write_protect(bool locked) {}
> > > > > +#endif
> > > > > +
> > > > > +void tb_write_lock(void)
> > > > > +{
> > > > > +    tb_write_protect(true);
> > > > > +}
> > > > > +
> > > > > +void tb_write_unlock(void)
> > > > > +{
> > > > > +    tb_write_protect(false);
> > > > > +}
> > > > > diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> > > > > index fab573da06..962dca0975 100644
> > > > > --- a/include/exec/exec-all.h
> > > > > +++ b/include/exec/exec-all.h
> > > > > @@ -549,6 +549,8 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
> > > > >                                     target_ulong cs_base, uint32_t flags,
> > > > >                                     uint32_t cf_mask);
> > > > >  void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr);
> > > > > +void tb_write_lock(void);
> > > > > +void tb_write_unlock(void);
> > > > >
> > > > >  /* GETPC is the true target of the return instruction that we'll execute.  */
> > > > >  #if defined(CONFIG_TCG_INTERPRETER)
> > > > > diff --git a/tcg/tcg.c b/tcg/tcg.c
> > > > > index 43c6cf8f52..303bb436bd 100644
> > > > > --- a/tcg/tcg.c
> > > > > +++ b/tcg/tcg.c
> > > > > @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
> > > > >      s->pool_labels = NULL;
> > > > >  #endif
> > > > >
> > > > > +    tb_write_unlock();
> > > > >      /* Generate the prologue.  */
> > > > >      tcg_target_qemu_prologue(s);
> > > > >
> > > > > --
> > > > > 2.29.2
> > > > >
> > > >
> > > > I've also noticed that Apple doesn't worry about sticking to particular
> > > > W^X mode:
> > > >
> > > > https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch
> > > >
> > > > We might also drop lock/unlock symmetry from here. E.g. we can have two
> > > > functions that switch the mode (they might be moved to util/osdep.c):
> > > >
> > > >   qemu_jit_write();
> > > >   qemu_jit_execute();
> > > >
> > > > Then we use them just before writing or before executing like advised on
> > > > their documentation page.
> > > >
> > > > -Roman
Joelle van Dyne Jan. 4, 2021, 1:47 a.m. UTC | #7
The MTTCG issue is known and I’ve only used multiple core emulation with
single core.

I haven’t tried your patch yet but I plan to when I get time next week. I’m
surprised because it’s (your v1 patchset) exactly what I had originally
before hitting crashes and having to add the calls to more places. But
perhaps something changed between QEMU 4.2.0 and 5.2.0 that made things
work more “as expected”. If so then I plan to undo the changes I’ve added.

-j

On Sunday, January 3, 2021, Roman Bolshakov <r.bolshakov@yadro.com> wrote:

> On Sun, Jan 03, 2021 at 08:46:27AM -0800, Joelle van Dyne wrote:
> > Can you test with a low memory (-m 512) and also with single threaded
> > SMP (-smp 4)? Wondering if you're hitting all the edge cases because
> > there's oddities with cache flushing (can be done both in code gen and
> > code exec) and interrupt handling that caused issues for me.
> >
>
> I tested XP with default memory (128m) and -m 512. I did run Ubuntu with
> with
> -smp 1/2/4 and multiple variants of memory (2048,4096). I've just
> installed Windows 10 and it somehow works noticably faster than Ubuntu
> 20.04 (what makes me wonder why Ubuntu 20.04 peforms worse).
>
> But you know, I've noticed that MTTCG is disabled by default on arm
> hosts, so -smp 4 has no effect (it should print a warning IMO that smp
> is noop, or even quit from qemu to disallow single-threaded TCG and -smp
> flag with a value beyond 1).
>
> If I try to enable MTTCG, I get a warning from QEMU and only one CPU
> inside VM (according to Windows Task Manager).
>
> $ build/qemu-system-x86_64 -cpu Nehalem -accel tcg,thread=multi -smp 4 -m
> 4096 -hda ~/vms/win10.qcow2
>
> qemu-system-x86_64: -accel tcg,thread=multi: warning: Guest expects a
> stronger memory ordering than the host provides
> This may cause strange/hard to debug errors
>
> As far as I understand from the ticket below this is intentional:
> https://bugs.launchpad.net/qemu/+bug/1824768
>
> > There aren't many people overall who want to try to run emulation on
> > anything other than x86 host.
>
> Perhaps we could enable MTTCG by enabling TSO in M1 like it's done in
> Rosetta to avoid performance overheads of implicit barriers?
>
> BTW, I wonder if you tried my patch? Do you hit the mentioned issues?
>
> With regards to do_tb_phys_invalidate(), the function doesn't care
> about whether it was write or exec locked. It needs write permissions
> at least for TB spin lock. And something after return from
> tb_phys_invalidate() needs exec permssions. I can try to find "that
> something" and move change of permissions to rx closer to the place that
> needs exec permissions. And then, move change of permissions to rw
> inside do_tb_phys_invalidate() just before TB spin lock is acquired.
>
> Regards,
> Roman
>
> > -j
> >
> > On Sun, Jan 3, 2021 at 6:20 AM Roman Bolshakov <r.bolshakov@yadro.com>
> wrote:
> > >
> > > On Sat, Jan 02, 2021 at 11:55:29AM -0800, Joelle van Dyne wrote:
> > > > I see you didn't touch cpu_loop_exit() and I'm curious how async
> > > > interrupts are handled. Have you tested this and it works i.e.
> booting
> > > > Windows 7 or Ubuntu 20.04? Also I've seen do_tb_phys_invalidate()
> > > > called both from code generation context (write unlocked) and
> > > > execution context (write locked), how does this patch differentiate
> > > > the two?
> > > >
> > >
> > > Hi Joelle,
> > >
> > > I used the following rule of thumb when finding places where exec/write
> > > protection has to be lifted. If it returns EXC_BAD_ACCESS under lldb
> and
> > > stack backtrace is meaningful, then a write-protected region is
> > > accessed. If the stack couldn't be unwinded and EXC_BAD_ACCESS is
> > > returned then the region has exec restrictions.
> > >
> > > With the patch I wasn't able to see any EXC_BAD_ACCESS.
> > >
> > > I've tested x86_64 Ubuntu 20.04 Desktop. It boots but it's very slow
> > > (but faster than TCG on x86). Windows XP is much faster and quite
> > > responsive. I also tested Windows 95. I'll test Win 7/Win 10 a bit
> > > later.
> > >
> > > I'm going to update v2 shortly and going to introduce assymetric
> changes of
> > > permissions akin to Apple's JavaScriptCore. In v2, I'm not changing
> > > permission back and force unless it's needed to avoid EXC_BAD_ACCESS.
> > >
> > > Regards,
> > > Roman
> > >
> > > > -j
> > > >
> > > > On Sat, Jan 2, 2021 at 8:13 AM Roman Bolshakov <
> r.bolshakov@yadro.com> wrote:
> > > > >
> > > > > On Sat, Jan 02, 2021 at 03:21:02PM +0300, Roman Bolshakov wrote:
> > > > > > Pages can't be both write and executable at the same time on
> Apple
> > > > > > Silicon. macOS provides public API to switch write protection
> [1] for
> > > > > > JIT applications, like TCG.
> > > > > >
> > > > > > 1. https://developer.apple.com/documentation/apple_silicon/
> porting_just-in-time_compilers_to_apple_silicon
> > > > > >
> > > > > > Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> > > > > > ---
> > > > > >
> > > > > > Happy holidays, everyone.
> > > > > >
> > > > > > This is somewhat similar to https://patchwork.kernel.org/
> project/qemu-devel/patch/20201108232425.1705-7-j@getutm.app/
> > > > > > but I couldn't apply the series so I started from scratch.
> > > > > >
> > > > > > The primary difference from the patch above is that public API
> is used.
> > > > > > Other differences:
> > > > > >   * TB pages are mostly kept write-locked except around
> tcg_qemu_tb_exec()
> > > > > >   * x86_64 macOS doesn't use MAP_JIT and W^X switches
> > > > > >
> > > > > > Regards,
> > > > > > Roman
> > > > > >
> > > > > >  accel/tcg/cpu-exec.c      | 10 ++++++++++
> > > > > >  accel/tcg/translate-all.c | 26 ++++++++++++++++++++++++++
> > > > > >  include/exec/exec-all.h   |  2 ++
> > > > > >  tcg/tcg.c                 |  1 +
> > > > > >  4 files changed, 39 insertions(+)
> > > > > >
> > > > > > diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> > > > > > index 8689c54499..0042fc9f2b 100644
> > > > > > --- a/accel/tcg/cpu-exec.c
> > > > > > +++ b/accel/tcg/cpu-exec.c
> > > > > > @@ -175,7 +175,9 @@ static inline tcg_target_ulong
> cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
> > > > > >      }
> > > > > >  #endif /* DEBUG_DISAS */
> > > > > >
> > > > > > +    tb_write_lock();
> > > > > >      ret = tcg_qemu_tb_exec(env, tb_ptr);
> > > > > > +    tb_write_unlock();
> > > > > >      cpu->can_do_io = 1;
> > > > > >      last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
> > > > > >      tb_exit = ret & TB_EXIT_MASK;
> > > > > > @@ -220,9 +222,11 @@ static void cpu_exec_nocache(CPUState *cpu,
> int max_cycles,
> > > > > >      cflags |= MIN(max_cycles, CF_COUNT_MASK);
> > > > > >
> > > > > >      mmap_lock();
> > > > > > +    tb_write_unlock();
> > > > > >      tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base,
> > > > > >                       orig_tb->flags, cflags);
> > > > > >      tb->orig_tb = orig_tb;
> > > > > > +    tb_write_lock();
> > > > > >      mmap_unlock();
> > > > > >
> > > > > >      /* execute the generated code */
> > > > > > @@ -268,7 +272,9 @@ void cpu_exec_step_atomic(CPUState *cpu)
> > > > > >          tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags,
> cf_mask);
> > > > > >          if (tb == NULL) {
> > > > > >              mmap_lock();
> > > > > > +            tb_write_unlock();
> > > > > >              tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
> > > > > > +            tb_write_lock();
> > > > > >              mmap_unlock();
> > > > > >          }
> > > > > >
> > > > > > @@ -428,7 +434,9 @@ static inline TranslationBlock
> *tb_find(CPUState *cpu,
> > > > > >      tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags,
> cf_mask);
> > > > > >      if (tb == NULL) {
> > > > > >          mmap_lock();
> > > > > > +        tb_write_unlock();
> > > > > >          tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
> > > > > > +        tb_write_lock();
> > > > > >          mmap_unlock();
> > > > > >          /* We add the TB in the virtual pc hash table for the
> fast lookup */
> > > > > >          qatomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)],
> tb);
> > > > > > @@ -444,7 +452,9 @@ static inline TranslationBlock
> *tb_find(CPUState *cpu,
> > > > > >  #endif
> > > > > >      /* See if we can patch the calling TB. */
> > > > > >      if (last_tb) {
> > > > > > +        tb_write_unlock();
> > > > > >          tb_add_jump(last_tb, tb_exit, tb);
> > > > > > +        tb_write_lock();
> > > > > >      }
> > > > > >      return tb;
> > > > > >  }
> > > > > > diff --git a/accel/tcg/translate-all.c
> b/accel/tcg/translate-all.c
> > > > > > index b7d50a73d4..1562076ffb 100644
> > > > > > --- a/accel/tcg/translate-all.c
> > > > > > +++ b/accel/tcg/translate-all.c
> > > > > > @@ -1072,6 +1072,9 @@ static inline void
> *alloc_code_gen_buffer(void)
> > > > > >      size_t size = tcg_ctx->code_gen_buffer_size;
> > > > > >      void *buf;
> > > > > >
> > > > > > +#if defined(__APPLE__) && defined(__aarch64__)
> > > > > > +    flags |= MAP_JIT;
> > > > > > +#endif
> > > > > >      buf = mmap(NULL, size, prot, flags, -1, 0);
> > > > > >      if (buf == MAP_FAILED) {
> > > > > >          return NULL;
> > > > > > @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock
> *tb, bool rm_from_page_list)
> > > > > >
> > > > > >  static void tb_phys_invalidate__locked(TranslationBlock *tb)
> > > > > >  {
> > > > > > +    tb_write_unlock();
> > > > > >      do_tb_phys_invalidate(tb, true);
> > > > > > +    tb_write_lock();
> > > > > >  }
> > > > > >
> > > > > >  /* invalidate one TB
> > > > > > @@ -2722,3 +2727,24 @@ void tcg_flush_softmmu_tlb(CPUState *cs)
> > > > > >      tlb_flush(cs);
> > > > > >  #endif
> > > > > >  }
> > > > > > +
> > > > > > +#if defined(__APPLE__) && defined(__aarch64__)
> > > > > > +static void tb_write_protect(bool locked)
> > > > > > +{
> > > > > > +    if (pthread_jit_write_protect_supported_np()){
> > > > > > +        pthread_jit_write_protect_np(locked);
> > > > > > +    }
> > > > > > +}
> > > > > > +#else
> > > > > > +static void tb_write_protect(bool locked) {}
> > > > > > +#endif
> > > > > > +
> > > > > > +void tb_write_lock(void)
> > > > > > +{
> > > > > > +    tb_write_protect(true);
> > > > > > +}
> > > > > > +
> > > > > > +void tb_write_unlock(void)
> > > > > > +{
> > > > > > +    tb_write_protect(false);
> > > > > > +}
> > > > > > diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> > > > > > index fab573da06..962dca0975 100644
> > > > > > --- a/include/exec/exec-all.h
> > > > > > +++ b/include/exec/exec-all.h
> > > > > > @@ -549,6 +549,8 @@ TranslationBlock *tb_htable_lookup(CPUState
> *cpu, target_ulong pc,
> > > > > >                                     target_ulong cs_base,
> uint32_t flags,
> > > > > >                                     uint32_t cf_mask);
> > > > > >  void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t
> addr);
> > > > > > +void tb_write_lock(void);
> > > > > > +void tb_write_unlock(void);
> > > > > >
> > > > > >  /* GETPC is the true target of the return instruction that
> we'll execute.  */
> > > > > >  #if defined(CONFIG_TCG_INTERPRETER)
> > > > > > diff --git a/tcg/tcg.c b/tcg/tcg.c
> > > > > > index 43c6cf8f52..303bb436bd 100644
> > > > > > --- a/tcg/tcg.c
> > > > > > +++ b/tcg/tcg.c
> > > > > > @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
> > > > > >      s->pool_labels = NULL;
> > > > > >  #endif
> > > > > >
> > > > > > +    tb_write_unlock();
> > > > > >      /* Generate the prologue.  */
> > > > > >      tcg_target_qemu_prologue(s);
> > > > > >
> > > > > > --
> > > > > > 2.29.2
> > > > > >
> > > > >
> > > > > I've also noticed that Apple doesn't worry about sticking to
> particular
> > > > > W^X mode:
> > > > >
> > > > > https://bugs.webkit.org/attachment.cgi?id=402515&
> action=prettypatch
> > > > >
> > > > > We might also drop lock/unlock symmetry from here. E.g. we can
> have two
> > > > > functions that switch the mode (they might be moved to
> util/osdep.c):
> > > > >
> > > > >   qemu_jit_write();
> > > > >   qemu_jit_execute();
> > > > >
> > > > > Then we use them just before writing or before executing like
> advised on
> > > > > their documentation page.
> > > > >
> > > > > -Roman
>
diff mbox series

Patch

diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index 8689c54499..0042fc9f2b 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -175,7 +175,9 @@  static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
     }
 #endif /* DEBUG_DISAS */
 
+    tb_write_lock();
     ret = tcg_qemu_tb_exec(env, tb_ptr);
+    tb_write_unlock();
     cpu->can_do_io = 1;
     last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
     tb_exit = ret & TB_EXIT_MASK;
@@ -220,9 +222,11 @@  static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
     cflags |= MIN(max_cycles, CF_COUNT_MASK);
 
     mmap_lock();
+    tb_write_unlock();
     tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base,
                      orig_tb->flags, cflags);
     tb->orig_tb = orig_tb;
+    tb_write_lock();
     mmap_unlock();
 
     /* execute the generated code */
@@ -268,7 +272,9 @@  void cpu_exec_step_atomic(CPUState *cpu)
         tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
         if (tb == NULL) {
             mmap_lock();
+            tb_write_unlock();
             tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
+            tb_write_lock();
             mmap_unlock();
         }
 
@@ -428,7 +434,9 @@  static inline TranslationBlock *tb_find(CPUState *cpu,
     tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
     if (tb == NULL) {
         mmap_lock();
+        tb_write_unlock();
         tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
+        tb_write_lock();
         mmap_unlock();
         /* We add the TB in the virtual pc hash table for the fast lookup */
         qatomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
@@ -444,7 +452,9 @@  static inline TranslationBlock *tb_find(CPUState *cpu,
 #endif
     /* See if we can patch the calling TB. */
     if (last_tb) {
+        tb_write_unlock();
         tb_add_jump(last_tb, tb_exit, tb);
+        tb_write_lock();
     }
     return tb;
 }
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index b7d50a73d4..1562076ffb 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -1072,6 +1072,9 @@  static inline void *alloc_code_gen_buffer(void)
     size_t size = tcg_ctx->code_gen_buffer_size;
     void *buf;
 
+#if defined(__APPLE__) && defined(__aarch64__)
+    flags |= MAP_JIT;
+#endif
     buf = mmap(NULL, size, prot, flags, -1, 0);
     if (buf == MAP_FAILED) {
         return NULL;
@@ -1485,7 +1488,9 @@  static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
 
 static void tb_phys_invalidate__locked(TranslationBlock *tb)
 {
+    tb_write_unlock();
     do_tb_phys_invalidate(tb, true);
+    tb_write_lock();
 }
 
 /* invalidate one TB
@@ -2722,3 +2727,24 @@  void tcg_flush_softmmu_tlb(CPUState *cs)
     tlb_flush(cs);
 #endif
 }
+
+#if defined(__APPLE__) && defined(__aarch64__)
+static void tb_write_protect(bool locked)
+{
+    if (pthread_jit_write_protect_supported_np()){
+        pthread_jit_write_protect_np(locked);
+    }
+}
+#else
+static void tb_write_protect(bool locked) {}
+#endif
+
+void tb_write_lock(void)
+{
+    tb_write_protect(true);
+}
+
+void tb_write_unlock(void)
+{
+    tb_write_protect(false);
+}
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index fab573da06..962dca0975 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -549,6 +549,8 @@  TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
                                    target_ulong cs_base, uint32_t flags,
                                    uint32_t cf_mask);
 void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr);
+void tb_write_lock(void);
+void tb_write_unlock(void);
 
 /* GETPC is the true target of the return instruction that we'll execute.  */
 #if defined(CONFIG_TCG_INTERPRETER)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 43c6cf8f52..303bb436bd 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1065,6 +1065,7 @@  void tcg_prologue_init(TCGContext *s)
     s->pool_labels = NULL;
 #endif
 
+    tb_write_unlock();
     /* Generate the prologue.  */
     tcg_target_qemu_prologue(s);