tcg: Record code_gen_buffer address for user-only memory helpers

Message ID 20171114094203.28030-1-richard.henderson@linaro.org
State New
Headers show
Series
  • tcg: Record code_gen_buffer address for user-only memory helpers
Related show

Commit Message

Richard Henderson Nov. 14, 2017, 9:42 a.m.
When we handle a signal from a fault within a user-only memory helper,
we cannot cpu_restore_state with the PC found within the signal frame.
Use a TLS variable, helper_retaddr, to record the unwind start point
to find the faulting guest insn.

Reported-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---

Tested only with a silly test case --

int main()
{
  int new = 1, old = 0;
  __atomic_compare_exchange((int *)0, &old, &new, 0, 0, 0);
  return old;
}

which even before the patch does not fail in the way Peter describes.

As I post this, I remember in theory we should use __atomic_signal_fence
after setting helper_retaddr, but as far as I know this is a no-op on all
supported hosts.  It might still generate a compiler barrier though, so it's
worth considering.


r~
---

 accel/tcg/atomic_template.h               | 32 +++++++++++++----
 include/exec/cpu_ldst.h                   |  2 ++
 include/exec/cpu_ldst_useronly_template.h | 14 ++++++--
 accel/tcg/cputlb.c                        |  1 +
 accel/tcg/user-exec.c                     | 58 +++++++++++++++++++++++++------
 5 files changed, 87 insertions(+), 20 deletions(-)

Comments

Peter Maydell Nov. 14, 2017, 12:01 p.m. | #1
On 14 November 2017 at 09:42, Richard Henderson
<richard.henderson@linaro.org> wrote:
> When we handle a signal from a fault within a user-only memory helper,
> we cannot cpu_restore_state with the PC found within the signal frame.
> Use a TLS variable, helper_retaddr, to record the unwind start point
> to find the faulting guest insn.
>
> Reported-by: Peter Maydell <peter.maydell@linaro.org>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>
> Tested only with a silly test case --
>
> int main()
> {
>   int new = 1, old = 0;
>   __atomic_compare_exchange((int *)0, &old, &new, 0, 0, 0);
>   return old;
> }
>
> which even before the patch does not fail in the way Peter describes.

Probably because the 32-bit atomic compare gets done inline, rather than
via a helper. Here's a test case which does fail:

===begin===
/* segv on atomic access: check we report correct PC
 *
 * Build with:
 *  aarch64-linux-gnu-gcc -g -Wall -static -o segv-atomic segv-atomic.c
 *
 * Copyright (c) 2017 Linaro Ltd
 * License: 2-clause BSD.
 */

#include <stdlib.h>
#include <signal.h>
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <sys/ucontext.h>
#include <stdint.h>
#include <inttypes.h>

static int sigcount = 0;
static unsigned char *mem = 0;

void sighandler(int s, siginfo_t *info, void *puc)
{
    struct ucontext *uc = puc;
    uint64_t pc = uc->uc_mcontext.pc;
    uint64_t faultaddr = uc->uc_mcontext.fault_address;

    printf("in sighandler, pc = %" PRIx64 " faultaddr %" PRIx64 "\n",
           pc, faultaddr);
    sigcount++;
    if (sigcount == 2) {
       printf("too many signals\n");
       exit(1);
    }
    /* Advance past the insn */
    uc->uc_mcontext.pc += 4;
}

int main(void)
{
    struct sigaction sa;
    int x;

    mem = mmap(0, 4096, PROT_READ | PROT_WRITE,
               MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    printf("memory at %p\n", mem);

    memset(&sa, 0, sizeof(sa));
    sa.sa_sigaction = sighandler;
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &sa, NULL);

    memset(mem, 42, 4096);

    mprotect(mem, 4096, PROT_READ);

    /* This should fault the first time around, and then
     * the segv handler will advance the PC by 4 to skip the insn.
     * We surround it with some movs so we know the insn
     * is not at the start of a TB and so we can tell if we
     * correctly skipped it.
     */
    printf("Trying atomic stxp...\n");
    __asm__ volatile("mov %[x], #1\n"
                     "ldxp x4, x3, [%[addr]]\n"
                     "mov %[x], #2\n"
                     "stxp w5, x4, x3, [%[addr]]\n"
                     "mov %[x], #3\n"
                     : [x] "=&r" (x)
                     : [addr] "r" (mem)
                     : "w5", "x4", "x3", "memory");

    printf("...done, with x %d\n", x);
    return 0;
}
===endit===

On real hardware:
pm215@gcc115:~$ ./segv-atomic
memory at 0x7f99810000
Trying atomic stlxr...
in sighandler, pc = 4007f0 faultaddr 7f99810000
...done, with x 3

On QEMU linux-user:
$ ~/linaro/qemu-from-laptop/qemu/build/all-linux-static/aarch64-linux-user/qemu-aarch64
-d in_asm,out_asm,cpu,exec,int -D /tmp/atomic.log ./segv-atomic
memory at 0x4000801000
Trying atomic stlxr...
in sighandler, pc = 4007d8 faultaddr 4000801000
in sighandler, pc = 4007e0 faultaddr a32
too many signals

and the log file shows that the relevant TB is:

0x004007d8:  d0000460  adrp     x0, #0x48e000
0x004007dc:  9128c000  add      x0, x0, #0xa30
0x004007e0:  f9400001  ldr      x1, [x0]
0x004007e4:  d2800020  movz     x0, #0x1
0x004007e8:  c87f0c24  ldxp     x4, x3, [x1]
0x004007ec:  d2800040  movz     x0, #0x2
0x004007f0:  c8250c24  stxp     w5, x4, x3, [x1]
0x004007f4:  d2800060  movz     x0, #0x3
0x004007f8:  b9001fa0  str      w0, [x29, #0x1c]
0x004007fc:  f00002e0  adrp     x0, #0x45f000
0x00400800:  91044000  add      x0, x0, #0x110
0x00400804:  b9401fa1  ldr      w1, [x29, #0x1c]
0x00400808:  940018b6  bl       #0x406ae0

so the stxp faults but the PC reported to the handler is the start
of the TB, at which point it goes off into the weeds because we
restart in the wrong place.

Unfortunately it still fails even with your patch...haven't
investigated why yet.

thanks
-- PMM
Alex Bennée Nov. 14, 2017, 4:09 p.m. | #2
Richard Henderson <richard.henderson@linaro.org> writes:

> When we handle a signal from a fault within a user-only memory helper,
> we cannot cpu_restore_state with the PC found within the signal frame.
> Use a TLS variable, helper_retaddr, to record the unwind start point
> to find the faulting guest insn.
>
> Reported-by: Peter Maydell <peter.maydell@linaro.org>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>
> Tested only with a silly test case --
>
> int main()
> {
>   int new = 1, old = 0;
>   __atomic_compare_exchange((int *)0, &old, &new, 0, 0, 0);
>   return old;
> }
>
> which even before the patch does not fail in the way Peter describes.
>
> As I post this, I remember in theory we should use __atomic_signal_fence
> after setting helper_retaddr, but as far as I know this is a no-op on all
> supported hosts.  It might still generate a compiler barrier though, so it's
> worth considering.
>
>
> r~
> ---
>
>  accel/tcg/atomic_template.h               | 32 +++++++++++++----
>  include/exec/cpu_ldst.h                   |  2 ++
>  include/exec/cpu_ldst_useronly_template.h | 14 ++++++--
>  accel/tcg/cputlb.c                        |  1 +
>  accel/tcg/user-exec.c                     | 58 +++++++++++++++++++++++++------
>  5 files changed, 87 insertions(+), 20 deletions(-)
>
> diff --git a/accel/tcg/atomic_template.h b/accel/tcg/atomic_template.h
> index b400b2a3d3..1c7c17526c 100644
> --- a/accel/tcg/atomic_template.h
> +++ b/accel/tcg/atomic_template.h
> @@ -62,7 +62,9 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
>                                ABI_TYPE cmpv, ABI_TYPE newv EXTRA_ARGS)
>  {
>      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
> -    return atomic_cmpxchg__nocheck(haddr, cmpv, newv);
> +    DATA_TYPE ret = atomic_cmpxchg__nocheck(haddr, cmpv, newv);
> +    ATOMIC_MMU_CLEANUP;
> +    return ret;
>  }
>
>  #if DATA_SIZE >= 16
> @@ -70,6 +72,7 @@ ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
>  {
>      DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
>      __atomic_load(haddr, &val, __ATOMIC_RELAXED);
> +    ATOMIC_MMU_CLEANUP;
>      return val;
>  }
>
> @@ -78,13 +81,16 @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
>  {
>      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
>      __atomic_store(haddr, &val, __ATOMIC_RELAXED);
> +    ATOMIC_MMU_CLEANUP;
>  }
>  #else
>  ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
>                             ABI_TYPE val EXTRA_ARGS)
>  {
>      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
> -    return atomic_xchg__nocheck(haddr, val);
> +    DATA_TYPE ret = atomic_xchg__nocheck(haddr, val);
> +    ATOMIC_MMU_CLEANUP;
> +    return ret;
>  }
>
>  #define GEN_ATOMIC_HELPER(X)                                        \
> @@ -92,8 +98,10 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
>                   ABI_TYPE val EXTRA_ARGS)                           \
>  {                                                                   \
>      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                           \
> -    return atomic_##X(haddr, val);                                  \
> -}                                                                   \
> +    DATA_TYPE ret = atomic_##X(haddr, val);                         \
> +    ATOMIC_MMU_CLEANUP;                                             \
> +    return ret;                                                     \
> +}
>
>  GEN_ATOMIC_HELPER(fetch_add)
>  GEN_ATOMIC_HELPER(fetch_and)
> @@ -123,7 +131,9 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
>                                ABI_TYPE cmpv, ABI_TYPE newv EXTRA_ARGS)
>  {
>      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
> -    return BSWAP(atomic_cmpxchg__nocheck(haddr, BSWAP(cmpv), BSWAP(newv)));
> +    DATA_TYPE ret = atomic_cmpxchg__nocheck(haddr, BSWAP(cmpv), BSWAP(newv));
> +    ATOMIC_MMU_CLEANUP;
> +    return BSWAP(ret);
>  }
>
>  #if DATA_SIZE >= 16
> @@ -131,6 +141,7 @@ ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
>  {
>      DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
>      __atomic_load(haddr, &val, __ATOMIC_RELAXED);
> +    ATOMIC_MMU_CLEANUP;
>      return BSWAP(val);
>  }
>
> @@ -140,13 +151,16 @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
>      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
>      val = BSWAP(val);
>      __atomic_store(haddr, &val, __ATOMIC_RELAXED);
> +    ATOMIC_MMU_CLEANUP;
>  }
>  #else
>  ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
>                             ABI_TYPE val EXTRA_ARGS)
>  {
>      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
> -    return BSWAP(atomic_xchg__nocheck(haddr, BSWAP(val)));
> +    ABI_TYPE ret = atomic_xchg__nocheck(haddr, BSWAP(val));
> +    ATOMIC_MMU_CLEANUP;
> +    return BSWAP(ret);
>  }
>
>  #define GEN_ATOMIC_HELPER(X)                                        \
> @@ -154,7 +168,9 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
>                   ABI_TYPE val EXTRA_ARGS)                           \
>  {                                                                   \
>      DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                           \
> -    return BSWAP(atomic_##X(haddr, BSWAP(val)));                    \
> +    DATA_TYPE ret = atomic_##X(haddr, BSWAP(val));                  \
> +    ATOMIC_MMU_CLEANUP;                                             \
> +    return BSWAP(ret);                                              \
>  }
>
>  GEN_ATOMIC_HELPER(fetch_and)
> @@ -180,6 +196,7 @@ ABI_TYPE ATOMIC_NAME(fetch_add)(CPUArchState *env, target_ulong addr,
>          sto = BSWAP(ret + val);
>          ldn = atomic_cmpxchg__nocheck(haddr, ldo, sto);
>          if (ldn == ldo) {
> +            ATOMIC_MMU_CLEANUP;
>              return ret;
>          }
>          ldo = ldn;
> @@ -198,6 +215,7 @@ ABI_TYPE ATOMIC_NAME(add_fetch)(CPUArchState *env, target_ulong addr,
>          sto = BSWAP(ret);
>          ldn = atomic_cmpxchg__nocheck(haddr, ldo, sto);
>          if (ldn == ldo) {
> +            ATOMIC_MMU_CLEANUP;
>              return ret;
>          }
>          ldo = ldn;
> diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
> index 6eb5fe80dc..191f2e962a 100644
> --- a/include/exec/cpu_ldst.h
> +++ b/include/exec/cpu_ldst.h
> @@ -76,6 +76,8 @@
>
>  #if defined(CONFIG_USER_ONLY)
>
> +extern __thread uintptr_t helper_retaddr;
> +
>  /* In user-only mode we provide only the _code and _data accessors. */
>
>  #define MEMSUFFIX _data
> diff --git a/include/exec/cpu_ldst_useronly_template.h b/include/exec/cpu_ldst_useronly_template.h
> index 7b8c7c506e..c168f31bba 100644
> --- a/include/exec/cpu_ldst_useronly_template.h
> +++ b/include/exec/cpu_ldst_useronly_template.h
> @@ -73,7 +73,11 @@ glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
>                                                    target_ulong ptr,
>                                                    uintptr_t retaddr)
>  {
> -    return glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(env, ptr);
> +    RES_TYPE ret;
> +    helper_retaddr = retaddr;
> +    ret = glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(env, ptr);
> +    helper_retaddr = 0;
> +    return ret;
>  }
>
>  #if DATA_SIZE <= 2
> @@ -93,7 +97,11 @@ glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
>                                                    target_ulong ptr,
>                                                    uintptr_t retaddr)
>  {
> -    return glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(env, ptr);
> +    int ret;
> +    helper_retaddr = retaddr;
> +    ret = glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(env, ptr);
> +    helper_retaddr = 0;
> +    return ret;
>  }
>  #endif
>
> @@ -116,7 +124,9 @@ glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
>                                                    RES_TYPE v,
>                                                    uintptr_t retaddr)
>  {
> +    helper_retaddr = retaddr;
>      glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(env, ptr, v);
> +    helper_retaddr = 0;
>  }
>  #endif
>
> diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
> index a23919c3a8..d071ca4d14 100644
> --- a/accel/tcg/cputlb.c
> +++ b/accel/tcg/cputlb.c
> @@ -1041,6 +1041,7 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
>  #define ATOMIC_NAME(X) \
>      HELPER(glue(glue(glue(atomic_ ## X, SUFFIX), END), _mmu))
>  #define ATOMIC_MMU_LOOKUP  atomic_mmu_lookup(env, addr, oi, retaddr)
> +#define ATOMIC_MMU_CLEANUP do { } while (0)
>
>  #define DATA_SIZE 1
>  #include "atomic_template.h"
> diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
> index 492ea0826c..0324ba8ad1 100644
> --- a/accel/tcg/user-exec.c
> +++ b/accel/tcg/user-exec.c
> @@ -39,6 +39,8 @@
>  #include <sys/ucontext.h>
>  #endif
>
> +__thread uintptr_t helper_retaddr;
> +
>  //#define DEBUG_SIGNAL
>
>  /* exit the current TB from a signal handler. The host registers are
> @@ -62,6 +64,27 @@ static inline int handle_cpu_signal(uintptr_t pc, unsigned long address,
>      CPUClass *cc;
>      int ret;
>
> +    /* We must handle PC addresses from two different sources:
> +     * a call return address and a signal frame address.
> +     *
> +     * Within cpu_restore_state_from_tb we assume the former and adjust
> +     * the address by -GETPC_ADJ so that the address is within the call
> +     * insn so that addr does not accidentally match the beginning of the
> +     * next guest insn.
> +     *
> +     * However, when the PC comes from the signal frame, it points to
> +     * the actual faulting host insn and not a call insn.  Subtracting
> +     * GETPC_ADJ in that case may accidentally match the previous guest insn.
> +     *
> +     * So for the later case, adjust forward to compensate for what
> +     * will be done later by cpu_restore_state_from_tb.
> +     */
> +    if (helper_retaddr) {
> +        pc = helper_retaddr;
> +    } else {
> +        pc += GETPC_ADJ;
> +    }
> +
>      /* For synchronous signals we expect to be coming from the vCPU
>       * thread (so current_cpu should be valid) and either from running
>       * code or during translation which can fault as we cross pages.
> @@ -84,21 +107,24 @@ static inline int handle_cpu_signal(uintptr_t pc, unsigned long address,
>          switch (page_unprotect(h2g(address), pc)) {
>          case 0:
>              /* Fault not caused by a page marked unwritable to protect
> -             * cached translations, must be the guest binary's problem
> +             * cached translations, must be the guest binary's problem.
>               */
>              break;
>          case 1:
>              /* Fault caused by protection of cached translation; TBs
> -             * invalidated, so resume execution
> +             * invalidated, so resume execution.  Retain helper_retaddr
> +             * for a possible second fault.
>               */
>              return 1;
>          case 2:
>              /* Fault caused by protection of cached translation, and the
>               * currently executing TB was modified and must be exited
> -             * immediately.
> +             * immediately.  Clear helper_retaddr for next execution.
>               */
> +            helper_retaddr = 0;
>              cpu_exit_tb_from_sighandler(cpu, old_set);
> -            g_assert_not_reached();
> +            /* NORETURN */
> +
>          default:
>              g_assert_not_reached();
>          }
> @@ -112,17 +138,25 @@ static inline int handle_cpu_signal(uintptr_t pc, unsigned long address,
>      /* see if it is an MMU fault */
>      g_assert(cc->handle_mmu_fault);
>      ret = cc->handle_mmu_fault(cpu, address, is_write, MMU_USER_IDX);
> +
> +    if (ret == 0) {
> +        /* The MMU fault was handled without causing real CPU fault.
> +         *  Retain helper_retaddr for a possible second fault.
> +         */
> +        return 1;
> +    }
> +
> +    /* All other paths lead to cpu_exit; clear helper_retaddr
> +     * for next execution.
> +     */
> +    helper_retaddr = 0;
> +
>      if (ret < 0) {
>          return 0; /* not an MMU fault */
>      }
> -    if (ret == 0) {
> -        return 1; /* the MMU fault was handled without causing real CPU fault */
> -    }
>
> -    /* Now we have a real cpu fault.  Since this is the exact location of
> -     * the exception, we must undo the adjustment done by cpu_restore_state
> -     * for handling call return addresses.  */
> -    cpu_restore_state(cpu, pc + GETPC_ADJ);
> +    /* Now we have a real cpu fault.  */
> +    cpu_restore_state(cpu, pc);

I can't help thinking when we get it wrong we should be doing something
here, maybe a LOG_UNIMP? Otherwise we silently fail or at least the
user-space falls off a cliff later.

Anyway, other than that minor nit:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Tested-by: Alex Bennée <alex.bennee@linaro.org>


>
>      sigprocmask(SIG_SETMASK, old_set, NULL);
>      cpu_loop_exit(cpu);
> @@ -585,11 +619,13 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
>      if (unlikely(addr & (size - 1))) {
>          cpu_loop_exit_atomic(ENV_GET_CPU(env), retaddr);
>      }
> +    helper_retaddr = retaddr;
>      return g2h(addr);
>  }
>
>  /* Macro to call the above, with local variables from the use context.  */
>  #define ATOMIC_MMU_LOOKUP  atomic_mmu_lookup(env, addr, DATA_SIZE, GETPC())
> +#define ATOMIC_MMU_CLEANUP do { helper_retaddr = 0; } while (0)
>
>  #define ATOMIC_NAME(X)   HELPER(glue(glue(atomic_ ## X, SUFFIX), END))
>  #define EXTRA_ARGS


--
Alex Bennée
Richard Henderson Nov. 15, 2017, 9:39 a.m. | #3
On 11/14/2017 05:09 PM, Alex Bennée wrote:
>> -    /* Now we have a real cpu fault.  Since this is the exact location of
>> -     * the exception, we must undo the adjustment done by cpu_restore_state
>> -     * for handling call return addresses.  */
>> -    cpu_restore_state(cpu, pc + GETPC_ADJ);
>> +    /* Now we have a real cpu fault.  */
>> +    cpu_restore_state(cpu, pc);
> 
> I can't help thinking when we get it wrong we should be doing something
> here, maybe a LOG_UNIMP? Otherwise we silently fail or at least the
> user-space falls off a cliff later.

Oh we silently get it wrong in so many ways.  E.g. zero callers of
cpu_restore_state_from_tb check its return status.  Anyway, I think this sort
of cleanup has to wait til next cycle.


r~

Patch

diff --git a/accel/tcg/atomic_template.h b/accel/tcg/atomic_template.h
index b400b2a3d3..1c7c17526c 100644
--- a/accel/tcg/atomic_template.h
+++ b/accel/tcg/atomic_template.h
@@ -62,7 +62,9 @@  ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
                               ABI_TYPE cmpv, ABI_TYPE newv EXTRA_ARGS)
 {
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
-    return atomic_cmpxchg__nocheck(haddr, cmpv, newv);
+    DATA_TYPE ret = atomic_cmpxchg__nocheck(haddr, cmpv, newv);
+    ATOMIC_MMU_CLEANUP;
+    return ret;
 }
 
 #if DATA_SIZE >= 16
@@ -70,6 +72,7 @@  ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
 {
     DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
     __atomic_load(haddr, &val, __ATOMIC_RELAXED);
+    ATOMIC_MMU_CLEANUP;
     return val;
 }
 
@@ -78,13 +81,16 @@  void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
 {
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
     __atomic_store(haddr, &val, __ATOMIC_RELAXED);
+    ATOMIC_MMU_CLEANUP;
 }
 #else
 ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
                            ABI_TYPE val EXTRA_ARGS)
 {
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
-    return atomic_xchg__nocheck(haddr, val);
+    DATA_TYPE ret = atomic_xchg__nocheck(haddr, val);
+    ATOMIC_MMU_CLEANUP;
+    return ret;
 }
 
 #define GEN_ATOMIC_HELPER(X)                                        \
@@ -92,8 +98,10 @@  ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
                  ABI_TYPE val EXTRA_ARGS)                           \
 {                                                                   \
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                           \
-    return atomic_##X(haddr, val);                                  \
-}                                                                   \
+    DATA_TYPE ret = atomic_##X(haddr, val);                         \
+    ATOMIC_MMU_CLEANUP;                                             \
+    return ret;                                                     \
+}
 
 GEN_ATOMIC_HELPER(fetch_add)
 GEN_ATOMIC_HELPER(fetch_and)
@@ -123,7 +131,9 @@  ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
                               ABI_TYPE cmpv, ABI_TYPE newv EXTRA_ARGS)
 {
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
-    return BSWAP(atomic_cmpxchg__nocheck(haddr, BSWAP(cmpv), BSWAP(newv)));
+    DATA_TYPE ret = atomic_cmpxchg__nocheck(haddr, BSWAP(cmpv), BSWAP(newv));
+    ATOMIC_MMU_CLEANUP;
+    return BSWAP(ret);
 }
 
 #if DATA_SIZE >= 16
@@ -131,6 +141,7 @@  ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
 {
     DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
     __atomic_load(haddr, &val, __ATOMIC_RELAXED);
+    ATOMIC_MMU_CLEANUP;
     return BSWAP(val);
 }
 
@@ -140,13 +151,16 @@  void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
     val = BSWAP(val);
     __atomic_store(haddr, &val, __ATOMIC_RELAXED);
+    ATOMIC_MMU_CLEANUP;
 }
 #else
 ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
                            ABI_TYPE val EXTRA_ARGS)
 {
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
-    return BSWAP(atomic_xchg__nocheck(haddr, BSWAP(val)));
+    ABI_TYPE ret = atomic_xchg__nocheck(haddr, BSWAP(val));
+    ATOMIC_MMU_CLEANUP;
+    return BSWAP(ret);
 }
 
 #define GEN_ATOMIC_HELPER(X)                                        \
@@ -154,7 +168,9 @@  ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,       \
                  ABI_TYPE val EXTRA_ARGS)                           \
 {                                                                   \
     DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;                           \
-    return BSWAP(atomic_##X(haddr, BSWAP(val)));                    \
+    DATA_TYPE ret = atomic_##X(haddr, BSWAP(val));                  \
+    ATOMIC_MMU_CLEANUP;                                             \
+    return BSWAP(ret);                                              \
 }
 
 GEN_ATOMIC_HELPER(fetch_and)
@@ -180,6 +196,7 @@  ABI_TYPE ATOMIC_NAME(fetch_add)(CPUArchState *env, target_ulong addr,
         sto = BSWAP(ret + val);
         ldn = atomic_cmpxchg__nocheck(haddr, ldo, sto);
         if (ldn == ldo) {
+            ATOMIC_MMU_CLEANUP;
             return ret;
         }
         ldo = ldn;
@@ -198,6 +215,7 @@  ABI_TYPE ATOMIC_NAME(add_fetch)(CPUArchState *env, target_ulong addr,
         sto = BSWAP(ret);
         ldn = atomic_cmpxchg__nocheck(haddr, ldo, sto);
         if (ldn == ldo) {
+            ATOMIC_MMU_CLEANUP;
             return ret;
         }
         ldo = ldn;
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index 6eb5fe80dc..191f2e962a 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -76,6 +76,8 @@ 
 
 #if defined(CONFIG_USER_ONLY)
 
+extern __thread uintptr_t helper_retaddr;
+
 /* In user-only mode we provide only the _code and _data accessors. */
 
 #define MEMSUFFIX _data
diff --git a/include/exec/cpu_ldst_useronly_template.h b/include/exec/cpu_ldst_useronly_template.h
index 7b8c7c506e..c168f31bba 100644
--- a/include/exec/cpu_ldst_useronly_template.h
+++ b/include/exec/cpu_ldst_useronly_template.h
@@ -73,7 +73,11 @@  glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
                                                   target_ulong ptr,
                                                   uintptr_t retaddr)
 {
-    return glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(env, ptr);
+    RES_TYPE ret;
+    helper_retaddr = retaddr;
+    ret = glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(env, ptr);
+    helper_retaddr = 0;
+    return ret;
 }
 
 #if DATA_SIZE <= 2
@@ -93,7 +97,11 @@  glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
                                                   target_ulong ptr,
                                                   uintptr_t retaddr)
 {
-    return glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(env, ptr);
+    int ret;
+    helper_retaddr = retaddr;
+    ret = glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(env, ptr);
+    helper_retaddr = 0;
+    return ret;
 }
 #endif
 
@@ -116,7 +124,9 @@  glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
                                                   RES_TYPE v,
                                                   uintptr_t retaddr)
 {
+    helper_retaddr = retaddr;
     glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(env, ptr, v);
+    helper_retaddr = 0;
 }
 #endif
 
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index a23919c3a8..d071ca4d14 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -1041,6 +1041,7 @@  static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
 #define ATOMIC_NAME(X) \
     HELPER(glue(glue(glue(atomic_ ## X, SUFFIX), END), _mmu))
 #define ATOMIC_MMU_LOOKUP  atomic_mmu_lookup(env, addr, oi, retaddr)
+#define ATOMIC_MMU_CLEANUP do { } while (0)
 
 #define DATA_SIZE 1
 #include "atomic_template.h"
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index 492ea0826c..0324ba8ad1 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -39,6 +39,8 @@ 
 #include <sys/ucontext.h>
 #endif
 
+__thread uintptr_t helper_retaddr;
+
 //#define DEBUG_SIGNAL
 
 /* exit the current TB from a signal handler. The host registers are
@@ -62,6 +64,27 @@  static inline int handle_cpu_signal(uintptr_t pc, unsigned long address,
     CPUClass *cc;
     int ret;
 
+    /* We must handle PC addresses from two different sources:
+     * a call return address and a signal frame address.
+     *
+     * Within cpu_restore_state_from_tb we assume the former and adjust
+     * the address by -GETPC_ADJ so that the address is within the call
+     * insn so that addr does not accidentally match the beginning of the
+     * next guest insn.
+     *
+     * However, when the PC comes from the signal frame, it points to
+     * the actual faulting host insn and not a call insn.  Subtracting
+     * GETPC_ADJ in that case may accidentally match the previous guest insn.
+     *
+     * So for the later case, adjust forward to compensate for what
+     * will be done later by cpu_restore_state_from_tb.
+     */
+    if (helper_retaddr) {
+        pc = helper_retaddr;
+    } else {
+        pc += GETPC_ADJ;
+    }
+
     /* For synchronous signals we expect to be coming from the vCPU
      * thread (so current_cpu should be valid) and either from running
      * code or during translation which can fault as we cross pages.
@@ -84,21 +107,24 @@  static inline int handle_cpu_signal(uintptr_t pc, unsigned long address,
         switch (page_unprotect(h2g(address), pc)) {
         case 0:
             /* Fault not caused by a page marked unwritable to protect
-             * cached translations, must be the guest binary's problem
+             * cached translations, must be the guest binary's problem.
              */
             break;
         case 1:
             /* Fault caused by protection of cached translation; TBs
-             * invalidated, so resume execution
+             * invalidated, so resume execution.  Retain helper_retaddr
+             * for a possible second fault.
              */
             return 1;
         case 2:
             /* Fault caused by protection of cached translation, and the
              * currently executing TB was modified and must be exited
-             * immediately.
+             * immediately.  Clear helper_retaddr for next execution.
              */
+            helper_retaddr = 0;
             cpu_exit_tb_from_sighandler(cpu, old_set);
-            g_assert_not_reached();
+            /* NORETURN */
+
         default:
             g_assert_not_reached();
         }
@@ -112,17 +138,25 @@  static inline int handle_cpu_signal(uintptr_t pc, unsigned long address,
     /* see if it is an MMU fault */
     g_assert(cc->handle_mmu_fault);
     ret = cc->handle_mmu_fault(cpu, address, is_write, MMU_USER_IDX);
+
+    if (ret == 0) {
+        /* The MMU fault was handled without causing real CPU fault.
+         *  Retain helper_retaddr for a possible second fault.
+         */
+        return 1;
+    }
+
+    /* All other paths lead to cpu_exit; clear helper_retaddr
+     * for next execution.
+     */
+    helper_retaddr = 0;
+
     if (ret < 0) {
         return 0; /* not an MMU fault */
     }
-    if (ret == 0) {
-        return 1; /* the MMU fault was handled without causing real CPU fault */
-    }
 
-    /* Now we have a real cpu fault.  Since this is the exact location of
-     * the exception, we must undo the adjustment done by cpu_restore_state
-     * for handling call return addresses.  */
-    cpu_restore_state(cpu, pc + GETPC_ADJ);
+    /* Now we have a real cpu fault.  */
+    cpu_restore_state(cpu, pc);
 
     sigprocmask(SIG_SETMASK, old_set, NULL);
     cpu_loop_exit(cpu);
@@ -585,11 +619,13 @@  static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
     if (unlikely(addr & (size - 1))) {
         cpu_loop_exit_atomic(ENV_GET_CPU(env), retaddr);
     }
+    helper_retaddr = retaddr;
     return g2h(addr);
 }
 
 /* Macro to call the above, with local variables from the use context.  */
 #define ATOMIC_MMU_LOOKUP  atomic_mmu_lookup(env, addr, DATA_SIZE, GETPC())
+#define ATOMIC_MMU_CLEANUP do { helper_retaddr = 0; } while (0)
 
 #define ATOMIC_NAME(X)   HELPER(glue(glue(atomic_ ## X, SUFFIX), END))
 #define EXTRA_ARGS