Patchwork [1/1] tcg/aarch64: Implement tlb lookup fast path

login
register
mail settings
Submitter Jani Kokkonen
Date June 20, 2013, 10:53 a.m.
Message ID <51C2DF23.3020202@huawei.com>
Download mbox | patch
Permalink /patch/252875/
State New
Headers show

Comments

Jani Kokkonen - June 20, 2013, 10:53 a.m.
Supports CONFIG_QEMU_LDST_OPTIMIZATION

Signed-off-by: Jani Kokkonen <jani.kokkonen@huawei.com>
---
 configure                |    2 +-
 include/exec/exec-all.h  |   16 +++-
 tcg/aarch64/tcg-target.c |  197 ++++++++++++++++++++++++++++++++++------------
 3 files changed, 162 insertions(+), 53 deletions(-)
Claudio Fontana - June 20, 2013, 2:58 p.m.
Hi Jani,

On 20.06.2013 12:53, Jani Kokkonen wrote:
> Supports CONFIG_QEMU_LDST_OPTIMIZATION
> 
> Signed-off-by: Jani Kokkonen <jani.kokkonen@huawei.com>
> ---
>  configure                |    2 +-
>  include/exec/exec-all.h  |   16 +++-
>  tcg/aarch64/tcg-target.c |  197 ++++++++++++++++++++++++++++++++++------------
>  3 files changed, 162 insertions(+), 53 deletions(-)
> 
> diff --git a/configure b/configure
> index bed0104..6a36682 100755
> --- a/configure
> +++ b/configure
> @@ -3599,7 +3599,7 @@ echo "libs_softmmu=$libs_softmmu" >> $config_host_mak
>  echo "ARCH=$ARCH" >> $config_host_mak
>  
>  case "$cpu" in
> -  arm|i386|x86_64|ppc)
> +  arm|i386|x86_64|ppc|aarch64)
>      # The TCG interpreter currently does not support ld/st optimization.
>      if test "$tcg_interpreter" = "no" ; then
>          echo "CONFIG_QEMU_LDST_OPTIMIZATION=y" >> $config_host_mak
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 5c31863..39c28d1 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -19,9 +19,7 @@
>  
>  #ifndef _EXEC_ALL_H_
>  #define _EXEC_ALL_H_
> -
>  #include "qemu-common.h"
> -

let's remove the whitespace change. There is some of this below as well.

>  /* allow to see translation results - the slowdown should be negligible, so we leave it */
>  #define DEBUG_DISAS
>  
> @@ -358,6 +356,20 @@ static inline uintptr_t tcg_getpc_ldst(uintptr_t ra)
>                                     not the start of the next opcode  */
>      return ra;
>  }
> +#elif defined(__aarch64__)
> +#  define GETRA()       ((uintptr_t)__builtin_return_address(0))
> +#  define GETPC_LDST()  tcg_getpc_ldst(GETRA())
> +static inline uintptr_t tcg_getpc_ldst(uintptr_t ra)
> +{
> +    int32_t b;
> +    ra += 4;                    /* skip one instruction */
> +    b = *(int32_t *)ra;         /* load the branch insn */
> +    b = (b << 6) >> (6 - 2);    /* extract the displacement */
> +    ra += b;                    /* apply the displacement  */
> +    ra -= 4;                    /* return a pointer into the current opcode,
> +                                   not the start of the next opcode  */
> +    return ra;
> +}
>  # else
>  #  error "CONFIG_QEMU_LDST_OPTIMIZATION needs GETPC_LDST() implementation!"
>  # endif
> diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
> index 8bb195e..03da082 100644
> --- a/tcg/aarch64/tcg-target.c
> +++ b/tcg/aarch64/tcg-target.c
> @@ -706,6 +706,41 @@ static inline void tcg_out_uxt(TCGContext *s, int s_bits,
>      tcg_out_ubfm(s, 0, rd, rn, 0, bits);
>  }
>  
> +static inline void tcg_out_addi(TCGContext *s, int ext,
> +                                TCGReg rd, TCGReg rn, unsigned int aimm)
> +{
> +    /* add immediate aimm unsigned 12bit value (with LSL 0 or 12) */
> +    /* using ADD 0x11000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
> +    unsigned int base = ext ? 0x91000000 : 0x11000000;
> +
> +    if (aimm <= 0xfff) {
> +        aimm <<= 10;
> +    } else {
> +        /* we can only shift left by 12, on assert we cannot represent */
> +        assert(!(aimm & 0xfff));
> +        assert(aimm <= 0xfff000);
> +        base |= 1 << 22; /* apply LSL 12 */
> +        aimm >>= 2;
> +    }
> +
> +    tcg_out32(s, base | aimm | (rn << 5) | rd);
> +}
> +
> +static inline void tcg_out_subi(TCGContext *s, int ext,
> +                                TCGReg rd, TCGReg rn, unsigned int aimm)
> +{
> +    /* sub immediate aimm unsigned 12bit value (we use LSL 0 - no shift) */
> +    /* using SUB 0x51000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
> +    unsigned int base = ext ? 0xd1000000 : 0x51000000;

hmm maybe it makes sense to add the 12-bit left shift here as well, just like
tcg_out_addi, for consistency.

> +    assert(aimm <= 0xfff);
> +    tcg_out32(s, base | (aimm << 10) | (rn << 5) | rd);
> +}
> +
> +static inline void tcg_out_nop(TCGContext *s)
> +{
> +    tcg_out32(s, 0xd503201f);
> +}
> +
>  #ifdef CONFIG_SOFTMMU
>  #include "exec/softmmu_defs.h"
>  
> @@ -727,7 +762,106 @@ static const void * const qemu_st_helpers[4] = {
>      helper_stq_mmu,
>  };
>  
> -#else /* !CONFIG_SOFTMMU */
> +static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
> +{
> +    reloc_pc19(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
> +    tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
> +    tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, lb->addrlo_reg);
> +    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, lb->mem_index);
> +    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
> +                 (tcg_target_long)qemu_ld_helpers[lb->opc & 3]);
> +    tcg_out_callr(s, TCG_REG_TMP);
> +    if (lb->opc & 0x04) {
> +        tcg_out_sxt(s, 1, lb->opc & 3, lb->datalo_reg, TCG_REG_X0);
> +    } else {
> +        tcg_out_movr(s, 1, lb->datalo_reg, TCG_REG_X0);
> +    }
> +
> +    tcg_out_goto(s, (tcg_target_long)lb->raddr);
> +}
> +
> +static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
> +{
> +    reloc_pc19(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
> +
> +    tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
> +    tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, lb->addrlo_reg);
> +    tcg_out_movr(s, 1, TCG_REG_X2, lb->datalo_reg);
> +    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, lb->mem_index);
> +    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
> +                 (tcg_target_long)qemu_st_helpers[lb->opc & 3]);
> +    tcg_out_callr(s, TCG_REG_TMP);
> +
> +    tcg_out_nop(s);
> +    tcg_out_goto(s, (tcg_target_long)lb->raddr);
> +}
> +
> +void tcg_out_tb_finalize(TCGContext *s)
> +{
> +    int i;
> +    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
> +        TCGLabelQemuLdst *label = &s->qemu_ldst_labels[i];
> +        if (label->is_ld) {
> +            tcg_out_qemu_ld_slow_path(s, label);
> +        } else {
> +            tcg_out_qemu_st_slow_path(s, label);
> +        }
> +    }
> +}
> +
> +static void add_qemu_ldst_label(TCGContext *s, int is_ld, int opc,
> +                                int data_reg, int addr_reg,
> +                                int mem_index,
> +                                uint8_t *raddr, uint8_t *label_ptr)
> +{
> +    int idx;
> +    TCGLabelQemuLdst *label;
> +
> +    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
> +        tcg_abort();
> +    }
> +
> +    idx = s->nb_qemu_ldst_labels++;
> +    label = &s->qemu_ldst_labels[idx];
> +    label->is_ld = is_ld;
> +    label->opc = opc;
> +    label->datalo_reg = data_reg;
> +    label->addrlo_reg = addr_reg;
> +    label->mem_index = mem_index;
> +    label->raddr = raddr;
> +    label->label_ptr[0] = label_ptr;
> +}
> +
> +/* Load and compare a TLB entry, emitting the conditional jump to the
> +slow path for the failure case, which will be patched later when finalizing
> +the slow pathClobbers X0,X1,X2,X3 and TMP.  */

"the slow path. Clobbers ..."
btw we should also mention here that, for the in-line fast path, the generated
code puts the host addend into X1.

> +
> +static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg,
> +            int s_bits, uint8_t **label_ptr, int tlb_offset, int is_read)
> +{
> +    TCGReg base = TCG_AREG0;
> +
> +    tcg_out_ubfm(s, TARGET_LONG_BITS == 64 ? 1 : 0, TCG_REG_X0, addr_reg,
> +           TARGET_PAGE_BITS, TARGET_PAGE_BITS + CPU_TLB_BITS);
> +
> +    tcg_out_andi(s, TARGET_LONG_BITS == 64 ? 1 : 0, TCG_REG_X3, addr_reg,
> +           TARGET_LONG_BITS - TARGET_PAGE_BITS + s_bits,
> +                   TARGET_LONG_BITS - TARGET_PAGE_BITS);
> +
> +    tcg_out_addi(s, 1, TCG_REG_X2, base, tlb_offset & 0xfff000);
> +    tcg_out_arith(s, ARITH_ADD, 1, TCG_REG_X2, TCG_REG_X2,
> +                   TCG_REG_X0, -CPU_TLB_ENTRY_BITS);
> +    tcg_out_ldst(s, TARGET_LONG_BITS == 64 ? LDST_64 : LDST_32,
> +                  LDST_LD, TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff);
> +    tcg_out_ldst(s, LDST_64, LDST_LD, TCG_REG_X1, TCG_REG_X2,
> +        (tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend) -
> +             (is_read ? offsetof(CPUTLBEntry, addr_read) :
> +                   offsetof(CPUTLBEntry, addr_write))));
> +
> +    tcg_out_cmp(s, 1, TCG_REG_X0, TCG_REG_X3, 0);
> +    *label_ptr = s->code_ptr;
> +    tcg_out_goto_cond_noaddr(s, TCG_COND_NE);
> +}

hmm should not the compare and branch actually be before the loading of the addend?
If we jump to the slow path we don't need to load the addend do we?

>  
>  static void tcg_out_qemu_ld_direct(TCGContext *s, int opc, TCGReg data_r,
>                                     TCGReg addr_r, TCGReg off_r)
> @@ -822,6 +956,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>      TCGReg addr_reg, data_reg;
>  #ifdef CONFIG_SOFTMMU
>      int mem_index, s_bits;
> +    uint8_t *label_ptr;
>  #endif
>      data_reg = args[0];
>      addr_reg = args[1];
> @@ -829,23 +964,11 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>  #ifdef CONFIG_SOFTMMU
>      mem_index = args[2];
>      s_bits = opc & 3;
> -
> -    /* TODO: insert TLB lookup here */
> -
> -    /* all arguments passed via registers */
> -    tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
> -    tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, addr_reg);
> -    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, mem_index);
> -    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
> -                 (tcg_target_long)qemu_ld_helpers[s_bits]);
> -    tcg_out_callr(s, TCG_REG_TMP);
> -
> -    if (opc & 0x04) { /* sign extend */
> -        tcg_out_sxt(s, 1, s_bits, data_reg, TCG_REG_X0);
> -    } else {
> -        tcg_out_movr(s, 1, data_reg, TCG_REG_X0);
> -    }
> -
> +    tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr,
> +              offsetof(CPUArchState, tlb_table[mem_index][0].addr_read), 1);
> +    tcg_out_qemu_ld_direct(s, opc, data_reg, addr_reg, TCG_REG_X1);
> +    add_qemu_ldst_label(s, 1, opc, data_reg, addr_reg,
> +                        mem_index, s->code_ptr, label_ptr);
>  #else /* !CONFIG_SOFTMMU */
>      tcg_out_qemu_ld_direct(s, opc, data_reg, addr_reg,
>                             GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR);
> @@ -857,25 +980,19 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
>      TCGReg addr_reg, data_reg;
>  #ifdef CONFIG_SOFTMMU
>      int mem_index, s_bits;
> +    uint8_t *label_ptr;
>  #endif
>      data_reg = args[0];
>      addr_reg = args[1];
> -

- whitespace change

>  #ifdef CONFIG_SOFTMMU
>      mem_index = args[2];
>      s_bits = opc & 3;
>  
> -    /* TODO: insert TLB lookup here */
> -
> -    /* all arguments passed via registers */
> -    tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
> -    tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, addr_reg);
> -    tcg_out_movr(s, 1, TCG_REG_X2, data_reg);
> -    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, mem_index);
> -    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
> -                 (tcg_target_long)qemu_st_helpers[s_bits]);
> -    tcg_out_callr(s, TCG_REG_TMP);
> -
> +    tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr,
> +             offsetof(CPUArchState, tlb_table[mem_index][0].addr_write), 0);
> +    tcg_out_qemu_st_direct(s, opc, data_reg, addr_reg, TCG_REG_X1);
> +    add_qemu_ldst_label(s, 0, opc, data_reg, addr_reg,
> +                        mem_index, s->code_ptr, label_ptr);
>  #else /* !CONFIG_SOFTMMU */
>      tcg_out_qemu_st_direct(s, opc, data_reg, addr_reg,
>                             GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR);
> @@ -1318,26 +1435,6 @@ static void tcg_target_init(TCGContext *s)
>      tcg_add_target_add_op_defs(aarch64_op_defs);
>  }
>  
> -static inline void tcg_out_addi(TCGContext *s, int ext,
> -                                TCGReg rd, TCGReg rn, unsigned int aimm)
> -{
> -    /* add immediate aimm unsigned 12bit value (we use LSL 0 - no shift) */
> -    /* using ADD 0x11000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
> -    unsigned int base = ext ? 0x91000000 : 0x11000000;
> -    assert(aimm <= 0xfff);
> -    tcg_out32(s, base | (aimm << 10) | (rn << 5) | rd);
> -}
> -
> -static inline void tcg_out_subi(TCGContext *s, int ext,
> -                                TCGReg rd, TCGReg rn, unsigned int aimm)
> -{
> -    /* sub immediate aimm unsigned 12bit value (we use LSL 0 - no shift) */
> -    /* using SUB 0x51000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
> -    unsigned int base = ext ? 0xd1000000 : 0x51000000;
> -    assert(aimm <= 0xfff);
> -    tcg_out32(s, base | (aimm << 10) | (rn << 5) | rd);
> -}
> -
>  static void tcg_target_qemu_prologue(TCGContext *s)
>  {
>      /* NB: frame sizes are in 16 byte stack units! */
> 

I'll take a more in depth look next week.

Ciao,

Claudio
Richard Henderson - June 20, 2013, 4:55 p.m.
On 06/20/2013 07:58 AM, Claudio Fontana wrote:
>> > +    tcg_out_ldst(s, TARGET_LONG_BITS == 64 ? LDST_64 : LDST_32,
>> > +                  LDST_LD, TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff);
>> > +    tcg_out_ldst(s, LDST_64, LDST_LD, TCG_REG_X1, TCG_REG_X2,
>> > +        (tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend) -
>> > +             (is_read ? offsetof(CPUTLBEntry, addr_read) :
>> > +                   offsetof(CPUTLBEntry, addr_write))));
>> > +
>> > +    tcg_out_cmp(s, 1, TCG_REG_X0, TCG_REG_X3, 0);
>> > +    *label_ptr = s->code_ptr;
>> > +    tcg_out_goto_cond_noaddr(s, TCG_COND_NE);
>> > +}
> hmm should not the compare and branch actually be before the loading of the addend?
> If we jump to the slow path we don't need to load the addend do we?
> 

No, but it's the slow path, and we don't care if we do extra work.
What's more important is minimizing the memory load delay for the
fast path.


r~
Richard Henderson - June 20, 2013, 5:05 p.m.
On 06/20/2013 03:53 AM, Jani Kokkonen wrote:
>  #ifndef _EXEC_ALL_H_
>  #define _EXEC_ALL_H_
> -
>  #include "qemu-common.h"
> -

Whitespace change?

> +/* Load and compare a TLB entry, emitting the conditional jump to the
> +slow path for the failure case, which will be patched later when finalizing
> +the slow pathClobbers X0,X1,X2,X3 and TMP.  */

Indentation.

> +    tcg_out_ldst(s, TARGET_LONG_BITS == 64 ? LDST_64 : LDST_32,
> +                  LDST_LD, TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff);
> +    tcg_out_ldst(s, LDST_64, LDST_LD, TCG_REG_X1, TCG_REG_X2,
> +        (tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend) -
> +             (is_read ? offsetof(CPUTLBEntry, addr_read) :
> +                   offsetof(CPUTLBEntry, addr_write))));

I wonder if it wouldn't be clearer to not include the addr_read/write offset in
the passed tlb_offset value.  So more like

  int tlb_offset = offsetof(CPUArchState, tlb_table[mem_index])

  tcg_out_ldst(s, TARGET_LONG_BITS == 64 ? LDST_64 : LDST_32,
               LDST_LD, TCG_REG_X0, TCG_REG_X2,
               (tlb_offset & 0xfff) +
               (is_read ? offsetof(CPUTLBEntry, addr_read)
                : offsetof(CPUTLBEntry, addr_write)));
  tcg_out_ldst(s, LDST_64, LDST_LD, TCG_REG_X1, TCG_REG_X2,
               (tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend));

and then in the two callers pass down mem_index instead of tlb_offset.

In addition, the function could use some commentary.


r~

Patch

diff --git a/configure b/configure
index bed0104..6a36682 100755
--- a/configure
+++ b/configure
@@ -3599,7 +3599,7 @@  echo "libs_softmmu=$libs_softmmu" >> $config_host_mak
 echo "ARCH=$ARCH" >> $config_host_mak
 
 case "$cpu" in
-  arm|i386|x86_64|ppc)
+  arm|i386|x86_64|ppc|aarch64)
     # The TCG interpreter currently does not support ld/st optimization.
     if test "$tcg_interpreter" = "no" ; then
         echo "CONFIG_QEMU_LDST_OPTIMIZATION=y" >> $config_host_mak
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 5c31863..39c28d1 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -19,9 +19,7 @@ 
 
 #ifndef _EXEC_ALL_H_
 #define _EXEC_ALL_H_
-
 #include "qemu-common.h"
-
 /* allow to see translation results - the slowdown should be negligible, so we leave it */
 #define DEBUG_DISAS
 
@@ -358,6 +356,20 @@  static inline uintptr_t tcg_getpc_ldst(uintptr_t ra)
                                    not the start of the next opcode  */
     return ra;
 }
+#elif defined(__aarch64__)
+#  define GETRA()       ((uintptr_t)__builtin_return_address(0))
+#  define GETPC_LDST()  tcg_getpc_ldst(GETRA())
+static inline uintptr_t tcg_getpc_ldst(uintptr_t ra)
+{
+    int32_t b;
+    ra += 4;                    /* skip one instruction */
+    b = *(int32_t *)ra;         /* load the branch insn */
+    b = (b << 6) >> (6 - 2);    /* extract the displacement */
+    ra += b;                    /* apply the displacement  */
+    ra -= 4;                    /* return a pointer into the current opcode,
+                                   not the start of the next opcode  */
+    return ra;
+}
 # else
 #  error "CONFIG_QEMU_LDST_OPTIMIZATION needs GETPC_LDST() implementation!"
 # endif
diff --git a/tcg/aarch64/tcg-target.c b/tcg/aarch64/tcg-target.c
index 8bb195e..03da082 100644
--- a/tcg/aarch64/tcg-target.c
+++ b/tcg/aarch64/tcg-target.c
@@ -706,6 +706,41 @@  static inline void tcg_out_uxt(TCGContext *s, int s_bits,
     tcg_out_ubfm(s, 0, rd, rn, 0, bits);
 }
 
+static inline void tcg_out_addi(TCGContext *s, int ext,
+                                TCGReg rd, TCGReg rn, unsigned int aimm)
+{
+    /* add immediate aimm unsigned 12bit value (with LSL 0 or 12) */
+    /* using ADD 0x11000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
+    unsigned int base = ext ? 0x91000000 : 0x11000000;
+
+    if (aimm <= 0xfff) {
+        aimm <<= 10;
+    } else {
+        /* we can only shift left by 12, on assert we cannot represent */
+        assert(!(aimm & 0xfff));
+        assert(aimm <= 0xfff000);
+        base |= 1 << 22; /* apply LSL 12 */
+        aimm >>= 2;
+    }
+
+    tcg_out32(s, base | aimm | (rn << 5) | rd);
+}
+
+static inline void tcg_out_subi(TCGContext *s, int ext,
+                                TCGReg rd, TCGReg rn, unsigned int aimm)
+{
+    /* sub immediate aimm unsigned 12bit value (we use LSL 0 - no shift) */
+    /* using SUB 0x51000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
+    unsigned int base = ext ? 0xd1000000 : 0x51000000;
+    assert(aimm <= 0xfff);
+    tcg_out32(s, base | (aimm << 10) | (rn << 5) | rd);
+}
+
+static inline void tcg_out_nop(TCGContext *s)
+{
+    tcg_out32(s, 0xd503201f);
+}
+
 #ifdef CONFIG_SOFTMMU
 #include "exec/softmmu_defs.h"
 
@@ -727,7 +762,106 @@  static const void * const qemu_st_helpers[4] = {
     helper_stq_mmu,
 };
 
-#else /* !CONFIG_SOFTMMU */
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+{
+    reloc_pc19(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
+    tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
+    tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, lb->addrlo_reg);
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
+                 (tcg_target_long)qemu_ld_helpers[lb->opc & 3]);
+    tcg_out_callr(s, TCG_REG_TMP);
+    if (lb->opc & 0x04) {
+        tcg_out_sxt(s, 1, lb->opc & 3, lb->datalo_reg, TCG_REG_X0);
+    } else {
+        tcg_out_movr(s, 1, lb->datalo_reg, TCG_REG_X0);
+    }
+
+    tcg_out_goto(s, (tcg_target_long)lb->raddr);
+}
+
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+{
+    reloc_pc19(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
+
+    tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
+    tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, lb->addrlo_reg);
+    tcg_out_movr(s, 1, TCG_REG_X2, lb->datalo_reg);
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, lb->mem_index);
+    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
+                 (tcg_target_long)qemu_st_helpers[lb->opc & 3]);
+    tcg_out_callr(s, TCG_REG_TMP);
+
+    tcg_out_nop(s);
+    tcg_out_goto(s, (tcg_target_long)lb->raddr);
+}
+
+void tcg_out_tb_finalize(TCGContext *s)
+{
+    int i;
+    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
+        TCGLabelQemuLdst *label = &s->qemu_ldst_labels[i];
+        if (label->is_ld) {
+            tcg_out_qemu_ld_slow_path(s, label);
+        } else {
+            tcg_out_qemu_st_slow_path(s, label);
+        }
+    }
+}
+
+static void add_qemu_ldst_label(TCGContext *s, int is_ld, int opc,
+                                int data_reg, int addr_reg,
+                                int mem_index,
+                                uint8_t *raddr, uint8_t *label_ptr)
+{
+    int idx;
+    TCGLabelQemuLdst *label;
+
+    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
+        tcg_abort();
+    }
+
+    idx = s->nb_qemu_ldst_labels++;
+    label = &s->qemu_ldst_labels[idx];
+    label->is_ld = is_ld;
+    label->opc = opc;
+    label->datalo_reg = data_reg;
+    label->addrlo_reg = addr_reg;
+    label->mem_index = mem_index;
+    label->raddr = raddr;
+    label->label_ptr[0] = label_ptr;
+}
+
+/* Load and compare a TLB entry, emitting the conditional jump to the
+slow path for the failure case, which will be patched later when finalizing
+the slow pathClobbers X0,X1,X2,X3 and TMP.  */
+
+static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg,
+            int s_bits, uint8_t **label_ptr, int tlb_offset, int is_read)
+{
+    TCGReg base = TCG_AREG0;
+
+    tcg_out_ubfm(s, TARGET_LONG_BITS == 64 ? 1 : 0, TCG_REG_X0, addr_reg,
+           TARGET_PAGE_BITS, TARGET_PAGE_BITS + CPU_TLB_BITS);
+
+    tcg_out_andi(s, TARGET_LONG_BITS == 64 ? 1 : 0, TCG_REG_X3, addr_reg,
+           TARGET_LONG_BITS - TARGET_PAGE_BITS + s_bits,
+                   TARGET_LONG_BITS - TARGET_PAGE_BITS);
+
+    tcg_out_addi(s, 1, TCG_REG_X2, base, tlb_offset & 0xfff000);
+    tcg_out_arith(s, ARITH_ADD, 1, TCG_REG_X2, TCG_REG_X2,
+                   TCG_REG_X0, -CPU_TLB_ENTRY_BITS);
+    tcg_out_ldst(s, TARGET_LONG_BITS == 64 ? LDST_64 : LDST_32,
+                  LDST_LD, TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff);
+    tcg_out_ldst(s, LDST_64, LDST_LD, TCG_REG_X1, TCG_REG_X2,
+        (tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend) -
+             (is_read ? offsetof(CPUTLBEntry, addr_read) :
+                   offsetof(CPUTLBEntry, addr_write))));
+
+    tcg_out_cmp(s, 1, TCG_REG_X0, TCG_REG_X3, 0);
+    *label_ptr = s->code_ptr;
+    tcg_out_goto_cond_noaddr(s, TCG_COND_NE);
+}
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, int opc, TCGReg data_r,
                                    TCGReg addr_r, TCGReg off_r)
@@ -822,6 +956,7 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
     TCGReg addr_reg, data_reg;
 #ifdef CONFIG_SOFTMMU
     int mem_index, s_bits;
+    uint8_t *label_ptr;
 #endif
     data_reg = args[0];
     addr_reg = args[1];
@@ -829,23 +964,11 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
 #ifdef CONFIG_SOFTMMU
     mem_index = args[2];
     s_bits = opc & 3;
-
-    /* TODO: insert TLB lookup here */
-
-    /* all arguments passed via registers */
-    tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
-    tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, addr_reg);
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, mem_index);
-    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
-                 (tcg_target_long)qemu_ld_helpers[s_bits]);
-    tcg_out_callr(s, TCG_REG_TMP);
-
-    if (opc & 0x04) { /* sign extend */
-        tcg_out_sxt(s, 1, s_bits, data_reg, TCG_REG_X0);
-    } else {
-        tcg_out_movr(s, 1, data_reg, TCG_REG_X0);
-    }
-
+    tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr,
+              offsetof(CPUArchState, tlb_table[mem_index][0].addr_read), 1);
+    tcg_out_qemu_ld_direct(s, opc, data_reg, addr_reg, TCG_REG_X1);
+    add_qemu_ldst_label(s, 1, opc, data_reg, addr_reg,
+                        mem_index, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     tcg_out_qemu_ld_direct(s, opc, data_reg, addr_reg,
                            GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR);
@@ -857,25 +980,19 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
     TCGReg addr_reg, data_reg;
 #ifdef CONFIG_SOFTMMU
     int mem_index, s_bits;
+    uint8_t *label_ptr;
 #endif
     data_reg = args[0];
     addr_reg = args[1];
-
 #ifdef CONFIG_SOFTMMU
     mem_index = args[2];
     s_bits = opc & 3;
 
-    /* TODO: insert TLB lookup here */
-
-    /* all arguments passed via registers */
-    tcg_out_movr(s, 1, TCG_REG_X0, TCG_AREG0);
-    tcg_out_movr(s, (TARGET_LONG_BITS == 64), TCG_REG_X1, addr_reg);
-    tcg_out_movr(s, 1, TCG_REG_X2, data_reg);
-    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, mem_index);
-    tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP,
-                 (tcg_target_long)qemu_st_helpers[s_bits]);
-    tcg_out_callr(s, TCG_REG_TMP);
-
+    tcg_out_tlb_read(s, addr_reg, s_bits, &label_ptr,
+             offsetof(CPUArchState, tlb_table[mem_index][0].addr_write), 0);
+    tcg_out_qemu_st_direct(s, opc, data_reg, addr_reg, TCG_REG_X1);
+    add_qemu_ldst_label(s, 0, opc, data_reg, addr_reg,
+                        mem_index, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     tcg_out_qemu_st_direct(s, opc, data_reg, addr_reg,
                            GUEST_BASE ? TCG_REG_GUEST_BASE : TCG_REG_XZR);
@@ -1318,26 +1435,6 @@  static void tcg_target_init(TCGContext *s)
     tcg_add_target_add_op_defs(aarch64_op_defs);
 }
 
-static inline void tcg_out_addi(TCGContext *s, int ext,
-                                TCGReg rd, TCGReg rn, unsigned int aimm)
-{
-    /* add immediate aimm unsigned 12bit value (we use LSL 0 - no shift) */
-    /* using ADD 0x11000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
-    unsigned int base = ext ? 0x91000000 : 0x11000000;
-    assert(aimm <= 0xfff);
-    tcg_out32(s, base | (aimm << 10) | (rn << 5) | rd);
-}
-
-static inline void tcg_out_subi(TCGContext *s, int ext,
-                                TCGReg rd, TCGReg rn, unsigned int aimm)
-{
-    /* sub immediate aimm unsigned 12bit value (we use LSL 0 - no shift) */
-    /* using SUB 0x51000000 | (ext) | (aimm << 10) | (rn << 5) | rd */
-    unsigned int base = ext ? 0xd1000000 : 0x51000000;
-    assert(aimm <= 0xfff);
-    tcg_out32(s, base | (aimm << 10) | (rn << 5) | rd);
-}
-
 static void tcg_target_qemu_prologue(TCGContext *s)
 {
     /* NB: frame sizes are in 16 byte stack units! */