Patchwork [v5,18/19] tcg-arm: Convert to CONFIG_QEMU_LDST_OPTIMIZATION

login
register
mail settings
Submitter Richard Henderson
Date March 31, 2013, 10:35 p.m.
Message ID <1364769305-3687-19-git-send-email-rth@twiddle.net>
Download mbox | patch
Permalink /patch/232640/
State New
Headers show

Comments

Richard Henderson - March 31, 2013, 10:35 p.m.
Move the slow path out of line, as the TODO's mention.
This allows the fast path to be unconditional, which can
speed up the fast path as well, depending on the core.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 configure               |   2 +-
 include/exec/exec-all.h |  17 +++
 tcg/arm/tcg-target.c    | 344 +++++++++++++++++++++++++++++-------------------
 3 files changed, 230 insertions(+), 133 deletions(-)
Aurelien Jarno - April 22, 2013, 12:59 p.m.
On Sun, Mar 31, 2013 at 03:35:04PM -0700, Richard Henderson wrote:
> Move the slow path out of line, as the TODO's mention.
> This allows the fast path to be unconditional, which can
> speed up the fast path as well, depending on the core.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  configure               |   2 +-
>  include/exec/exec-all.h |  17 +++
>  tcg/arm/tcg-target.c    | 344 +++++++++++++++++++++++++++++-------------------
>  3 files changed, 230 insertions(+), 133 deletions(-)
> 
> diff --git a/configure b/configure
> index f2af714..bae3132 100755
> --- a/configure
> +++ b/configure
> @@ -4124,7 +4124,7 @@ upper() {
>  }
>  
>  case "$cpu" in
> -  i386|x86_64|ppc)
> +  arm|i386|x86_64|ppc)
>      # The TCG interpreter currently does not support ld/st optimization.
>      if test "$tcg_interpreter" = "no" ; then
>          echo "CONFIG_QEMU_LDST_OPTIMIZATION=y" >> $config_target_mak
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index e856191..88e87f0 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -338,6 +338,23 @@ extern uintptr_t tci_tb_ptr;
>  # elif defined (_ARCH_PPC) && !defined (_ARCH_PPC64)
>  #  define GETRA() ((uintptr_t)__builtin_return_address(0))
>  #  define GETPC_LDST() ((uintptr_t) ((*(int32_t *)(GETRA() - 4)) - 1))
> +# elif defined (__arm__)
> +/* We define two insns between the return address and the branch back to
> +   straight-line.  Find and decode that branch insn.  */
> +#  define GETRA()       ((uintptr_t)__builtin_return_address(0))
> +#  define GETPC_LDST()  tcg_getpc_ldst(GETRA())
> +static inline uintptr_t tcg_getpc_ldst(uintptr_t ra)
> +{
> +    int32_t b;
> +    ra += 8;                    /* skip the two insns */
> +    b = *(int32_t *)ra;         /* load the branch insn */
> +    b = (b << 8) >> (8 - 2);    /* extract the displacement */
> +    ra += 8;                    /* branches are relative to pc+8 */
> +    ra += b;                    /* apply the displacement */
> +    ra -= 4;                    /* return a pointer into the current opcode,
> +                                   not the start of the next opcode  */
> +    return ra;
> +}
>  # else
>  #  error "CONFIG_QEMU_LDST_OPTIMIZATION needs GETPC_LDST() implementation!"
>  # endif
> diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
> index ff6dc90..c27e4ee 100644
> --- a/tcg/arm/tcg-target.c
> +++ b/tcg/arm/tcg-target.c
> @@ -415,6 +415,20 @@ static inline void tcg_out_dat_reg(TCGContext *s,
>                      (rn << 16) | (rd << 12) | shift | rm);
>  }
>  
> +static inline void tcg_out_nop(TCGContext *s)
> +{
> +    if (use_armv7_instructions) {
> +        /* Architected nop introduced in v6k.  */
> +        /* ??? This is an MSR (imm) 0,0,0 insn.  Anyone know if this
> +           also Just So Happened to do nothing on pre-v6k so that we
> +           don't need to conditionalize it?  */
> +        tcg_out32(s, 0xe320f000);
> +    } else {
> +        /* Prior to that the assembler uses mov r0, r0.  */
> +        tcg_out_dat_reg(s, COND_AL, ARITH_MOV, 0, 0, 0, SHIFT_IMM_LSL(0));
> +    }
> +}
> +
>  static inline void tcg_out_mov_reg(TCGContext *s, int cond, int rd, int rm)
>  {
>      /* Simple reg-reg move, optimising out the 'do nothing' case */
> @@ -979,27 +993,17 @@ static inline void tcg_out_st8(TCGContext *s, int cond,
>   */
>  static inline void tcg_out_goto(TCGContext *s, int cond, uint32_t addr)
>  {
> -    int32_t val;
> +    int32_t val = addr - (uint32_t)s->code_ptr;
>  
> -    if (addr & 1) {
> -        /* goto to a Thumb destination isn't supported */
> -        tcg_abort();
> -    }
> +    /* We don't support a goto with a thumb destination.  */
> +    assert((addr & 3) == 0);
>  
> -    val = addr - (tcg_target_long) s->code_ptr;
> -    if (val - 8 < 0x01fffffd && val - 8 > -0x01fffffd)
> -        tcg_out_b(s, cond, val);
> -    else {
> -        if (cond == COND_AL) {
> -            tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4);
> -            tcg_out32(s, addr);
> -        } else {
> -            tcg_out_movi32(s, cond, TCG_REG_TMP, val - 8);
> -            tcg_out_dat_reg(s, cond, ARITH_ADD,
> -                            TCG_REG_PC, TCG_REG_PC,
> -                            TCG_REG_TMP, SHIFT_IMM_LSL(0));
> -        }
> -    }
> +    /* The code buffer is limited to 16MB, with the prologue located
> +       at the end of it.  Therefore we needn't care for any out of
> +       range branches.  */
> +    assert(val - 8 < 0x01fffffd && val - 8 > -0x01fffffd);
> +
> +    tcg_out_b(s, cond, val);

While this is currently true, I am not sure we want to get rid of that
code, and I hope we'll be able to eventually get rid of the 16MB limit.

For me this dramatically reduce the boot time of guests. That said it is
not a real benchmark, and it should theoretically reduce the
performances in some cases as doing so interleaves code and data.
Someone has to spend time doing benchmarks before we can progress on that.

>  }
>  
>  /* The call case is mostly used for helpers - so it's not unreasonable
> @@ -1044,14 +1048,9 @@ static inline void tcg_out_goto_label(TCGContext *s, int cond, int label_index)
>  {
>      TCGLabel *l = &s->labels[label_index];
>  
> -    if (l->has_value)
> +    if (l->has_value) {
>          tcg_out_goto(s, cond, l->u.value);
> -    else if (cond == COND_AL) {
> -        tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4);
> -        tcg_out_reloc(s, s->code_ptr, R_ARM_ABS32, label_index, 31337);
> -        s->code_ptr += 4;
>      } else {
> -        /* Probably this should be preferred even for COND_AL... */
>          tcg_out_reloc(s, s->code_ptr, R_ARM_PC24, label_index, 31337);
>          tcg_out_b_noaddr(s, cond);
>      }
> @@ -1203,6 +1202,134 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
>                          TCG_REG_R1, addrhi, SHIFT_IMM_LSL(0));
>      }
>  }
> +
> +/* Record the context of a call to the out of line helper code for the slow
> +   path for a load or store, so that we can later generate the correct
> +   helper code.  */
> +static void add_qemu_ldst_label(TCGContext *s, int is_ld, int opc,
> +                                int data_reg, int data_reg2, int addrlo_reg,
> +                                int addrhi_reg, int mem_index,
> +                                uint8_t *raddr, uint8_t *label_ptr)
> +{
> +    int idx;
> +    TCGLabelQemuLdst *label;
> +
> +    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
> +        tcg_abort();
> +    }
> +
> +    idx = s->nb_qemu_ldst_labels++;
> +    label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
> +    label->is_ld = is_ld;
> +    label->opc = opc;
> +    label->datalo_reg = data_reg;
> +    label->datahi_reg = data_reg2;
> +    label->addrlo_reg = addrlo_reg;
> +    label->addrhi_reg = addrhi_reg;
> +    label->mem_index = mem_index;
> +    label->raddr = raddr;
> +    label->label_ptr[0] = label_ptr;
> +}
> +
> +static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
> +{
> +    TCGReg argreg, data_reg, data_reg2;
> +    uint8_t *start;
> +
> +    reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
> +
> +    argreg = tcg_out_arg_reg32(s, TCG_REG_R0, TCG_AREG0);
> +    if (TARGET_LONG_BITS == 64) {
> +        argreg = tcg_out_arg_reg64(s, argreg, lb->addrlo_reg, lb->addrhi_reg);
> +    } else {
> +        argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg);
> +    }
> +    argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index);
> +    tcg_out_call(s, (tcg_target_long) qemu_ld_helpers[lb->opc & 3]);
> +
> +    data_reg = lb->datalo_reg;
> +    data_reg2 = lb->datahi_reg;
> +
> +    start = s->code_ptr;
> +    switch (lb->opc) {
> +    case 0 | 4:
> +        tcg_out_ext8s(s, COND_AL, data_reg, TCG_REG_R0);
> +        break;
> +    case 1 | 4:
> +        tcg_out_ext16s(s, COND_AL, data_reg, TCG_REG_R0);
> +        break;
> +    case 0:
> +    case 1:
> +    case 2:
> +    default:
> +        tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
> +        break;
> +    case 3:
> +        tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
> +        tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1);
> +        break;
> +    }
> +
> +    /* For GETPC_LDST in exec-all.h, we architect exactly 2 insns between
> +       the call and the branch back to straight-line code.  Note that the
> +       moves above could be elided by register allocation, nor do we know
> +       which code alternative we chose for extension.  */
> +    switch (s->code_ptr - start) {
> +    case 0:
> +        tcg_out_nop(s);
> +        /* FALLTHRU */
> +    case 4:
> +        tcg_out_nop(s);
> +        /* FALLTHRU */
> +    case 8:
> +        break;
> +    default:
> +        abort();
> +    }
> +
> +    tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr);
> +}
> +
> +static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
> +{
> +    TCGReg argreg, data_reg, data_reg2;
> +
> +    reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
> +
> +    argreg = TCG_REG_R0;
> +    argreg = tcg_out_arg_reg32(s, argreg, TCG_AREG0);
> +    if (TARGET_LONG_BITS == 64) {
> +        argreg = tcg_out_arg_reg64(s, argreg, lb->addrlo_reg, lb->addrhi_reg);
> +    } else {
> +        argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg);
> +    }
> +
> +    data_reg = lb->datalo_reg;
> +    data_reg2 = lb->datahi_reg;
> +    switch (lb->opc) {
> +    case 0:
> +        argreg = tcg_out_arg_reg8(s, argreg, data_reg);
> +        break;
> +    case 1:
> +        argreg = tcg_out_arg_reg16(s, argreg, data_reg);
> +        break;
> +    case 2:
> +        argreg = tcg_out_arg_reg32(s, argreg, data_reg);
> +        break;
> +    case 3:
> +        argreg = tcg_out_arg_reg64(s, argreg, data_reg, data_reg2);
> +        break;
> +    }
> +
> +    argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index);
> +    tcg_out_call(s, (tcg_target_long) qemu_st_helpers[lb->opc & 3]);
> +
> +    /* For GETPC_LDST in exec-all.h, we architect exactly 2 insns between
> +       the call and the branch back to straight-line code.  */
> +    tcg_out_nop(s);
> +    tcg_out_nop(s);
> +    tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr);
> +}
>  #endif /* SOFTMMU */
>  
>  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
> @@ -1211,8 +1338,8 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>      bool bswap;
>  #ifdef CONFIG_SOFTMMU
>      int mem_index, s_bits;
> -    TCGReg argreg, addr_reg2;
> -    uint32_t *label_ptr;
> +    TCGReg addr_reg2;
> +    uint8_t *label_ptr;
>  #endif
>  #ifdef TARGET_WORDS_BIGENDIAN
>      bswap = 1;
> @@ -1231,89 +1358,56 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>      tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
>                       offsetof(CPUArchState, tlb_table[mem_index][0].addr_read));
>  
> -    tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R2,
> +    label_ptr = s->code_ptr;
> +    tcg_out_b_noaddr(s, COND_NE);
> +
> +    tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
>                      offsetof(CPUTLBEntry, addend)
>                      - offsetof(CPUTLBEntry, addr_read));
>  
>      switch (opc) {
>      case 0:
> -        tcg_out_ld8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
> +        tcg_out_ld8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
>          break;
>      case 0 | 4:
> -        tcg_out_ld8s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
> +        tcg_out_ld8s_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
>          break;
>      case 1:
> -        tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
> +        tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
>          if (bswap) {
> -            tcg_out_bswap16(s, COND_EQ, data_reg, data_reg);
> +            tcg_out_bswap16(s, COND_AL, data_reg, data_reg);
>          }
>          break;
>      case 1 | 4:
>          if (bswap) {
> -            tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
> -            tcg_out_bswap16s(s, COND_EQ, data_reg, data_reg);
> +            tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
> +            tcg_out_bswap16s(s, COND_AL, data_reg, data_reg);
>          } else {
> -            tcg_out_ld16s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
> +            tcg_out_ld16s_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
>          }
>          break;
>      case 2:
>      default:
> -        tcg_out_ld32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
> +        tcg_out_ld32_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
>          if (bswap) {
> -            tcg_out_bswap32(s, COND_EQ, data_reg, data_reg);
> +            tcg_out_bswap32(s, COND_AL, data_reg, data_reg);
>          }
>          break;
>      case 3:
>          if (bswap) {
> -            tcg_out_ld32_rwb(s, COND_EQ, data_reg2, TCG_REG_R1, addr_reg);
> -            tcg_out_ld32_12(s, COND_EQ, data_reg, TCG_REG_R1, 4);
> -            tcg_out_bswap32(s, COND_EQ, data_reg2, data_reg2);
> -            tcg_out_bswap32(s, COND_EQ, data_reg, data_reg);
> +            tcg_out_ld32_rwb(s, COND_AL, data_reg2, TCG_REG_R1, addr_reg);
> +            tcg_out_ld32_12(s, COND_AL, data_reg, TCG_REG_R1, 4);
> +            tcg_out_bswap32(s, COND_AL, data_reg2, data_reg2);
> +            tcg_out_bswap32(s, COND_AL, data_reg, data_reg);
>          } else {
> -            tcg_out_ld32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg);
> -            tcg_out_ld32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4);
> +            tcg_out_ld32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg);
> +            tcg_out_ld32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4);
>          }
>          break;
>      }
>  
> -    label_ptr = (void *) s->code_ptr;
> -    tcg_out_b_noaddr(s, COND_EQ);
> -
> -    /* TODO: move this code to where the constants pool will be */
> -    /* Note that this code relies on the constraints we set in arm_op_defs[]
> -     * to ensure that later arguments are not passed to us in registers we
> -     * trash by moving the earlier arguments into them.
> -     */
> -    argreg = TCG_REG_R0;
> -    argreg = tcg_out_arg_reg32(s, argreg, TCG_AREG0);
> -    if (TARGET_LONG_BITS == 64) {
> -        argreg = tcg_out_arg_reg64(s, argreg, addr_reg, addr_reg2);
> -    } else {
> -        argreg = tcg_out_arg_reg32(s, argreg, addr_reg);
> -    }
> -    argreg = tcg_out_arg_imm32(s, argreg, mem_index);
> -    tcg_out_call(s, (tcg_target_long) qemu_ld_helpers[s_bits]);
> -
> -    switch (opc) {
> -    case 0 | 4:
> -        tcg_out_ext8s(s, COND_AL, data_reg, TCG_REG_R0);
> -        break;
> -    case 1 | 4:
> -        tcg_out_ext16s(s, COND_AL, data_reg, TCG_REG_R0);
> -        break;
> -    case 0:
> -    case 1:
> -    case 2:
> -    default:
> -        tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
> -        break;
> -    case 3:
> -        tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
> -        tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1);
> -        break;
> -    }
> -
> -    reloc_pc24(label_ptr, (tcg_target_long)s->code_ptr);
> +    add_qemu_ldst_label(s, 1, opc, data_reg, data_reg2, addr_reg, addr_reg2,
> +                        mem_index, s->code_ptr, label_ptr);
>  #else /* !CONFIG_SOFTMMU */
>      if (GUEST_BASE) {
>          uint32_t offset = GUEST_BASE;
> @@ -1382,8 +1476,8 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
>      bool bswap;
>  #ifdef CONFIG_SOFTMMU
>      int mem_index, s_bits;
> -    TCGReg argreg, addr_reg2;
> -    uint32_t *label_ptr;
> +    TCGReg addr_reg2;
> +    uint8_t *label_ptr;
>  #endif
>  #ifdef TARGET_WORDS_BIGENDIAN
>      bswap = 1;
> @@ -1403,79 +1497,49 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
>                       offsetof(CPUArchState,
>                                tlb_table[mem_index][0].addr_write));
>  
> -    tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R2,
> +    label_ptr = s->code_ptr;
> +    tcg_out_b_noaddr(s, COND_NE);
> +
> +    tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
>                      offsetof(CPUTLBEntry, addend)
>                      - offsetof(CPUTLBEntry, addr_write));
>  
>      switch (opc) {
>      case 0:
> -        tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
> +        tcg_out_st8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
>          break;
>      case 1:
>          if (bswap) {
> -            tcg_out_bswap16st(s, COND_EQ, TCG_REG_R0, data_reg);
> -            tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1);
> +            tcg_out_bswap16st(s, COND_AL, TCG_REG_R0, data_reg);
> +            tcg_out_st16_r(s, COND_AL, TCG_REG_R0, addr_reg, TCG_REG_R1);
>          } else {
> -            tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
> +            tcg_out_st16_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
>          }
>          break;
>      case 2:
>      default:
>          if (bswap) {
> -            tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
> -            tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1);
> +            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg);
> +            tcg_out_st32_r(s, COND_AL, TCG_REG_R0, addr_reg, TCG_REG_R1);
>          } else {
> -            tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
> +            tcg_out_st32_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
>          }
>          break;
>      case 3:
>          if (bswap) {
> -            tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg2);
> -            tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, addr_reg);
> -            tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
> -            tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, 4);
> +            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg2);
> +            tcg_out_st32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R1, addr_reg);
> +            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg);
> +            tcg_out_st32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R1, 4);
>          } else {
> -            tcg_out_st32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg);
> -            tcg_out_st32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4);
> +            tcg_out_st32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg);
> +            tcg_out_st32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4);
>          }
>          break;
>      }
>  
> -    label_ptr = (void *) s->code_ptr;
> -    tcg_out_b_noaddr(s, COND_EQ);
> -
> -    /* TODO: move this code to where the constants pool will be */
> -    /* Note that this code relies on the constraints we set in arm_op_defs[]
> -     * to ensure that later arguments are not passed to us in registers we
> -     * trash by moving the earlier arguments into them.
> -     */
> -    argreg = TCG_REG_R0;
> -    argreg = tcg_out_arg_reg32(s, argreg, TCG_AREG0);
> -    if (TARGET_LONG_BITS == 64) {
> -        argreg = tcg_out_arg_reg64(s, argreg, addr_reg, addr_reg2);
> -    } else {
> -        argreg = tcg_out_arg_reg32(s, argreg, addr_reg);
> -    }
> -
> -    switch (opc) {
> -    case 0:
> -        argreg = tcg_out_arg_reg8(s, argreg, data_reg);
> -        break;
> -    case 1:
> -        argreg = tcg_out_arg_reg16(s, argreg, data_reg);
> -        break;
> -    case 2:
> -        argreg = tcg_out_arg_reg32(s, argreg, data_reg);
> -        break;
> -    case 3:
> -        argreg = tcg_out_arg_reg64(s, argreg, data_reg, data_reg2);
> -        break;
> -    }
> -
> -    argreg = tcg_out_arg_imm32(s, argreg, mem_index);
> -    tcg_out_call(s, (tcg_target_long) qemu_st_helpers[s_bits]);
> -
> -    reloc_pc24(label_ptr, (tcg_target_long)s->code_ptr);
> +    add_qemu_ldst_label(s, 0, opc, data_reg, data_reg2, addr_reg, addr_reg2,
> +                        mem_index, s->code_ptr, label_ptr);
>  #else /* !CONFIG_SOFTMMU */
>      if (GUEST_BASE) {
>          uint32_t offset = GUEST_BASE;
> @@ -1875,6 +1939,22 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>      }
>  }
>  
> +#ifdef CONFIG_SOFTMMU
> +/* Generate TB finalization at the end of block.  */
> +void tcg_out_tb_finalize(TCGContext *s)
> +{
> +    int i;
> +    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
> +        TCGLabelQemuLdst *label = &s->qemu_ldst_labels[i];
> +        if (label->is_ld) {
> +            tcg_out_qemu_ld_slow_path(s, label);
> +        } else {
> +            tcg_out_qemu_st_slow_path(s, label);
> +        }
> +    }
> +}
> +#endif /* SOFTMMU */
> +
>  static const TCGTargetOpDef arm_op_defs[] = {
>      { INDEX_op_exit_tb, { } },
>      { INDEX_op_goto_tb, { } },
> -- 
> 1.8.1.4
> 
> 
>
Richard Henderson - April 22, 2013, 2:39 p.m.
On 2013-04-22 13:59, Aurelien Jarno wrote:
>> >+    /* The code buffer is limited to 16MB, with the prologue located
>> >+       at the end of it.  Therefore we needn't care for any out of
>> >+       range branches.  */
>> >+    assert(val - 8 < 0x01fffffd && val - 8 > -0x01fffffd);
>> >+
>> >+    tcg_out_b(s, cond, val);
> While this is currently true, I am not sure we want to get rid of that
> code, and I hope we'll be able to eventually get rid of the 16MB limit.
>
> For me this dramatically reduce the boot time of guests. That said it is
> not a real benchmark, and it should theoretically reduce the
> performances in some cases as doing so interleaves code and data.
> Someone has to spend time doing benchmarks before we can progress on that.
>

Ok, sure, but how do we progress in the short term?
Certainly rejecting this patch and its changes to tcg_out_goto
is not compatible with approving  19/19, which relies on it.

IMO, tcg_out_goto should be the simple case of goto within a TB,
with only the code for INDEX_goto_tb needing to handle the >16MB case.


r~
Aurelien Jarno - April 23, 2013, 6:44 a.m.
On Mon, Apr 22, 2013 at 03:39:42PM +0100, Richard Henderson wrote:
> On 2013-04-22 13:59, Aurelien Jarno wrote:
> >>>+    /* The code buffer is limited to 16MB, with the prologue located
> >>>+       at the end of it.  Therefore we needn't care for any out of
> >>>+       range branches.  */
> >>>+    assert(val - 8 < 0x01fffffd && val - 8 > -0x01fffffd);
> >>>+
> >>>+    tcg_out_b(s, cond, val);
> >While this is currently true, I am not sure we want to get rid of that
> >code, and I hope we'll be able to eventually get rid of the 16MB limit.
> >
> >For me this dramatically reduce the boot time of guests. That said it is
> >not a real benchmark, and it should theoretically reduce the
> >performances in some cases as doing so interleaves code and data.
> >Someone has to spend time doing benchmarks before we can progress on that.
> >
> 
> Ok, sure, but how do we progress in the short term?
> Certainly rejecting this patch and its changes to tcg_out_goto
> is not compatible with approving  19/19, which relies on it.

Oh, I haven't seen realized that the comment about goto not using
multi-insns ops was referring to patch 18.

> IMO, tcg_out_goto should be the simple case of goto within a TB,
> with only the code for INDEX_goto_tb needing to handle the >16MB case.
> 

From what I have seen in the code, we need to do jumps in the following
conditions:
- jumps within TB
- jumps between slow/fast path in the ld/st code
- jump to the epilogue
- jump between TB

Currently the first three use tcg_goto_out, while only the third one
need to (eventually) jump above the 16MB limit.

The last one should also jump above the 16MB limit, but do not use
tcg_goto_out.

For jumps within TB tcg_out_goto_label can use tcg_out_b directly,
as tcg_out_b_noaddr is already used in the other pass, and the only
benefit of tcg_goto_out over tcg_out_b is the asserts, which can not
really been triggered here.

For jumps from the slow path to the fast path, I think tcg_out_b
can also be used, as anyway tcg_out_b_noaddr is already used from the
fast path to the slow path.

This leaves out tcg_goto_out used only for the jump to the epilogue, and
it can be modified later when someone works on the 16MB limit issue. Of
course that means that patch 19 has to be dropped, but I don't think
it's a big issue.
Richard Henderson - April 23, 2013, 8:13 a.m.
On 2013-04-23 07:44, Aurelien Jarno wrote:
> On Mon, Apr 22, 2013 at 03:39:42PM +0100, Richard Henderson wrote:
>> On 2013-04-22 13:59, Aurelien Jarno wrote:
>>>>> +    /* The code buffer is limited to 16MB, with the prologue located
>>>>> +       at the end of it.  Therefore we needn't care for any out of
>>>>> +       range branches.  */
>>>>> +    assert(val - 8 < 0x01fffffd && val - 8 > -0x01fffffd);
>>>>> +
>>>>> +    tcg_out_b(s, cond, val);
>>> While this is currently true, I am not sure we want to get rid of that
>>> code, and I hope we'll be able to eventually get rid of the 16MB limit.
>>>
>>> For me this dramatically reduce the boot time of guests. That said it is
>>> not a real benchmark, and it should theoretically reduce the
>>> performances in some cases as doing so interleaves code and data.
>>> Someone has to spend time doing benchmarks before we can progress on that.
>>>
>>
>> Ok, sure, but how do we progress in the short term?
>> Certainly rejecting this patch and its changes to tcg_out_goto
>> is not compatible with approving  19/19, which relies on it.
>
> Oh, I haven't seen realized that the comment about goto not using
> multi-insns ops was referring to patch 18.
>
>> IMO, tcg_out_goto should be the simple case of goto within a TB,
>> with only the code for INDEX_goto_tb needing to handle the >16MB case.
>>
>
>>From what I have seen in the code, we need to do jumps in the following
> conditions:
> - jumps within TB
> - jumps between slow/fast path in the ld/st code
> - jump to the epilogue
> - jump between TB
>
> Currently the first three use tcg_goto_out, while only the third one
> need to (eventually) jump above the 16MB limit.

The epilogue is exactly two insns.  When we eliminate the 16MB limit
we should just fold the add sp + pop into the exit_tb.  We can't do
better than that.

> The last one should also jump above the 16MB limit, but do not use
> tcg_goto_out.

Agreed.

> For jumps within TB tcg_out_goto_label can use tcg_out_b directly,
> as tcg_out_b_noaddr is already used in the other pass, and the only
> benefit of tcg_goto_out over tcg_out_b is the asserts, which can not
> really been triggered here.
>
> For jumps from the slow path to the fast path, I think tcg_out_b
> can also be used, as anyway tcg_out_b_noaddr is already used from the
> fast path to the slow path.
>
> This leaves out tcg_goto_out used only for the jump to the epilogue, and
> it can be modified later when someone works on the 16MB limit issue. Of
> course that means that patch 19 has to be dropped, but I don't think
> it's a big issue.

Ok, I'll drop patch 19 for now so that we can make progress.


r~
Aurelien Jarno - April 23, 2013, 8:18 a.m.
Le 23/04/2013 10:13, Richard Henderson a écrit :
> On 2013-04-23 07:44, Aurelien Jarno wrote:
>> On Mon, Apr 22, 2013 at 03:39:42PM +0100, Richard Henderson wrote:
>>> On 2013-04-22 13:59, Aurelien Jarno wrote:
>>>>>> +    /* The code buffer is limited to 16MB, with the prologue located
>>>>>> +       at the end of it.  Therefore we needn't care for any out of
>>>>>> +       range branches.  */
>>>>>> +    assert(val - 8 < 0x01fffffd && val - 8 > -0x01fffffd);
>>>>>> +
>>>>>> +    tcg_out_b(s, cond, val);
>>>> While this is currently true, I am not sure we want to get rid of that
>>>> code, and I hope we'll be able to eventually get rid of the 16MB limit.
>>>>
>>>> For me this dramatically reduce the boot time of guests. That said it is
>>>> not a real benchmark, and it should theoretically reduce the
>>>> performances in some cases as doing so interleaves code and data.
>>>> Someone has to spend time doing benchmarks before we can progress on that.
>>>>
>>>
>>> Ok, sure, but how do we progress in the short term?
>>> Certainly rejecting this patch and its changes to tcg_out_goto
>>> is not compatible with approving  19/19, which relies on it.
>>
>> Oh, I haven't seen realized that the comment about goto not using
>> multi-insns ops was referring to patch 18.
>>
>>> IMO, tcg_out_goto should be the simple case of goto within a TB,
>>> with only the code for INDEX_goto_tb needing to handle the >16MB case.
>>>
>>
>> >From what I have seen in the code, we need to do jumps in the following
>> conditions:
>> - jumps within TB
>> - jumps between slow/fast path in the ld/st code
>> - jump to the epilogue
>> - jump between TB
>>
>> Currently the first three use tcg_goto_out, while only the third one
>> need to (eventually) jump above the 16MB limit.
> 
> The epilogue is exactly two insns.  When we eliminate the 16MB limit
> we should just fold the add sp + pop into the exit_tb.  We can't do
> better than that.
> 
>> The last one should also jump above the 16MB limit, but do not use
>> tcg_goto_out.
> 
> Agreed.
> 
>> For jumps within TB tcg_out_goto_label can use tcg_out_b directly,
>> as tcg_out_b_noaddr is already used in the other pass, and the only
>> benefit of tcg_goto_out over tcg_out_b is the asserts, which can not
>> really been triggered here.
>>
>> For jumps from the slow path to the fast path, I think tcg_out_b
>> can also be used, as anyway tcg_out_b_noaddr is already used from the
>> fast path to the slow path.
>>
>> This leaves out tcg_goto_out used only for the jump to the epilogue, and
>> it can be modified later when someone works on the 16MB limit issue. Of
>> course that means that patch 19 has to be dropped, but I don't think
>> it's a big issue.
> 
> Ok, I'll drop patch 19 for now so that we can make progress.
> 

If you plan to respin the series, do you want I start applying the first
patches, at least the "Fix local stack frame" one that we want to see in
1.5 (and stable)?
Richard Henderson - April 23, 2013, 8:48 a.m.
On 2013-04-23 09:18, Aurelien Jarno wrote:
> If you plan to respin the series, do you want I start applying the first
> patches, at least the "Fix local stack frame" one that we want to see in
> 1.5 (and stable)?

I've already the added the alignment bits you requested for that one.
I'll post the rest of the respin today.


r~

Patch

diff --git a/configure b/configure
index f2af714..bae3132 100755
--- a/configure
+++ b/configure
@@ -4124,7 +4124,7 @@  upper() {
 }
 
 case "$cpu" in
-  i386|x86_64|ppc)
+  arm|i386|x86_64|ppc)
     # The TCG interpreter currently does not support ld/st optimization.
     if test "$tcg_interpreter" = "no" ; then
         echo "CONFIG_QEMU_LDST_OPTIMIZATION=y" >> $config_target_mak
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index e856191..88e87f0 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -338,6 +338,23 @@  extern uintptr_t tci_tb_ptr;
 # elif defined (_ARCH_PPC) && !defined (_ARCH_PPC64)
 #  define GETRA() ((uintptr_t)__builtin_return_address(0))
 #  define GETPC_LDST() ((uintptr_t) ((*(int32_t *)(GETRA() - 4)) - 1))
+# elif defined (__arm__)
+/* We define two insns between the return address and the branch back to
+   straight-line.  Find and decode that branch insn.  */
+#  define GETRA()       ((uintptr_t)__builtin_return_address(0))
+#  define GETPC_LDST()  tcg_getpc_ldst(GETRA())
+static inline uintptr_t tcg_getpc_ldst(uintptr_t ra)
+{
+    int32_t b;
+    ra += 8;                    /* skip the two insns */
+    b = *(int32_t *)ra;         /* load the branch insn */
+    b = (b << 8) >> (8 - 2);    /* extract the displacement */
+    ra += 8;                    /* branches are relative to pc+8 */
+    ra += b;                    /* apply the displacement */
+    ra -= 4;                    /* return a pointer into the current opcode,
+                                   not the start of the next opcode  */
+    return ra;
+}
 # else
 #  error "CONFIG_QEMU_LDST_OPTIMIZATION needs GETPC_LDST() implementation!"
 # endif
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index ff6dc90..c27e4ee 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -415,6 +415,20 @@  static inline void tcg_out_dat_reg(TCGContext *s,
                     (rn << 16) | (rd << 12) | shift | rm);
 }
 
+static inline void tcg_out_nop(TCGContext *s)
+{
+    if (use_armv7_instructions) {
+        /* Architected nop introduced in v6k.  */
+        /* ??? This is an MSR (imm) 0,0,0 insn.  Anyone know if this
+           also Just So Happened to do nothing on pre-v6k so that we
+           don't need to conditionalize it?  */
+        tcg_out32(s, 0xe320f000);
+    } else {
+        /* Prior to that the assembler uses mov r0, r0.  */
+        tcg_out_dat_reg(s, COND_AL, ARITH_MOV, 0, 0, 0, SHIFT_IMM_LSL(0));
+    }
+}
+
 static inline void tcg_out_mov_reg(TCGContext *s, int cond, int rd, int rm)
 {
     /* Simple reg-reg move, optimising out the 'do nothing' case */
@@ -979,27 +993,17 @@  static inline void tcg_out_st8(TCGContext *s, int cond,
  */
 static inline void tcg_out_goto(TCGContext *s, int cond, uint32_t addr)
 {
-    int32_t val;
+    int32_t val = addr - (uint32_t)s->code_ptr;
 
-    if (addr & 1) {
-        /* goto to a Thumb destination isn't supported */
-        tcg_abort();
-    }
+    /* We don't support a goto with a thumb destination.  */
+    assert((addr & 3) == 0);
 
-    val = addr - (tcg_target_long) s->code_ptr;
-    if (val - 8 < 0x01fffffd && val - 8 > -0x01fffffd)
-        tcg_out_b(s, cond, val);
-    else {
-        if (cond == COND_AL) {
-            tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4);
-            tcg_out32(s, addr);
-        } else {
-            tcg_out_movi32(s, cond, TCG_REG_TMP, val - 8);
-            tcg_out_dat_reg(s, cond, ARITH_ADD,
-                            TCG_REG_PC, TCG_REG_PC,
-                            TCG_REG_TMP, SHIFT_IMM_LSL(0));
-        }
-    }
+    /* The code buffer is limited to 16MB, with the prologue located
+       at the end of it.  Therefore we needn't care for any out of
+       range branches.  */
+    assert(val - 8 < 0x01fffffd && val - 8 > -0x01fffffd);
+
+    tcg_out_b(s, cond, val);
 }
 
 /* The call case is mostly used for helpers - so it's not unreasonable
@@ -1044,14 +1048,9 @@  static inline void tcg_out_goto_label(TCGContext *s, int cond, int label_index)
 {
     TCGLabel *l = &s->labels[label_index];
 
-    if (l->has_value)
+    if (l->has_value) {
         tcg_out_goto(s, cond, l->u.value);
-    else if (cond == COND_AL) {
-        tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4);
-        tcg_out_reloc(s, s->code_ptr, R_ARM_ABS32, label_index, 31337);
-        s->code_ptr += 4;
     } else {
-        /* Probably this should be preferred even for COND_AL... */
         tcg_out_reloc(s, s->code_ptr, R_ARM_PC24, label_index, 31337);
         tcg_out_b_noaddr(s, cond);
     }
@@ -1203,6 +1202,134 @@  static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
                         TCG_REG_R1, addrhi, SHIFT_IMM_LSL(0));
     }
 }
+
+/* Record the context of a call to the out of line helper code for the slow
+   path for a load or store, so that we can later generate the correct
+   helper code.  */
+static void add_qemu_ldst_label(TCGContext *s, int is_ld, int opc,
+                                int data_reg, int data_reg2, int addrlo_reg,
+                                int addrhi_reg, int mem_index,
+                                uint8_t *raddr, uint8_t *label_ptr)
+{
+    int idx;
+    TCGLabelQemuLdst *label;
+
+    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
+        tcg_abort();
+    }
+
+    idx = s->nb_qemu_ldst_labels++;
+    label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
+    label->is_ld = is_ld;
+    label->opc = opc;
+    label->datalo_reg = data_reg;
+    label->datahi_reg = data_reg2;
+    label->addrlo_reg = addrlo_reg;
+    label->addrhi_reg = addrhi_reg;
+    label->mem_index = mem_index;
+    label->raddr = raddr;
+    label->label_ptr[0] = label_ptr;
+}
+
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+{
+    TCGReg argreg, data_reg, data_reg2;
+    uint8_t *start;
+
+    reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
+
+    argreg = tcg_out_arg_reg32(s, TCG_REG_R0, TCG_AREG0);
+    if (TARGET_LONG_BITS == 64) {
+        argreg = tcg_out_arg_reg64(s, argreg, lb->addrlo_reg, lb->addrhi_reg);
+    } else {
+        argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg);
+    }
+    argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index);
+    tcg_out_call(s, (tcg_target_long) qemu_ld_helpers[lb->opc & 3]);
+
+    data_reg = lb->datalo_reg;
+    data_reg2 = lb->datahi_reg;
+
+    start = s->code_ptr;
+    switch (lb->opc) {
+    case 0 | 4:
+        tcg_out_ext8s(s, COND_AL, data_reg, TCG_REG_R0);
+        break;
+    case 1 | 4:
+        tcg_out_ext16s(s, COND_AL, data_reg, TCG_REG_R0);
+        break;
+    case 0:
+    case 1:
+    case 2:
+    default:
+        tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
+        break;
+    case 3:
+        tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
+        tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1);
+        break;
+    }
+
+    /* For GETPC_LDST in exec-all.h, we architect exactly 2 insns between
+       the call and the branch back to straight-line code.  Note that the
+       moves above could be elided by register allocation, nor do we know
+       which code alternative we chose for extension.  */
+    switch (s->code_ptr - start) {
+    case 0:
+        tcg_out_nop(s);
+        /* FALLTHRU */
+    case 4:
+        tcg_out_nop(s);
+        /* FALLTHRU */
+    case 8:
+        break;
+    default:
+        abort();
+    }
+
+    tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr);
+}
+
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+{
+    TCGReg argreg, data_reg, data_reg2;
+
+    reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
+
+    argreg = TCG_REG_R0;
+    argreg = tcg_out_arg_reg32(s, argreg, TCG_AREG0);
+    if (TARGET_LONG_BITS == 64) {
+        argreg = tcg_out_arg_reg64(s, argreg, lb->addrlo_reg, lb->addrhi_reg);
+    } else {
+        argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg);
+    }
+
+    data_reg = lb->datalo_reg;
+    data_reg2 = lb->datahi_reg;
+    switch (lb->opc) {
+    case 0:
+        argreg = tcg_out_arg_reg8(s, argreg, data_reg);
+        break;
+    case 1:
+        argreg = tcg_out_arg_reg16(s, argreg, data_reg);
+        break;
+    case 2:
+        argreg = tcg_out_arg_reg32(s, argreg, data_reg);
+        break;
+    case 3:
+        argreg = tcg_out_arg_reg64(s, argreg, data_reg, data_reg2);
+        break;
+    }
+
+    argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index);
+    tcg_out_call(s, (tcg_target_long) qemu_st_helpers[lb->opc & 3]);
+
+    /* For GETPC_LDST in exec-all.h, we architect exactly 2 insns between
+       the call and the branch back to straight-line code.  */
+    tcg_out_nop(s);
+    tcg_out_nop(s);
+    tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr);
+}
 #endif /* SOFTMMU */
 
 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
@@ -1211,8 +1338,8 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
     bool bswap;
 #ifdef CONFIG_SOFTMMU
     int mem_index, s_bits;
-    TCGReg argreg, addr_reg2;
-    uint32_t *label_ptr;
+    TCGReg addr_reg2;
+    uint8_t *label_ptr;
 #endif
 #ifdef TARGET_WORDS_BIGENDIAN
     bswap = 1;
@@ -1231,89 +1358,56 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
     tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
                      offsetof(CPUArchState, tlb_table[mem_index][0].addr_read));
 
-    tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R2,
+    label_ptr = s->code_ptr;
+    tcg_out_b_noaddr(s, COND_NE);
+
+    tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
                     offsetof(CPUTLBEntry, addend)
                     - offsetof(CPUTLBEntry, addr_read));
 
     switch (opc) {
     case 0:
-        tcg_out_ld8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
         break;
     case 0 | 4:
-        tcg_out_ld8s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld8s_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
         break;
     case 1:
-        tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
         if (bswap) {
-            tcg_out_bswap16(s, COND_EQ, data_reg, data_reg);
+            tcg_out_bswap16(s, COND_AL, data_reg, data_reg);
         }
         break;
     case 1 | 4:
         if (bswap) {
-            tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
-            tcg_out_bswap16s(s, COND_EQ, data_reg, data_reg);
+            tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_bswap16s(s, COND_AL, data_reg, data_reg);
         } else {
-            tcg_out_ld16s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_ld16s_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
         }
         break;
     case 2:
     default:
-        tcg_out_ld32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld32_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
         if (bswap) {
-            tcg_out_bswap32(s, COND_EQ, data_reg, data_reg);
+            tcg_out_bswap32(s, COND_AL, data_reg, data_reg);
         }
         break;
     case 3:
         if (bswap) {
-            tcg_out_ld32_rwb(s, COND_EQ, data_reg2, TCG_REG_R1, addr_reg);
-            tcg_out_ld32_12(s, COND_EQ, data_reg, TCG_REG_R1, 4);
-            tcg_out_bswap32(s, COND_EQ, data_reg2, data_reg2);
-            tcg_out_bswap32(s, COND_EQ, data_reg, data_reg);
+            tcg_out_ld32_rwb(s, COND_AL, data_reg2, TCG_REG_R1, addr_reg);
+            tcg_out_ld32_12(s, COND_AL, data_reg, TCG_REG_R1, 4);
+            tcg_out_bswap32(s, COND_AL, data_reg2, data_reg2);
+            tcg_out_bswap32(s, COND_AL, data_reg, data_reg);
         } else {
-            tcg_out_ld32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg);
-            tcg_out_ld32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4);
+            tcg_out_ld32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg);
+            tcg_out_ld32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4);
         }
         break;
     }
 
-    label_ptr = (void *) s->code_ptr;
-    tcg_out_b_noaddr(s, COND_EQ);
-
-    /* TODO: move this code to where the constants pool will be */
-    /* Note that this code relies on the constraints we set in arm_op_defs[]
-     * to ensure that later arguments are not passed to us in registers we
-     * trash by moving the earlier arguments into them.
-     */
-    argreg = TCG_REG_R0;
-    argreg = tcg_out_arg_reg32(s, argreg, TCG_AREG0);
-    if (TARGET_LONG_BITS == 64) {
-        argreg = tcg_out_arg_reg64(s, argreg, addr_reg, addr_reg2);
-    } else {
-        argreg = tcg_out_arg_reg32(s, argreg, addr_reg);
-    }
-    argreg = tcg_out_arg_imm32(s, argreg, mem_index);
-    tcg_out_call(s, (tcg_target_long) qemu_ld_helpers[s_bits]);
-
-    switch (opc) {
-    case 0 | 4:
-        tcg_out_ext8s(s, COND_AL, data_reg, TCG_REG_R0);
-        break;
-    case 1 | 4:
-        tcg_out_ext16s(s, COND_AL, data_reg, TCG_REG_R0);
-        break;
-    case 0:
-    case 1:
-    case 2:
-    default:
-        tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
-        break;
-    case 3:
-        tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
-        tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1);
-        break;
-    }
-
-    reloc_pc24(label_ptr, (tcg_target_long)s->code_ptr);
+    add_qemu_ldst_label(s, 1, opc, data_reg, data_reg2, addr_reg, addr_reg2,
+                        mem_index, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     if (GUEST_BASE) {
         uint32_t offset = GUEST_BASE;
@@ -1382,8 +1476,8 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
     bool bswap;
 #ifdef CONFIG_SOFTMMU
     int mem_index, s_bits;
-    TCGReg argreg, addr_reg2;
-    uint32_t *label_ptr;
+    TCGReg addr_reg2;
+    uint8_t *label_ptr;
 #endif
 #ifdef TARGET_WORDS_BIGENDIAN
     bswap = 1;
@@ -1403,79 +1497,49 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
                      offsetof(CPUArchState,
                               tlb_table[mem_index][0].addr_write));
 
-    tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R2,
+    label_ptr = s->code_ptr;
+    tcg_out_b_noaddr(s, COND_NE);
+
+    tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
                     offsetof(CPUTLBEntry, addend)
                     - offsetof(CPUTLBEntry, addr_write));
 
     switch (opc) {
     case 0:
-        tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_st8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
         break;
     case 1:
         if (bswap) {
-            tcg_out_bswap16st(s, COND_EQ, TCG_REG_R0, data_reg);
-            tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1);
+            tcg_out_bswap16st(s, COND_AL, TCG_REG_R0, data_reg);
+            tcg_out_st16_r(s, COND_AL, TCG_REG_R0, addr_reg, TCG_REG_R1);
         } else {
-            tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_st16_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
         }
         break;
     case 2:
     default:
         if (bswap) {
-            tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
-            tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1);
+            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg);
+            tcg_out_st32_r(s, COND_AL, TCG_REG_R0, addr_reg, TCG_REG_R1);
         } else {
-            tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_st32_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
         }
         break;
     case 3:
         if (bswap) {
-            tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg2);
-            tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, addr_reg);
-            tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
-            tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, 4);
+            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg2);
+            tcg_out_st32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R1, addr_reg);
+            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg);
+            tcg_out_st32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R1, 4);
         } else {
-            tcg_out_st32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg);
-            tcg_out_st32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4);
+            tcg_out_st32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg);
+            tcg_out_st32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4);
         }
         break;
     }
 
-    label_ptr = (void *) s->code_ptr;
-    tcg_out_b_noaddr(s, COND_EQ);
-
-    /* TODO: move this code to where the constants pool will be */
-    /* Note that this code relies on the constraints we set in arm_op_defs[]
-     * to ensure that later arguments are not passed to us in registers we
-     * trash by moving the earlier arguments into them.
-     */
-    argreg = TCG_REG_R0;
-    argreg = tcg_out_arg_reg32(s, argreg, TCG_AREG0);
-    if (TARGET_LONG_BITS == 64) {
-        argreg = tcg_out_arg_reg64(s, argreg, addr_reg, addr_reg2);
-    } else {
-        argreg = tcg_out_arg_reg32(s, argreg, addr_reg);
-    }
-
-    switch (opc) {
-    case 0:
-        argreg = tcg_out_arg_reg8(s, argreg, data_reg);
-        break;
-    case 1:
-        argreg = tcg_out_arg_reg16(s, argreg, data_reg);
-        break;
-    case 2:
-        argreg = tcg_out_arg_reg32(s, argreg, data_reg);
-        break;
-    case 3:
-        argreg = tcg_out_arg_reg64(s, argreg, data_reg, data_reg2);
-        break;
-    }
-
-    argreg = tcg_out_arg_imm32(s, argreg, mem_index);
-    tcg_out_call(s, (tcg_target_long) qemu_st_helpers[s_bits]);
-
-    reloc_pc24(label_ptr, (tcg_target_long)s->code_ptr);
+    add_qemu_ldst_label(s, 0, opc, data_reg, data_reg2, addr_reg, addr_reg2,
+                        mem_index, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
     if (GUEST_BASE) {
         uint32_t offset = GUEST_BASE;
@@ -1875,6 +1939,22 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     }
 }
 
+#ifdef CONFIG_SOFTMMU
+/* Generate TB finalization at the end of block.  */
+void tcg_out_tb_finalize(TCGContext *s)
+{
+    int i;
+    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
+        TCGLabelQemuLdst *label = &s->qemu_ldst_labels[i];
+        if (label->is_ld) {
+            tcg_out_qemu_ld_slow_path(s, label);
+        } else {
+            tcg_out_qemu_st_slow_path(s, label);
+        }
+    }
+}
+#endif /* SOFTMMU */
+
 static const TCGTargetOpDef arm_op_defs[] = {
     { INDEX_op_exit_tb, { } },
     { INDEX_op_goto_tb, { } },