Patchwork [RFC,v2,3/4] tcg: add optimized TCG qemu_ld/st generation

login
register
mail settings
Submitter YeongKyoon Lee
Date July 5, 2012, 1:23 p.m.
Message ID <1341494619-4714-4-git-send-email-yeongkyoon.lee@samsung.com>
Download mbox | patch
Permalink /patch/169168/
State New
Headers show

Comments

YeongKyoon Lee - July 5, 2012, 1:23 p.m.
Add optimized TCG qemu_ld/st generation which generates the code for TLB miss case handling at the end of TB after generating other IRs.

Signed-off-by: Yeongkyoon Lee <yeongkyoon.lee@samsung.com>
---
 tcg/i386/tcg-target.c |  328 +++++++++++++++++++++++++++++++++++++++++++++++++
 tcg/tcg.c             |   12 ++
 tcg/tcg.h             |   35 +++++
 3 files changed, 375 insertions(+), 0 deletions(-)
Peter Maydell - July 5, 2012, 2:04 p.m.
On 5 July 2012 14:23, Yeongkyoon Lee <yeongkyoon.lee@samsung.com> wrote:
> Add optimized TCG qemu_ld/st generation which generates the code for TLB miss case handling at the end of TB after generating other IRs.
>
> Signed-off-by: Yeongkyoon Lee <yeongkyoon.lee@samsung.com>
> ---
>  tcg/i386/tcg-target.c |  328 +++++++++++++++++++++++++++++++++++++++++++++++++
>  tcg/tcg.c             |   12 ++
>  tcg/tcg.h             |   35 +++++
>  3 files changed, 375 insertions(+), 0 deletions(-)
>
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index da17bba..3f2f640 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -984,6 +984,8 @@ static const void *qemu_st_helpers[4] = {
>      helper_stq_mmu,
>  };
>  #else
> +
> +#ifndef CONFIG_QEMU_LDST_OPTIMIZATION
>  /* legacy helper signature: __ld_mmu(target_ulong addr, int
>     mmu_idx) */
>  static void *qemu_ld_helpers[4] = {
> @@ -1001,6 +1003,35 @@ static void *qemu_st_helpers[4] = {
>      __stl_mmu,
>      __stq_mmu,
>  };
> +#else

Is it really worth having this as a CONFIG_ switch? If we think
it's better to do this out of line we should just switch to
always generating the out of line code, I think. There's not much
point in retaining the old code path if it's disabled -- it will
just bitrot.

> +/* extended legacy helper signature: __ext_ld_mmu(target_ulong addr, int
> +   mmu_idx, uintptr raddr) */
> +static void *qemu_ld_helpers[4] = {
> +    __ext_ldb_mmu,
> +    __ext_ldw_mmu,
> +    __ext_ldl_mmu,
> +    __ext_ldq_mmu,
> +};
> +
> +/* extended legacy helper signature: __ext_st_mmu(target_ulong addr, uintxx_t val,
> +   int mmu_idx) */
> +static void *qemu_st_helpers[4] = {
> +    __ext_stb_mmu,
> +    __ext_stw_mmu,
> +    __ext_stl_mmu,
> +    __ext_stq_mmu,
> +};
> +
> +static void add_qemu_ldst_label(TCGContext *s,
> +                                int opc_ext,
> +                                int data_reg,
> +                                int data_reg2,
> +                                int addrlo_reg,
> +                                int addrhi_reg,
> +                                int mem_index,
> +                                uint8_t *raddr,
> +                                uint8_t **label_ptr);
> +#endif  /* !CONFIG_QEMU_LDST_OPTIMIZATION */
>  #endif
>
>  /* Perform the TLB load and compare.
> @@ -1061,19 +1092,36 @@ static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,
>
>      tcg_out_mov(s, type, r0, addrlo);
>
> +#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
> +    /* jne slow_path */
> +    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
> +    if (!label_ptr) {
> +        tcg_abort();
> +    }

There's no point in this check and abort -- label_ptr will always be
non-NULL (it would be an internal error if it wasn't), and if it is
by some future bug NULL, we'll just crash on the next line, which is
just as good. The existing code didn't feel the need to make this
check, we don't need to do it in the new code.

> +    label_ptr[0] = s->code_ptr;
> +    s->code_ptr += 4;
> +#else
>      /* jne label1 */
>      tcg_out8(s, OPC_JCC_short + JCC_JNE);
>      label_ptr[0] = s->code_ptr;
>      s->code_ptr++;
> +#endif
>
>      if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
>          /* cmp 4(r1), addrhi */
>          tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4);
>
> +#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
> +        /* jne slow_path */
> +        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
> +        label_ptr[1] = s->code_ptr;
> +        s->code_ptr += 4;
> +#else
>          /* jne label1 */
>          tcg_out8(s, OPC_JCC_short + JCC_JNE);
>          label_ptr[1] = s->code_ptr;
>          s->code_ptr++;
> +#endif
>      }
>
>      /* TLB Hit.  */
> @@ -1171,11 +1219,13 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
>      int addrlo_idx;
>  #if defined(CONFIG_SOFTMMU)
>      int mem_index, s_bits;
> +#if !defined(CONFIG_QEMU_LDST_OPTIMIZATION)
>  #if TCG_TARGET_REG_BITS == 64
>      int arg_idx;
>  #else
>      int stack_adjust;
>  #endif
> +#endif  /* !CONFIG_QEMU_LDST_OPTIMIZATION */
>      uint8_t *label_ptr[3];
>  #endif
>
> @@ -1197,6 +1247,18 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
>      tcg_out_qemu_ld_direct(s, data_reg, data_reg2,
>                             tcg_target_call_iarg_regs[0], 0, opc);
>
> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
> +    /* helper stub will be jumped back here */

"will jump back here".

> +    add_qemu_ldst_label(s,
> +                        opc,
> +                        data_reg,
> +                        data_reg2,
> +                        args[addrlo_idx],
> +                        args[addrlo_idx + 1],
> +                        mem_index,
> +                        s->code_ptr,
> +                        label_ptr);
> +#else
>      /* jmp label2 */
>      tcg_out8(s, OPC_JMP_short);
>      label_ptr[2] = s->code_ptr;
> @@ -1292,6 +1354,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
>
>      /* label2: */
>      *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
> +#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION */
>  #else
>      {
>          int32_t offset = GUEST_BASE;
> @@ -1385,7 +1448,9 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
>      int addrlo_idx;
>  #if defined(CONFIG_SOFTMMU)
>      int mem_index, s_bits;
> +#if !defined(CONFIG_QEMU_LDST_OPTIMIZATION)
>      int stack_adjust;
> +#endif
>      uint8_t *label_ptr[3];
>  #endif
>
> @@ -1407,6 +1472,18 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
>      tcg_out_qemu_st_direct(s, data_reg, data_reg2,
>                             tcg_target_call_iarg_regs[0], 0, opc);
>
> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
> +    /* helper stub will be jumped back here */

ditto.

> +    add_qemu_ldst_label(s,
> +                        opc | HL_ST_MASK,
> +                        data_reg,
> +                        data_reg2,
> +                        args[addrlo_idx],
> +                        args[addrlo_idx + 1],
> +                        mem_index,
> +                        s->code_ptr,
> +                        label_ptr);
> +#else
>      /* jmp label2 */
>      tcg_out8(s, OPC_JMP_short);
>      label_ptr[2] = s->code_ptr;
> @@ -1469,6 +1546,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
>
>      /* label2: */
>      *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
> +#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION */
>  #else
>      {
>          int32_t offset = GUEST_BASE;
> @@ -1496,6 +1574,256 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
>  #endif
>  }
>
> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
> +/* optimization to reduce jump overheads for qemu_ld/st IRs */
> +
> +/*
> + * qemu_ld/st code generator call add_qemu_ldst_label,
> + * so that slow case(TLB miss or I/O rw) is handled at the end of TB
> + */

This comment isn't really describing the purpose of this function,
which is something more along the lines of "Record the context of
a call to the out of line helper code for the slow path for a
load or store, so that we can later generate the correct helper
code".

> +static void add_qemu_ldst_label(TCGContext *s,
> +                                int opc_ext,
> +                                int data_reg,
> +                                int data_reg2,
> +                                int addrlo_reg,
> +                                int addrhi_reg,
> +                                int mem_index,
> +                                uint8_t *raddr,
> +                                uint8_t **label_ptr)
> +{
> +    int idx;
> +    TCGLabelQemuLdst *label;
> +
> +    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST)
> +        tcg_abort();

QEMU coding style requires braces. Please use checkpatch.pl.

> +
> +    idx = s->nb_qemu_ldst_labels++;
> +    label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
> +    label->opc_ext = opc_ext;
> +    label->datalo_reg = data_reg;
> +    label->datahi_reg = data_reg2;
> +    label->addrlo_reg = addrlo_reg;
> +    label->addrhi_reg = addrhi_reg;
> +    label->mem_index = mem_index;
> +    label->raddr = raddr;
> +    if (!label_ptr) {
> +        tcg_abort();
> +    }

Another pointless abort.

> +    label->label_ptr[0] = label_ptr[0];
> +    label->label_ptr[1] = label_ptr[1];
> +}
> +
> +/* generates slow case of qemu_ld at the end of TB */
> +static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
> +{
> +    int s_bits;
> +    int opc = label->opc_ext & HL_OPC_MASK;
> +    int mem_index = label->mem_index;
> +#if TCG_TARGET_REG_BITS == 64
> +    int arg_idx;
> +#else
> +    int stack_adjust;
> +    int addrlo_reg = label->addrlo_reg;
> +    int addrhi_reg = label->addrhi_reg;
> +#endif
> +    int data_reg = label->datalo_reg;
> +    int data_reg2 = label->datahi_reg;
> +    uint8_t *raddr = label->raddr;
> +    uint8_t **label_ptr = &label->label_ptr[0];
> +
> +    s_bits = opc & 3;
> +
> +    /* resolve label address */
> +    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> +        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
> +    }
> +
> +    /* extended helper signature: __ext_ld_mmu(target_ulong addr, int mmu_idx,
> +       uintptr_t raddr) */
> +#if TCG_TARGET_REG_BITS == 32
> +    tcg_out_pushi(s, (uintptr_t)(raddr - 1)); /* return address */
> +    stack_adjust = 4;
> +    tcg_out_pushi(s, mem_index);        /* mmu index */
> +    stack_adjust += 4;
> +    if (TARGET_LONG_BITS == 64) {
> +        tcg_out_push(s, addrhi_reg);
> +        stack_adjust += 4;
> +    }
> +    tcg_out_push(s, addrlo_reg); /* guest addr */
> +    stack_adjust += 4;
> +#ifdef CONFIG_TCG_PASS_AREG0
> +    tcg_out_push(s, TCG_AREG0);
> +    stack_adjust += 4;
> +#endif
> +#else
> +    /* The first argument is already loaded with addrlo.  */
> +    arg_idx = 1;
> +    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],
> +                 mem_index);
> +    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],
> +                 (uintptr_t)(raddr - 1));
> +#ifdef CONFIG_TCG_PASS_AREG0
> +    /* XXX/FIXME: suboptimal */
> +    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
> +                tcg_target_call_iarg_regs[2]);
> +    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
> +                tcg_target_call_iarg_regs[1]);
> +    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
> +                tcg_target_call_iarg_regs[0]);
> +    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
> +                TCG_AREG0);
> +#endif
> +#endif
> +
> +    tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
> +
> +#if TCG_TARGET_REG_BITS == 32
> +    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
> +        /* Pop and discard.  This is 2 bytes smaller than the add.  */
> +        tcg_out_pop(s, TCG_REG_ECX);
> +    } else if (stack_adjust != 0) {
> +        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
> +    }
> +#endif
> +
> +    switch(opc) {
> +    case 0 | 4:
> +        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
> +        break;
> +    case 1 | 4:
> +        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
> +        break;
> +    case 0:
> +        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
> +        break;
> +    case 1:
> +        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
> +        break;
> +    case 2:
> +        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
> +        break;
> +#if TCG_TARGET_REG_BITS == 64
> +    case 2 | 4:
> +        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
> +        break;
> +#endif
> +    case 3:
> +        if (TCG_TARGET_REG_BITS == 64) {
> +            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
> +        } else if (data_reg == TCG_REG_EDX) {
> +            /* xchg %edx, %eax */
> +            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
> +            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
> +        } else {
> +            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
> +            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
> +        }
> +        break;
> +    default:
> +        tcg_abort();
> +    }
> +
> +    /* jump back to original code */
> +    tcg_out_jmp(s, (tcg_target_long) raddr);
> +}
> +
> +/* generates slow case of qemu_st at the end of TB */
> +static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
> +{
> +    int s_bits;
> +    int stack_adjust;
> +    int opc = label->opc_ext & HL_OPC_MASK;
> +    int mem_index = label->mem_index;
> +    int data_reg = label->datalo_reg;
> +#if TCG_TARGET_REG_BITS == 32
> +    int data_reg2 = label->datahi_reg;
> +    int addrlo_reg = label->addrlo_reg;
> +    int addrhi_reg = label->addrhi_reg;
> +#endif
> +    uint8_t *raddr = label->raddr;
> +    uint8_t **label_ptr = &label->label_ptr[0];
> +
> +    s_bits = opc & 3;
> +
> +    /* resolve label address */
> +    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> +        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
> +    }
> +
> +    /* extended helper signature: __ext_st_mmu(target_ulong addr, uintxx_t val,
> +       int mmu_idx, uintptr_t raddr) */
> +#if TCG_TARGET_REG_BITS == 32
> +    tcg_out_pushi(s, (uintptr_t)(raddr - 1)); /* return address */
> +    stack_adjust = 4;
> +    tcg_out_pushi(s, mem_index); /* mmu index */
> +    stack_adjust += 4;
> +    if (opc == 3) {
> +        tcg_out_push(s, data_reg2);
> +        stack_adjust += 4;
> +    }
> +    tcg_out_push(s, data_reg);   /* guest data */
> +    stack_adjust += 4;
> +    if (TARGET_LONG_BITS == 64) {
> +        tcg_out_push(s, addrhi_reg);
> +        stack_adjust += 4;
> +    }
> +    tcg_out_push(s, addrlo_reg); /* guest addr */
> +    stack_adjust += 4;
> +#ifdef CONFIG_TCG_PASS_AREG0
> +    tcg_out_push(s, TCG_AREG0);
> +    stack_adjust += 4;
> +#endif
> +#else
> +    tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
> +                tcg_target_call_iarg_regs[1], data_reg);
> +    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);
> +    tcg_out_movi(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3], (uintptr_t)(raddr - 1));
> +    stack_adjust = 0;
> +#ifdef CONFIG_TCG_PASS_AREG0
> +    /* XXX/FIXME: suboptimal */
> +    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
> +                tcg_target_call_iarg_regs[2]);
> +    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
> +                tcg_target_call_iarg_regs[1]);
> +    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
> +                tcg_target_call_iarg_regs[0]);
> +    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
> +                TCG_AREG0);
> +#endif
> +#endif
> +
> +    tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);
> +
> +    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
> +        /* Pop and discard.  This is 2 bytes smaller than the add.  */
> +        tcg_out_pop(s, TCG_REG_ECX);
> +    } else if (stack_adjust != 0) {
> +        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
> +    }
> +
> +    /* jump back to original code */
> +    tcg_out_jmp(s, (tcg_target_long) raddr);
> +}
> +
> +/* generates all of the slow cases of qemu_ld/st at the end of TB */
> +void tcg_out_qemu_ldst_slow_path(TCGContext *s)
> +{
> +    int i;
> +    TCGLabelQemuLdst *label;
> +
> +    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
> +        label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[i];
> +        if (IS_QEMU_LD_LABEL(label)) {
> +            tcg_out_qemu_ld_slow_path(s, label);
> +        } else {
> +            tcg_out_qemu_st_slow_path(s, label);
> +        }
> +    }
> +}
> +#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION */
> +
>  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>                                const TCGArg *args, const int *const_args)
>  {
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index 8386b70..8009069 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -301,6 +301,14 @@ void tcg_func_start(TCGContext *s)
>
>      gen_opc_ptr = gen_opc_buf;
>      gen_opparam_ptr = gen_opparam_buf;
> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
> +    /* initialize qemu_ld/st labels which help to generate TLB miss case codes at the end of TB */
> +    s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) * TCG_MAX_QEMU_LDST);
> +    if (!s->qemu_ldst_labels) {
> +        tcg_abort();
> +    }

Unnecessary check -- tcg_malloc() can never return 0.

> +    s->nb_qemu_ldst_labels = 0;
> +#endif
>  }
>
>  static inline void tcg_temp_alloc(TCGContext *s, int n)
> @@ -2169,6 +2177,10 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
>  #endif
>      }
>   the_end:
> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
> +    /* Generate MMU call helpers at the end of block (currently only for qemu_ld/st) */
> +    tcg_out_qemu_ldst_slow_path(s);
> +#endif
>      return -1;
>  }
>
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index d710694..b174cdb 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -187,6 +187,29 @@ typedef tcg_target_ulong TCGArg;
>     are aliases for target_ulong and host pointer sized values respectively.
>   */
>
> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
> +/* Macros and structures for qemu_ld/st IR code optimization:
> +   It looks good for TCG_MAX_HELPER_LABELS to be half of OPC_BUF_SIZE in exec-all.h. */
> +#define TCG_MAX_QEMU_LDST       320

Is that true even if you have a huge block with nothing but simple
guest load instructions in it?

> +#define HL_LDST_SHIFT           4
> +#define HL_LDST_MASK            (1 << HL_LDST_SHIFT)
> +#define HL_ST_MASK              HL_LDST_MASK
> +#define HL_OPC_MASK             (HL_LDST_MASK - 1)
> +#define IS_QEMU_LD_LABEL(L)     (!((L)->opc_ext & HL_LDST_MASK))
> +#define IS_QEMU_ST_LABEL(L)     ((L)->opc_ext & HL_LDST_MASK)
> +
> +typedef struct TCGLabelQemuLdst {
> +    int opc_ext;                /* | 27bit (reserved) | 1bit (ld/st flag) | 4bit (opc) | */
> +    int addrlo_reg;             /* reg index for the low word of guest virtual address */
> +    int addrhi_reg;             /* reg index for the high word of guest virtual address */
> +    int datalo_reg;             /* reg index for the low word to be loaded or to be stored */
> +    int datahi_reg;             /* reg index for the high word to be loaded or to be stored */
> +    int mem_index;              /* soft MMU memory index */
> +    uint8_t *raddr;             /* return address (located end of TB) */
> +    uint8_t *label_ptr[2];      /* label pointers to be updated */
> +} TCGLabelQemuLdst;
> +#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION */
> +
>  #ifdef CONFIG_DEBUG_TCG
>  #define DEBUG_TCGV 1
>  #endif
> @@ -389,6 +412,13 @@ struct TCGContext {
>  #ifdef CONFIG_DEBUG_TCG
>      int temps_in_use;
>  #endif
> +
> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
> +    /* labels info for qemu_ld/st IRs
> +       The labels help to generate TLB miss case codes at the end of TB */
> +    TCGLabelQemuLdst *qemu_ldst_labels;
> +    int nb_qemu_ldst_labels;
> +#endif
>  };
>
>  extern TCGContext tcg_ctx;
> @@ -588,3 +618,8 @@ extern uint8_t code_gen_prologue[];
>  #endif
>
>  void tcg_register_jit(void *buf, size_t buf_size);
> +
> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
> +/* qemu_ld/st generation at the end of TB */
> +void tcg_out_qemu_ldst_slow_path(TCGContext *s);
> +#endif
> --
> 1.7.4.1
>

-- PMM
YeongKyoon Lee - July 6, 2012, 11:20 a.m.
> Is it really worth having this as a CONFIG_ switch? If we think
> it's better to do this out of line we should just switch to
> always generating the out of line code, I think. There's not much
> point in retaining the old code path if it's disabled -- it will
> just bitrot.

I agree.
However, it is just a safeguard because I have not tested all of the QEMU
targets.
I've only tested x86 and ARM targets on x86 and x86-64 hosts.
If we agree to remove the conditional macro, I'll fix it.


>> +#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
>> +    /* jne slow_path */
>> +    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
>> +    if (!label_ptr) {
>> +        tcg_abort();
>> +    }
> There's no point in this check and abort -- label_ptr will always be 
> non-NULL (it would be an internal error if it wasn't), and if it is by 
> some future bug NULL, we'll just crash on the next line, which is just 
> as good. The existing code didn't feel the need to make this check, we 
> don't need to do it in the new code. 

It cannot happen now, as you said. It is just a guard against a possible future bug.
But I do not understand what you meant by "we'll just crash on the next line"
above.

>> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
>> +    /* helper stub will be jumped back here */
> "will jump back here".

Ok.

>
>> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
>> +    /* helper stub will be jumped back here */
> ditto.

Ok.

>> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
>> +/* optimization to reduce jump overheads for qemu_ld/st IRs */
>> +
>> +/*
>> + * qemu_ld/st code generator call add_qemu_ldst_label,
>> + * so that slow case(TLB miss or I/O rw) is handled at the end of TB
>> + */
> This comment isn't really describing the purpose of this function,
> which is something more along the lines of "Record the context of
> a call to the out of line helper code for the slow path for a
> load or store, so that we can later generate the correct helper
> code".

I agree. Your description looks better.

>
>> +    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST)
>> +        tcg_abort();
> QEMU coding style requires braces. Please use checkpatch.pl.

Ok.

>
>> +
>> +    idx = s->nb_qemu_ldst_labels++;
>> +    label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
>> +    label->opc_ext = opc_ext;
>> +    label->datalo_reg = data_reg;
>> +    label->datahi_reg = data_reg2;
>> +    label->addrlo_reg = addrlo_reg;
>> +    label->addrhi_reg = addrhi_reg;
>> +    label->mem_index = mem_index;
>> +    label->raddr = raddr;
>> +    if (!label_ptr) {
>> +        tcg_abort();
>> +    }
> Another pointless abort.

ditto.

>
>> diff --git a/tcg/tcg.c b/tcg/tcg.c
>> index 8386b70..8009069 100644
>> --- a/tcg/tcg.c
>> +++ b/tcg/tcg.c
>> @@ -301,6 +301,14 @@ void tcg_func_start(TCGContext *s)
>>
>>       gen_opc_ptr = gen_opc_buf;
>>       gen_opparam_ptr = gen_opparam_buf;
>> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
>> +    /* initialize qemu_ld/st labels which help to generate TLB miss case codes at the end of TB */
>> +    s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) * TCG_MAX_QEMU_LDST);
>> +    if (!s->qemu_ldst_labels) {
>> +        tcg_abort();
>> +    }
> Unnecessary check -- tcg_malloc() can never return 0.

Ok.

>> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
>> +/* Macros and structures for qemu_ld/st IR code optimization:
>> +   It looks good for TCG_MAX_HELPER_LABELS to be half of OPC_BUF_SIZE in exec-all.h. */
>> +#define TCG_MAX_QEMU_LDST       320
> Is that true even if you have a huge block with nothing but simple
> guest load instructions in it?

I agree. It needs to be set to the same size as OPC_BUF_SIZE to cover
extreme cases.
Peter Maydell - July 6, 2012, 11:28 a.m.
On 6 July 2012 12:20, Yeongkyoon Lee <yeongkyoon.lee@samsung.com> wrote:
>
>>> +#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
>>> +    /* jne slow_path */
>>> +    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
>>> +    if (!label_ptr) {
>>> +        tcg_abort();
>>> +    }
>>
>> There's no point in this check and abort -- label_ptr will always be
>> non-NULL (it would be an internal error if it wasn't), and if it is by some
>> future bug NULL, we'll just crash on the next line, which is just as good.
>> The existing code didn't feel the need to make this check, we don't need to
>> do it in the new code.
>
> It cannot be happened now as you said. It is just for a possible future bug.
> But I cannot understand "we'll just crash on the next line" you mentioned
> above.

If the check was not present and label_ptr was somehow NULL, then
attempting to execute "label_ptr[0] = s->code_ptr;" will crash.
This is just as helpful for debugging purposes as an abort.
It's sometimes worth having sanity-checking assertions when the
code would otherwise proceed for a long time doing something wrong
but not crashing, because the assert means that you get an early
indication of failure near the point of failure. However the check
you have here is delaying the failure by exactly one line, which is
not useful.

>>> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
>>> +/* Macros and structures for qemu_ld/st IR code optimization:
>>> +   It looks good for TCG_MAX_HELPER_LABELS to be half of OPC_BUF_SIZE in
>>> exec-all.h. */
>>> +#define TCG_MAX_QEMU_LDST       320
>>
>> Is that true even if you have a huge block with nothing but simple
>> guest load instructions in it?
>
> I agree. It needs to be set as same size with OPC_BUF_SIZE for covering
> extreme cases.

The point here, incidentally, is that guest code should never be able
to make qemu crash or abort, so any fixed sized buffer has to be able
to handle the worst case.

-- PMM

Patch

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index da17bba..3f2f640 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -984,6 +984,8 @@  static const void *qemu_st_helpers[4] = {
     helper_stq_mmu,
 };
 #else
+
+#ifndef CONFIG_QEMU_LDST_OPTIMIZATION
 /* legacy helper signature: __ld_mmu(target_ulong addr, int
    mmu_idx) */
 static void *qemu_ld_helpers[4] = {
@@ -1001,6 +1003,35 @@  static void *qemu_st_helpers[4] = {
     __stl_mmu,
     __stq_mmu,
 };
+#else
+/* extended legacy helper signature: __ext_ld_mmu(target_ulong addr, int
+   mmu_idx, uintptr raddr) */
+static void *qemu_ld_helpers[4] = {
+    __ext_ldb_mmu,
+    __ext_ldw_mmu,
+    __ext_ldl_mmu,
+    __ext_ldq_mmu,
+};
+
+/* extended legacy helper signature: __ext_st_mmu(target_ulong addr, uintxx_t val,
+   int mmu_idx) */
+static void *qemu_st_helpers[4] = {
+    __ext_stb_mmu,
+    __ext_stw_mmu,
+    __ext_stl_mmu,
+    __ext_stq_mmu,
+};
+
+static void add_qemu_ldst_label(TCGContext *s,
+                                int opc_ext,
+                                int data_reg,
+                                int data_reg2,
+                                int addrlo_reg,
+                                int addrhi_reg,
+                                int mem_index,
+                                uint8_t *raddr,
+                                uint8_t **label_ptr);
+#endif  /* !CONFIG_QEMU_LDST_OPTIMIZATION */
 #endif
 
 /* Perform the TLB load and compare.
@@ -1061,19 +1092,36 @@  static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,
 
     tcg_out_mov(s, type, r0, addrlo);
 
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+    /* jne slow_path */
+    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
+    if (!label_ptr) {
+        tcg_abort();
+    }
+    label_ptr[0] = s->code_ptr;
+    s->code_ptr += 4;
+#else
     /* jne label1 */
     tcg_out8(s, OPC_JCC_short + JCC_JNE);
     label_ptr[0] = s->code_ptr;
     s->code_ptr++;
+#endif
 
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
         /* cmp 4(r1), addrhi */
         tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4);
 
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+        /* jne slow_path */
+        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
+        label_ptr[1] = s->code_ptr;
+        s->code_ptr += 4;
+#else
         /* jne label1 */
         tcg_out8(s, OPC_JCC_short + JCC_JNE);
         label_ptr[1] = s->code_ptr;
         s->code_ptr++;
+#endif
     }
 
     /* TLB Hit.  */
@@ -1171,11 +1219,13 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     int addrlo_idx;
 #if defined(CONFIG_SOFTMMU)
     int mem_index, s_bits;
+#if !defined(CONFIG_QEMU_LDST_OPTIMIZATION)
 #if TCG_TARGET_REG_BITS == 64
     int arg_idx;
 #else
     int stack_adjust;
 #endif
+#endif  /* !CONFIG_QEMU_LDST_OPTIMIZATION */
     uint8_t *label_ptr[3];
 #endif
 
@@ -1197,6 +1247,18 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     tcg_out_qemu_ld_direct(s, data_reg, data_reg2,
                            tcg_target_call_iarg_regs[0], 0, opc);
 
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+    /* helper stub will be jumped back here */
+    add_qemu_ldst_label(s,
+                        opc,
+                        data_reg,
+                        data_reg2,
+                        args[addrlo_idx],
+                        args[addrlo_idx + 1],
+                        mem_index,
+                        s->code_ptr,
+                        label_ptr);
+#else
     /* jmp label2 */
     tcg_out8(s, OPC_JMP_short);
     label_ptr[2] = s->code_ptr;
@@ -1292,6 +1354,7 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
 
     /* label2: */
     *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
+#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION */
 #else
     {
         int32_t offset = GUEST_BASE;
@@ -1385,7 +1448,9 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     int addrlo_idx;
 #if defined(CONFIG_SOFTMMU)
     int mem_index, s_bits;
+#if !defined(CONFIG_QEMU_LDST_OPTIMIZATION)
     int stack_adjust;
+#endif
     uint8_t *label_ptr[3];
 #endif
 
@@ -1407,6 +1472,18 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     tcg_out_qemu_st_direct(s, data_reg, data_reg2,
                            tcg_target_call_iarg_regs[0], 0, opc);
 
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+    /* helper stub will be jumped back here */
+    add_qemu_ldst_label(s,
+                        opc | HL_ST_MASK,
+                        data_reg,
+                        data_reg2,
+                        args[addrlo_idx],
+                        args[addrlo_idx + 1],
+                        mem_index,
+                        s->code_ptr,
+                        label_ptr);
+#else
     /* jmp label2 */
     tcg_out8(s, OPC_JMP_short);
     label_ptr[2] = s->code_ptr;
@@ -1469,6 +1546,7 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
 
     /* label2: */
     *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
+#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION */
 #else
     {
         int32_t offset = GUEST_BASE;
@@ -1496,6 +1574,256 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
 #endif
 }
 
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+/* optimization to reduce jump overheads for qemu_ld/st IRs */
+
+/*
+ * Record one qemu_ld/st site so that its slow case (TLB miss or I/O access)
+ * can be emitted at the end of the TB instead of inline.
+ *
+ * opc_ext carries the access opcode, with HL_ST_MASK set for stores.
+ * raddr is the fast-path address the slow case jumps back to when done.
+ * label_ptr[0..1] are the jump displacement fields that the slow-path
+ * generators patch once the slow case has an address; a NULL label_ptr or
+ * a full label table aborts.
+ */
+static void add_qemu_ldst_label(TCGContext *s,
+                                int opc_ext,
+                                int data_reg,
+                                int data_reg2,
+                                int addrlo_reg,
+                                int addrhi_reg,
+                                int mem_index,
+                                uint8_t *raddr,
+                                uint8_t **label_ptr)
+{
+    TCGLabelQemuLdst *slot;
+
+    /* guard clauses: no free slot left, or nothing to patch later */
+    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
+        tcg_abort();
+    }
+    if (!label_ptr) {
+        tcg_abort();
+    }
+
+    /* claim the next free entry of the per-TB table */
+    slot = &s->qemu_ldst_labels[s->nb_qemu_ldst_labels];
+    s->nb_qemu_ldst_labels++;
+
+    slot->raddr = raddr;
+    slot->label_ptr[0] = label_ptr[0];
+    slot->label_ptr[1] = label_ptr[1];
+
+    slot->opc_ext = opc_ext;
+    slot->mem_index = mem_index;
+
+    slot->addrlo_reg = addrlo_reg;
+    slot->addrhi_reg = addrhi_reg;
+    slot->datalo_reg = data_reg;
+    slot->datahi_reg = data_reg2;
+}
+
+/*
+ * Generate the slow case of one qemu_ld at the end of the TB.
+ *
+ * The forward jump(s) recorded in @label are patched to land on the code
+ * emitted here.  The helper arguments (guest address, mmu index and return
+ * address, plus TCG_AREG0 as the first argument when CONFIG_TCG_PASS_AREG0
+ * is defined) are then set up, the load helper matching the access size is
+ * called, its result is moved or extended from EAX (EDX:EAX for 64-bit data
+ * on a 32-bit host) into the destination register(s), and control jumps
+ * back to label->raddr in the fast path.
+ */
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+    int s_bits;
+    int opc = label->opc_ext & HL_OPC_MASK;
+    int mem_index = label->mem_index;
+#if TCG_TARGET_REG_BITS == 64
+    int arg_idx;
+#else
+    int stack_adjust;
+    int addrlo_reg = label->addrlo_reg;
+    int addrhi_reg = label->addrhi_reg;
+#endif
+    int data_reg = label->datalo_reg;
+    int data_reg2 = label->datahi_reg;
+    uint8_t *raddr = label->raddr;
+    uint8_t **label_ptr = &label->label_ptr[0];
+
+    /* low two bits of the opcode select the helper (index into
+       qemu_ld_helpers[]) */
+    s_bits = opc & 3;
+
+    /* resolve label address: store the 32-bit displacement from the end of
+       each 4-byte jump field to the current output position */
+    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        /* a second jump field exists only when the guest address needs
+           two host registers */
+        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+    }
+
+    /* extended helper signature: __ext_ld_mmu(target_ulong addr, int mmu_idx,
+       uintptr_t raddr) */
+    /* NOTE(review): raddr - 1 is presumably used so that the address lies
+       inside the code of this qemu_ld rather than on the next op - confirm
+       against the helper implementation. */
+#if TCG_TARGET_REG_BITS == 32
+    /* arguments go on the stack, last argument pushed first */
+    tcg_out_pushi(s, (uintptr_t)(raddr - 1)); /* return address */
+    stack_adjust = 4;
+    tcg_out_pushi(s, mem_index);        /* mmu index */
+    stack_adjust += 4;
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out_push(s, addrhi_reg);
+        stack_adjust += 4;
+    }
+    tcg_out_push(s, addrlo_reg); /* guest addr */
+    stack_adjust += 4;
+#ifdef CONFIG_TCG_PASS_AREG0
+    tcg_out_push(s, TCG_AREG0);
+    stack_adjust += 4;
+#endif
+#else
+    /* The first argument is already loaded with addrlo.  */
+    arg_idx = 1;
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],
+                 mem_index);
+    /* raddr is a host pointer: load it at full 64-bit width, as the store
+       slow path does, so that it cannot be truncated */
+    tcg_out_movi(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[arg_idx++],
+                 (uintptr_t)(raddr - 1));
+#ifdef CONFIG_TCG_PASS_AREG0
+    /* XXX/FIXME: suboptimal */
+    /* shift the three arguments up by one register, highest first, so that
+       TCG_AREG0 can become the first argument */
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+                tcg_target_call_iarg_regs[2]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
+                tcg_target_call_iarg_regs[1]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
+                tcg_target_call_iarg_regs[0]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
+                TCG_AREG0);
+#endif
+#endif
+
+    tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
+
+#if TCG_TARGET_REG_BITS == 32
+    /* drop the pushed arguments again */
+    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
+        /* Pop and discard.  This is 2 bytes smaller than the add.  */
+        tcg_out_pop(s, TCG_REG_ECX);
+    } else if (stack_adjust != 0) {
+        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
+    }
+#endif
+
+    /* move the helper result into the destination; bit 2 of opc requests
+       sign extension */
+    switch(opc) {
+    case 0 | 4:
+        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
+        break;
+    case 1 | 4:
+        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
+        break;
+    case 0:
+        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
+        break;
+    case 1:
+        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
+        break;
+    case 2:
+        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+        break;
+#if TCG_TARGET_REG_BITS == 64
+    case 2 | 4:
+        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
+        break;
+#endif
+    case 3:
+        if (TCG_TARGET_REG_BITS == 64) {
+            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
+        } else if (data_reg == TCG_REG_EDX) {
+            /* the low destination is EDX, which currently holds the high
+               half: swap the halves first, then copy the high half (now in
+               EAX) to data_reg2 */
+            /* xchg %edx, %eax */
+            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
+        } else {
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
+        }
+        break;
+    default:
+        tcg_abort();
+    }
+
+    /* jump back to original code */
+    tcg_out_jmp(s, (tcg_target_long) raddr);
+}
+
+/*
+ * Generate the slow case of one qemu_st at the end of the TB.
+ *
+ * The forward jump(s) recorded in @label are patched to land on the code
+ * emitted here.  The helper arguments (guest address, value to store, mmu
+ * index and return address, plus TCG_AREG0 as the first argument when
+ * CONFIG_TCG_PASS_AREG0 is defined) are then set up, the store helper
+ * matching the access size is called, any pushed arguments are dropped, and
+ * control jumps back to label->raddr in the fast path.
+ */
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+    int s_bits;
+    int stack_adjust;
+    int opc = label->opc_ext & HL_OPC_MASK;
+    int mem_index = label->mem_index;
+    int data_reg = label->datalo_reg;
+#if TCG_TARGET_REG_BITS == 32
+    int data_reg2 = label->datahi_reg;
+    int addrlo_reg = label->addrlo_reg;
+    int addrhi_reg = label->addrhi_reg;
+#endif
+    uint8_t *raddr = label->raddr;
+    uint8_t **label_ptr = &label->label_ptr[0];
+
+    /* low two bits of the opcode select the helper (index into
+       qemu_st_helpers[]) */
+    s_bits = opc & 3;
+
+    /* resolve label address: store the 32-bit displacement from the end of
+       each 4-byte jump field to the current output position */
+    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        /* a second jump field exists only when the guest address needs
+           two host registers */
+        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+    }
+
+    /* extended helper signature: __ext_st_mmu(target_ulong addr, uintxx_t val,
+       int mmu_idx, uintptr_t raddr) */
+    /* NOTE(review): raddr - 1 is presumably used so that the address lies
+       inside the code of this qemu_st rather than on the next op - confirm
+       against the helper implementation. */
+#if TCG_TARGET_REG_BITS == 32
+    /* arguments go on the stack, last argument pushed first */
+    tcg_out_pushi(s, (uintptr_t)(raddr - 1)); /* return address */
+    stack_adjust = 4;
+    tcg_out_pushi(s, mem_index); /* mmu index */
+    stack_adjust += 4;
+    if (opc == 3) {
+        /* 64-bit store: the high half of the data is pushed as well */
+        tcg_out_push(s, data_reg2);
+        stack_adjust += 4;
+    }
+    tcg_out_push(s, data_reg);   /* guest data */
+    stack_adjust += 4;
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out_push(s, addrhi_reg);
+        stack_adjust += 4;
+    }
+    tcg_out_push(s, addrlo_reg); /* guest addr */
+    stack_adjust += 4;
+#ifdef CONFIG_TCG_PASS_AREG0
+    tcg_out_push(s, TCG_AREG0);
+    stack_adjust += 4;
+#endif
+#else
+    /* no guest address move is emitted here: only argument registers 1..3
+       are loaded */
+    tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
+                tcg_target_call_iarg_regs[1], data_reg);
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);
+    tcg_out_movi(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+                 (uintptr_t)(raddr - 1));
+    stack_adjust = 0;
+#ifdef CONFIG_TCG_PASS_AREG0
+    /* XXX/FIXME: suboptimal */
+    /* Shift all four arguments up by one register, highest first, so that
+       TCG_AREG0 can become the first argument.  The return address in
+       argument register 3 must be moved out of the way first; otherwise the
+       next move overwrites it and the helper never receives it.
+       NOTE(review): this needs a fifth integer argument register.  Confirm
+       that tcg_target_call_iarg_regs[] has one for every 64-bit host ABI
+       (Win64 passes only four integer arguments in registers and would
+       need this argument on the stack). */
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[4],
+                tcg_target_call_iarg_regs[3]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+                tcg_target_call_iarg_regs[2]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
+                tcg_target_call_iarg_regs[1]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
+                tcg_target_call_iarg_regs[0]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
+                TCG_AREG0);
+#endif
+#endif
+
+    tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);
+
+    /* drop the pushed arguments again (stack_adjust is 0 on 64-bit hosts) */
+    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
+        /* Pop and discard.  This is 2 bytes smaller than the add.  */
+        tcg_out_pop(s, TCG_REG_ECX);
+    } else if (stack_adjust != 0) {
+        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
+    }
+
+    /* jump back to original code */
+    tcg_out_jmp(s, (tcg_target_long) raddr);
+}
+
+/*
+ * Emit the slow case of every qemu_ld/st recorded for the current TB, in
+ * the order in which the labels were added.
+ */
+void tcg_out_qemu_ldst_slow_path(TCGContext *s)
+{
+    int count = s->nb_qemu_ldst_labels;
+    int idx = 0;
+
+    while (idx < count) {
+        TCGLabelQemuLdst *entry = &s->qemu_ldst_labels[idx++];
+
+        /* the ld/st flag in opc_ext selects the generator */
+        if (IS_QEMU_ST_LABEL(entry)) {
+            tcg_out_qemu_st_slow_path(s, entry);
+        } else {
+            tcg_out_qemu_ld_slow_path(s, entry);
+        }
+    }
+}
+#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION */
+
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
 {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 8386b70..8009069 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -301,6 +301,14 @@  void tcg_func_start(TCGContext *s)
 
     gen_opc_ptr = gen_opc_buf;
     gen_opparam_ptr = gen_opparam_buf;
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+    /* allocate the qemu_ld/st label table used to emit the TLB-miss slow paths at the end of the TB */
+    s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) * TCG_MAX_QEMU_LDST);
+    if (!s->qemu_ldst_labels) {
+        tcg_abort();
+    }
+    s->nb_qemu_ldst_labels = 0;
+#endif
 }
 
 static inline void tcg_temp_alloc(TCGContext *s, int n)
@@ -2169,6 +2177,10 @@  static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
 #endif
     }
  the_end:
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+    /* Generate MMU call helpers at the end of block (currently only for qemu_ld/st) */
+    tcg_out_qemu_ldst_slow_path(s);
+#endif
     return -1;
 }
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index d710694..b174cdb 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -187,6 +187,29 @@  typedef tcg_target_ulong TCGArg;
    are aliases for target_ulong and host pointer sized values respectively.
  */
 
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+/* Macros and structures for qemu_ld/st IR code optimization:
+   TCG_MAX_QEMU_LDST is the capacity of the per-TB label table; per the
+   original author, half of OPC_BUF_SIZE in exec-all.h looks like a good
+   value for it. */
+#define TCG_MAX_QEMU_LDST       320
+/* bit position of the load/store flag inside opc_ext */
+#define HL_LDST_SHIFT           4
+#define HL_LDST_MASK            (1 << HL_LDST_SHIFT)
+/* OR-ed into opc_ext for a store; left clear for a load */
+#define HL_ST_MASK              HL_LDST_MASK
+/* extracts the plain qemu_ld/st opcode (the bits below the flag) */
+#define HL_OPC_MASK             (HL_LDST_MASK - 1)
+/* classify a label (L is a TCGLabelQemuLdst pointer) as load or store */
+#define IS_QEMU_LD_LABEL(L)     (!((L)->opc_ext & HL_LDST_MASK))
+#define IS_QEMU_ST_LABEL(L)     ((L)->opc_ext & HL_LDST_MASK)
+
+/* One recorded qemu_ld/st site whose slow case is emitted at the end of
+   the TB. */
+typedef struct TCGLabelQemuLdst {
+    int opc_ext;                /* | 27bit (reserved) | 1bit (ld/st flag) | 4bit (opc) | */
+    int addrlo_reg;             /* reg index for the low word of guest virtual address */
+    int addrhi_reg;             /* reg index for the high word of guest virtual address */
+    int datalo_reg;             /* reg index for the low word to be loaded or to be stored */
+    int datahi_reg;             /* reg index for the high word to be loaded or to be stored */
+    int mem_index;              /* soft MMU memory index */
+    uint8_t *raddr;             /* fast-path address the slow case jumps back to */
+    uint8_t *label_ptr[2];      /* jump displacement fields patched to reach the slow case */
+} TCGLabelQemuLdst;
+#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION */
+
 #ifdef CONFIG_DEBUG_TCG
 #define DEBUG_TCGV 1
 #endif
@@ -389,6 +412,13 @@  struct TCGContext {
 #ifdef CONFIG_DEBUG_TCG
     int temps_in_use;
 #endif
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+    /* labels info for qemu_ld/st IRs
+       The labels help to generate TLB miss case codes at the end of TB */
+    TCGLabelQemuLdst *qemu_ldst_labels;
+    int nb_qemu_ldst_labels;
+#endif
 };
 
 extern TCGContext tcg_ctx;
@@ -588,3 +618,8 @@  extern uint8_t code_gen_prologue[];
 #endif
 
 void tcg_register_jit(void *buf, size_t buf_size);
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+/* qemu_ld/st generation at the end of TB */
+void tcg_out_qemu_ldst_slow_path(TCGContext *s);
+#endif