Patchwork [v5,3/3] tcg: Optimize qemu_ld/st by generating slow paths at the end of a block

login
register
mail settings
Submitter YeongKyoon Lee
Date Oct. 9, 2012, 12:37 p.m.
Message ID <1349786252-12343-4-git-send-email-yeongkyoon.lee@samsung.com>
Download mbox | patch
Permalink /patch/190293/
State New
Headers show

Comments

YeongKyoon Lee - Oct. 9, 2012, 12:37 p.m.
Add optimized TCG qemu_ld/st generation which locates the code of TLB miss
cases at the end of a block after generating the other IRs.
Currently, this optimization supports only i386 and x86_64 hosts.

Signed-off-by: Yeongkyoon Lee <yeongkyoon.lee@samsung.com>
---
 tcg/i386/tcg-target.c |  420 ++++++++++++++++++++++++++++++++-----------------
 tcg/tcg.c             |   13 ++
 tcg/tcg.h             |   35 ++++
 3 files changed, 323 insertions(+), 145 deletions(-)

--
1.7.5.4
Richard Henderson - Oct. 9, 2012, 6:49 p.m.
On 10/09/2012 05:37 AM, Yeongkyoon Lee wrote:
> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
> +    /* Initialize qemu_ld/st labels to assist code generation at the end of TB
> +       for TLB miss cases at the end of TB */
> +    s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) *
> +                                     TCG_MAX_QEMU_LDST);
> +    s->nb_qemu_ldst_labels = 0;
> +#endif

I said before that I wasn't fond of this sort of "constant" dynamic allocation.
Regardless of what surrounding code does.  You could clean those up too,
as a separate patch...

> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
> +    /* Generate slow paths of qemu_ld/st IRs which call MMU helpers at
> +       the end of block */
> +    tcg_out_qemu_ldst_slow_path(s);
> +#endif

This interface is so close to "tcg_out_ldst_and_constant_pools(s)" that
I don't think the function should be specific to ldst.  Just call it
tcg_out_tb_finalize or something.

> +/* Macros/structures for qemu_ld/st IR code optimization:
> +   TCG_MAX_HELPER_LABELS is defined as same as OPC_BUF_SIZE in exec-all.h. */
> +#define TCG_MAX_QEMU_LDST       640
> +#define HL_LDST_SHIFT           4
> +#define HL_LDST_MASK            (1 << HL_LDST_SHIFT)
> +#define HL_ST_MASK              HL_LDST_MASK
> +#define HL_OPC_MASK             (HL_LDST_MASK - 1)
> +#define IS_QEMU_LD_LABEL(L)     (!((L)->opc_ext & HL_LDST_MASK))
> +#define IS_QEMU_ST_LABEL(L)     ((L)->opc_ext & HL_LDST_MASK)
> +
> +typedef struct TCGLabelQemuLdst {
> +    int opc_ext;            /* | 27bit(reserved) | 1bit(ld/st) | 4bit(opc) | */

Any good reason to use all these masks when the compiler can do it
for you with bitfields?


r~
YeongKyoon Lee - Oct. 10, 2012, 4:41 a.m.
On 2012년 10월 10일 03:49, Richard Henderson wrote:
> On 10/09/2012 05:37 AM, Yeongkyoon Lee wrote:
>> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
>> +    /* Initialize qemu_ld/st labels to assist code generation at the end of TB
>> +       for TLB miss cases at the end of TB */
>> +    s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) *
>> +                                     TCG_MAX_QEMU_LDST);
>> +    s->nb_qemu_ldst_labels = 0;
>> +#endif
> I said before that I wasn't fond of this sort of "constant" dynamic allocation.
> Regardless of what surrounding code does.  You could clean those up too,
> as a separate patch...

I can change the dynamic allocation to static one as you said, however, 
one concern is that we might use redundant memory on non-TCG 
environment, such as, KVM mode.
What's you opinion about this?

>
>> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
>> +    /* Generate slow paths of qemu_ld/st IRs which call MMU helpers at
>> +       the end of block */
>> +    tcg_out_qemu_ldst_slow_path(s);
>> +#endif
> This interface is so close to "tcg_out_ldst_and_constant_pools(s)" that
> I don't think the function should be specific to ldst.  Just call it
> tcg_out_tb_finalize or something.

That looks good.
I'll do refactoring for the function names later.

>
>> +/* Macros/structures for qemu_ld/st IR code optimization:
>> +   TCG_MAX_HELPER_LABELS is defined as same as OPC_BUF_SIZE in exec-all.h. */
>> +#define TCG_MAX_QEMU_LDST       640
>> +#define HL_LDST_SHIFT           4
>> +#define HL_LDST_MASK            (1 << HL_LDST_SHIFT)
>> +#define HL_ST_MASK              HL_LDST_MASK
>> +#define HL_OPC_MASK             (HL_LDST_MASK - 1)
>> +#define IS_QEMU_LD_LABEL(L)     (!((L)->opc_ext & HL_LDST_MASK))
>> +#define IS_QEMU_ST_LABEL(L)     ((L)->opc_ext & HL_LDST_MASK)
>> +
>> +typedef struct TCGLabelQemuLdst {
>> +    int opc_ext;            /* | 27bit(reserved) | 1bit(ld/st) | 4bit(opc) | */
> Any good reason to use all these masks when the compiler can do it
> for you with bitfields?

No. It is just my coding style.
However, there might be no compiler problems and bitfields might look 
somewhat pretty, so I'll use bitfields later.

>
>
> r~
>

Patch

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 0e218c8..4c50542 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -983,24 +983,34 @@  static void tcg_out_jmp(TCGContext *s, tcg_target_long dest)

 #include "../../softmmu_defs.h"

-/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
-   int mmu_idx) */
+/* extended helper signature: ext_helper_ld_mmu(CPUState *env,
+   target_ulong addr, int mmu_idx, uintptr_t raddr) */
 static const void *qemu_ld_helpers[4] = {
-    helper_ldb_mmu,
-    helper_ldw_mmu,
-    helper_ldl_mmu,
-    helper_ldq_mmu,
+    ext_helper_ldb_mmu,
+    ext_helper_ldw_mmu,
+    ext_helper_ldl_mmu,
+    ext_helper_ldq_mmu,
 };

-/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
-   uintxx_t val, int mmu_idx) */
+/* extended helper signature: ext_helper_st_mmu(CPUState *env,
+   target_ulong addr, uintxx_t val, int mmu_idx, uintptr_t raddr) */
 static const void *qemu_st_helpers[4] = {
-    helper_stb_mmu,
-    helper_stw_mmu,
-    helper_stl_mmu,
-    helper_stq_mmu,
+    ext_helper_stb_mmu,
+    ext_helper_stw_mmu,
+    ext_helper_stl_mmu,
+    ext_helper_stq_mmu,
 };

+static void add_qemu_ldst_label(TCGContext *s,
+                                int opc_ext,
+                                int data_reg,
+                                int data_reg2,
+                                int addrlo_reg,
+                                int addrhi_reg,
+                                int mem_index,
+                                uint8_t *raddr,
+                                uint8_t **label_ptr);
+
 /* Perform the TLB load and compare.

    Inputs:
@@ -1059,19 +1069,21 @@  static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,

     tcg_out_mov(s, type, r0, addrlo);

-    /* jne label1 */
-    tcg_out8(s, OPC_JCC_short + JCC_JNE);
+    /* jne slow_path */
+    /* XXX: How to avoid using OPC_JCC_long for peephole optimization? */
+    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
     label_ptr[0] = s->code_ptr;
-    s->code_ptr++;
+    s->code_ptr += 4;

     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
         /* cmp 4(r1), addrhi */
         tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4);

-        /* jne label1 */
-        tcg_out8(s, OPC_JCC_short + JCC_JNE);
+        /* jne slow_path */
+        /* XXX: How to avoid using OPC_JCC_long for peephole optimization? */
+        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
         label_ptr[1] = s->code_ptr;
-        s->code_ptr++;
+        s->code_ptr += 4;
     }

     /* TLB Hit.  */
@@ -1169,12 +1181,7 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     int addrlo_idx;
 #if defined(CONFIG_SOFTMMU)
     int mem_index, s_bits;
-#if TCG_TARGET_REG_BITS == 64
-    int arg_idx;
-#else
-    int stack_adjust;
-#endif
-    uint8_t *label_ptr[3];
+    uint8_t *label_ptr[2];
 #endif

     data_reg = args[0];
@@ -1194,93 +1201,16 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     /* TLB Hit.  */
     tcg_out_qemu_ld_direct(s, data_reg, data_reg2, TCG_REG_L0, 0, opc);

-    /* jmp label2 */
-    tcg_out8(s, OPC_JMP_short);
-    label_ptr[2] = s->code_ptr;
-    s->code_ptr++;
-
-    /* TLB Miss.  */
-
-    /* label1: */
-    *label_ptr[0] = s->code_ptr - label_ptr[0] - 1;
-    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        *label_ptr[1] = s->code_ptr - label_ptr[1] - 1;
-    }
-
-    /* XXX: move that code at the end of the TB */
-#if TCG_TARGET_REG_BITS == 32
-    tcg_out_pushi(s, mem_index);
-    stack_adjust = 4;
-    if (TARGET_LONG_BITS == 64) {
-        tcg_out_push(s, args[addrlo_idx + 1]);
-        stack_adjust += 4;
-    }
-    tcg_out_push(s, args[addrlo_idx]);
-    stack_adjust += 4;
-    tcg_out_push(s, TCG_AREG0);
-    stack_adjust += 4;
-#else
-    /* The first argument is already loaded with addrlo.  */
-    arg_idx = 1;
-    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx],
-                 mem_index);
-    /* XXX/FIXME: suboptimal */
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3], TCG_REG_L2);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2], TCG_REG_L1);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1], TCG_REG_L0);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
-#endif
-
-    tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
-
-#if TCG_TARGET_REG_BITS == 32
-    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
-        /* Pop and discard.  This is 2 bytes smaller than the add.  */
-        tcg_out_pop(s, TCG_REG_ECX);
-    } else if (stack_adjust != 0) {
-        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
-    }
-#endif
-
-    switch(opc) {
-    case 0 | 4:
-        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
-        break;
-    case 1 | 4:
-        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
-        break;
-    case 0:
-        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
-        break;
-    case 1:
-        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
-        break;
-    case 2:
-        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-        break;
-#if TCG_TARGET_REG_BITS == 64
-    case 2 | 4:
-        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
-        break;
-#endif
-    case 3:
-        if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
-        } else if (data_reg == TCG_REG_EDX) {
-            /* xchg %edx, %eax */
-            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
-        }
-        break;
-    default:
-        tcg_abort();
-    }
-
-    /* label2: */
-    *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
+    /* Record the current context of a load into ldst label */
+    add_qemu_ldst_label(s,
+                        opc,
+                        data_reg,
+                        data_reg2,
+                        args[addrlo_idx],
+                        args[addrlo_idx + 1],
+                        mem_index,
+                        s->code_ptr,
+                        label_ptr);
 #else
     {
         int32_t offset = GUEST_BASE;
@@ -1372,8 +1302,7 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     int addrlo_idx;
 #if defined(CONFIG_SOFTMMU)
     int mem_index, s_bits;
-    int stack_adjust;
-    uint8_t *label_ptr[3];
+    uint8_t *label_ptr[2];
 #endif

     data_reg = args[0];
@@ -1393,23 +1322,220 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     /* TLB Hit.  */
     tcg_out_qemu_st_direct(s, data_reg, data_reg2, TCG_REG_L0, 0, opc);

-    /* jmp label2 */
-    tcg_out8(s, OPC_JMP_short);
-    label_ptr[2] = s->code_ptr;
-    s->code_ptr++;
+    /* Record the current context of a store into ldst label */
+    add_qemu_ldst_label(s,
+                        opc | HL_ST_MASK,
+                        data_reg,
+                        data_reg2,
+                        args[addrlo_idx],
+                        args[addrlo_idx + 1],
+                        mem_index,
+                        s->code_ptr,
+                        label_ptr);
+#else
+    {
+        int32_t offset = GUEST_BASE;
+        int base = args[addrlo_idx];

-    /* TLB Miss.  */
+        if (TCG_TARGET_REG_BITS == 64) {
+            /* ??? We assume all operations have left us with register
+               contents that are zero extended.  So far this appears to
+               be true.  If we want to enforce this, we can either do
+               an explicit zero-extension here, or (if GUEST_BASE == 0)
+               use the ADDR32 prefix.  For now, do nothing.  */

-    /* label1: */
-    *label_ptr[0] = s->code_ptr - label_ptr[0] - 1;
+            if (offset != GUEST_BASE) {
+                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L0, GUEST_BASE);
+                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L0, base);
+                base = TCG_REG_L0;
+                offset = 0;
+            }
+        }
+
+        tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, opc);
+    }
+#endif
+}
+
+#if defined(CONFIG_SOFTMMU)
+/*
+ * Record the context of a call to the out of line helper code for the slow path
+ * for a load or store, so that we can later generate the correct helper code
+ */
+static void add_qemu_ldst_label(TCGContext *s,
+                                int opc_ext,
+                                int data_reg,
+                                int data_reg2,
+                                int addrlo_reg,
+                                int addrhi_reg,
+                                int mem_index,
+                                uint8_t *raddr,
+                                uint8_t **label_ptr)
+{
+    int idx;
+    TCGLabelQemuLdst *label;
+
+    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
+        tcg_abort();
+    }
+
+    idx = s->nb_qemu_ldst_labels++;
+    label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
+    label->opc_ext = opc_ext;
+    label->datalo_reg = data_reg;
+    label->datahi_reg = data_reg2;
+    label->addrlo_reg = addrlo_reg;
+    label->addrhi_reg = addrhi_reg;
+    label->mem_index = mem_index;
+    label->raddr = raddr;
+    label->label_ptr[0] = label_ptr[0];
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        *label_ptr[1] = s->code_ptr - label_ptr[1] - 1;
+        label->label_ptr[1] = label_ptr[1];
     }
+}

-    /* XXX: move that code at the end of the TB */
+/*
+ * Generate code for the slow path for a load at the end of block
+ */
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+    int s_bits;
+    int opc = label->opc_ext & HL_OPC_MASK;
+    int mem_index = label->mem_index;
 #if TCG_TARGET_REG_BITS == 32
+    int stack_adjust;
+    int addrlo_reg = label->addrlo_reg;
+    int addrhi_reg = label->addrhi_reg;
+#endif
+    int data_reg = label->datalo_reg;
+    int data_reg2 = label->datahi_reg;
+    uint8_t *raddr = label->raddr;
+    uint8_t **label_ptr = &label->label_ptr[0];
+
+    s_bits = opc & 3;
+
+    /* resolve label address */
+    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+    }
+
+    /* extended helper signature: ext_helper_ld_mmu(CPUState *env,
+       target_ulong addr, int mmu_idx, uintptr_t raddr) */
+#if TCG_TARGET_REG_BITS == 32
+    /* The last arg is the generated code address corresponding to qemu_ld IR */
+    tcg_out_pushi(s, (tcg_target_ulong)(raddr - 1));
+    stack_adjust = 4;
     tcg_out_pushi(s, mem_index);
+    stack_adjust += 4;
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out_push(s, addrhi_reg);
+        stack_adjust += 4;
+    }
+    tcg_out_push(s, addrlo_reg);
+    stack_adjust += 4;
+    tcg_out_push(s, TCG_AREG0);
+    stack_adjust += 4;
+#else
+    /* The first argument is already loaded with addrlo.  */
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[1],
+                 mem_index);
+    /* The last arg is the generated code address corresponding to qemu_ld IR */
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2],
+                 (tcg_target_ulong)(raddr - 1));
+    /* XXX/FIXME: suboptimal */
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3], TCG_REG_L2);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2], TCG_REG_L1);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1], TCG_REG_L0);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
+#endif
+
+    tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
+
+#if TCG_TARGET_REG_BITS == 32
+    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
+        /* Pop and discard.  This is 2 bytes smaller than the add.  */
+        tcg_out_pop(s, TCG_REG_ECX);
+    } else if (stack_adjust != 0) {
+        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
+    }
+#endif
+
+    switch (opc) {
+    case 0 | 4:
+        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
+        break;
+    case 1 | 4:
+        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
+        break;
+    case 0:
+        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
+        break;
+    case 1:
+        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
+        break;
+    case 2:
+        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+        break;
+#if TCG_TARGET_REG_BITS == 64
+    case 2 | 4:
+        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
+        break;
+#endif
+    case 3:
+        if (TCG_TARGET_REG_BITS == 64) {
+            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
+        } else if (data_reg == TCG_REG_EDX) {
+            /* xchg %edx, %eax */
+            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
+        } else {
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
+        }
+        break;
+    default:
+        tcg_abort();
+    }
+
+    /* Jump back to the original code accessing a guest memory */
+    tcg_out_jmp(s, (tcg_target_long) raddr);
+}
+
+/*
+ * Generate code for the slow path for a store at the end of block
+ */
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+    int s_bits;
+    int stack_adjust;
+    int opc = label->opc_ext & HL_OPC_MASK;
+    int mem_index = label->mem_index;
+    int data_reg = label->datalo_reg;
+#if TCG_TARGET_REG_BITS == 32
+    int data_reg2 = label->datahi_reg;
+    int addrlo_reg = label->addrlo_reg;
+    int addrhi_reg = label->addrhi_reg;
+#endif
+    uint8_t *raddr = label->raddr;
+    uint8_t **label_ptr = &label->label_ptr[0];
+
+    s_bits = opc & 3;
+
+    /* resolve label address */
+    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+    }
+
+    /* extended helper signature: ext_helper_st_mmu(CPUState *env,
+       target_ulong addr, uintxx_t val, int mmu_idx, uintptr_t raddr) */
+#if TCG_TARGET_REG_BITS == 32
+    /* The last arg is the generated code address corresponding to qemu_st IR */
+    tcg_out_pushi(s, (tcg_target_ulong)(raddr - 1));
     stack_adjust = 4;
+    tcg_out_pushi(s, mem_index);
+    stack_adjust += 4;
     if (opc == 3) {
         tcg_out_push(s, data_reg2);
         stack_adjust += 4;
@@ -1417,10 +1543,10 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     tcg_out_push(s, data_reg);
     stack_adjust += 4;
     if (TARGET_LONG_BITS == 64) {
-        tcg_out_push(s, args[addrlo_idx + 1]);
+        tcg_out_push(s, addrhi_reg);
         stack_adjust += 4;
     }
-    tcg_out_push(s, args[addrlo_idx]);
+    tcg_out_push(s, addrlo_reg);
     stack_adjust += 4;
     tcg_out_push(s, TCG_AREG0);
     stack_adjust += 4;
@@ -1429,6 +1555,14 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
                 TCG_REG_L1, data_reg);
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_L2, mem_index);
     stack_adjust = 0;
+    /* The last arg is the generated code address corresponding to qemu_st IR */
+#if defined(_WIN64)
+    tcg_out_pushi(s, (tcg_target_ulong)(raddr - 1));
+    stack_adjust += 8;
+#else
+    tcg_out_movi(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[4],
+                 (tcg_target_ulong)(raddr - 1));
+#endif
     /* XXX/FIXME: suboptimal */
     tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3], TCG_REG_L2);
     tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2], TCG_REG_L1);
@@ -1445,32 +1579,28 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
         tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
     }

-    /* label2: */
-    *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
-#else
-    {
-        int32_t offset = GUEST_BASE;
-        int base = args[addrlo_idx];
+    /* Jump back to the original code accessing a guest memory */
+    tcg_out_jmp(s, (tcg_target_long) raddr);
+}

-        if (TCG_TARGET_REG_BITS == 64) {
-            /* ??? We assume all operations have left us with register
-               contents that are zero extended.  So far this appears to
-               be true.  If we want to enforce this, we can either do
-               an explicit zero-extension here, or (if GUEST_BASE == 0)
-               use the ADDR32 prefix.  For now, do nothing.  */
+/*
+ * Generate all of the slow paths of qemu_ld/st at the end of block
+ */
+void tcg_out_qemu_ldst_slow_path(TCGContext *s)
+{
+    int i;
+    TCGLabelQemuLdst *label;

-            if (offset != GUEST_BASE) {
-                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L0, GUEST_BASE);
-                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L0, base);
-                base = TCG_REG_L0;
-                offset = 0;
+    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
+        label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[i];
+        if (IS_QEMU_LD_LABEL(label)) {
+            tcg_out_qemu_ld_slow_path(s, label);
+        } else {
+            tcg_out_qemu_st_slow_path(s, label);
             }
         }
-
-        tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, opc);
-    }
-#endif
 }
+#endif  /* CONFIG_SOFTMMU */

 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index c069e44..c96b7f1 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -304,6 +304,14 @@  void tcg_func_start(TCGContext *s)

     gen_opc_ptr = gen_opc_buf;
     gen_opparam_ptr = gen_opparam_buf;
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+    /* Initialize qemu_ld/st labels to assist code generation at the end of TB
+       for TLB miss cases at the end of TB */
+    s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) *
+                                     TCG_MAX_QEMU_LDST);
+    s->nb_qemu_ldst_labels = 0;
+#endif
 }

 static inline void tcg_temp_alloc(TCGContext *s, int n)
@@ -2163,6 +2171,11 @@  static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
 #endif
     }
  the_end:
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+    /* Generate slow paths of qemu_ld/st IRs which call MMU helpers at
+       the end of block */
+    tcg_out_qemu_ldst_slow_path(s);
+#endif
     return -1;
 }

diff --git a/tcg/tcg.h b/tcg/tcg.h
index af7464a..b54884f 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -188,6 +188,29 @@  typedef tcg_target_ulong TCGArg;
    are aliases for target_ulong and host pointer sized values respectively.
  */

+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+/* Macros/structures for qemu_ld/st IR code optimization:
+   TCG_MAX_HELPER_LABELS is defined as same as OPC_BUF_SIZE in exec-all.h. */
+#define TCG_MAX_QEMU_LDST       640
+#define HL_LDST_SHIFT           4
+#define HL_LDST_MASK            (1 << HL_LDST_SHIFT)
+#define HL_ST_MASK              HL_LDST_MASK
+#define HL_OPC_MASK             (HL_LDST_MASK - 1)
+#define IS_QEMU_LD_LABEL(L)     (!((L)->opc_ext & HL_LDST_MASK))
+#define IS_QEMU_ST_LABEL(L)     ((L)->opc_ext & HL_LDST_MASK)
+
+typedef struct TCGLabelQemuLdst {
+    int opc_ext;            /* | 27bit(reserved) | 1bit(ld/st) | 4bit(opc) | */
+    int addrlo_reg;         /* reg index for low word of guest virtual addr */
+    int addrhi_reg;         /* reg index for high word of guest virtual addr */
+    int datalo_reg;         /* reg index for low word to be loaded or stored */
+    int datahi_reg;         /* reg index for high word to be loaded or stored */
+    int mem_index;          /* soft MMU memory index */
+    uint8_t *raddr;         /* gen code addr of the next IR of qemu_ld/st IR */
+    uint8_t *label_ptr[2];  /* label pointers to be updated */
+} TCGLabelQemuLdst;
+#endif
+
 #ifdef CONFIG_DEBUG_TCG
 #define DEBUG_TCGV 1
 #endif
@@ -392,6 +415,13 @@  struct TCGContext {
     int temps_in_use;
     int goto_tb_issue_mask;
 #endif
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+    /* labels info for qemu_ld/st IRs
+       The labels help to generate TLB miss case codes at the end of TB */
+    TCGLabelQemuLdst *qemu_ldst_labels;
+    int nb_qemu_ldst_labels;
+#endif
 };

 extern TCGContext tcg_ctx;
@@ -595,3 +625,8 @@  extern uint8_t code_gen_prologue[];
 #endif

 void tcg_register_jit(void *buf, size_t buf_size);
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+/* Generate all of the slow paths of qemu_ld/st at the end of block */
+void tcg_out_qemu_ldst_slow_path(TCGContext *s);
+#endif