Patchwork [v6,3/3] tcg: Optimize qemu_ld/st by generating slow paths at the end of a block

login
register
mail settings
Submitter YeongKyoon Lee
Date Oct. 16, 2012, 7:23 a.m.
Message ID <1350372190-32186-4-git-send-email-yeongkyoon.lee@samsung.com>
Download mbox | patch
Permalink /patch/191745/
State New
Headers show

Comments

YeongKyoon Lee - Oct. 16, 2012, 7:23 a.m.
Add optimized TCG qemu_ld/st generation which locates the code of TLB miss
cases at the end of a block after generating the other IRs.
Currently, this optimization supports only i386 and x86_64 hosts.

Signed-off-by: Yeongkyoon Lee <yeongkyoon.lee@samsung.com>

---
 tcg/i386/tcg-target.c |  415 +++++++++++++++++++++++++++++++++----------------
 tcg/tcg.c             |   12 ++
 tcg/tcg.h             |   30 ++++
 3 files changed, 324 insertions(+), 133 deletions(-)

--
1.7.5.4
Richard Henderson - Oct. 17, 2012, 11:44 p.m.
On 2012-10-16 17:23, Yeongkyoon Lee wrote:
> +    /* Code generation of qemu_ld/st's slow path calling MMU helper
> +
> +       PRE_PROC ...
> +       call MMU helper
> +       jmp POST_PROC (2b) : short forward jump <- GETRA()
> +       jmp next_code (5b) : dummy long backward jump which is never executed
> +       POST_PROC ... : do post-processing <- GETRA() + 7
> +       jmp next_code : jump to the code corresponding to next IR of qemu_ld/st
> +    */

Is this jump over jump really any better than passing next_code
as another function argument?

In 32-bit mode
	push $next_code
In 64-bit mode
	leaq next_code(%rip),%r8


r~
YeongKyoon Lee - Oct. 18, 2012, 2:58 a.m.
On 2012년 10월 18일 08:44, Richard Henderson wrote:
> On 2012-10-16 17:23, Yeongkyoon Lee wrote:
>> +    /* Code generation of qemu_ld/st's slow path calling MMU helper
>> +
>> +       PRE_PROC ...
>> +       call MMU helper
>> +       jmp POST_PROC (2b) : short forward jump <- GETRA()
>> +       jmp next_code (5b) : dummy long backward jump which is never executed
>> +       POST_PROC ... : do post-processing <- GETRA() + 7
>> +       jmp next_code : jump to the code corresponding to next IR of qemu_ld/st
>> +    */
> Is this jump over jump really any better than passing next_code
> as another function argument?
>
> In 32-bit mode
> 	push $next_code
> In 64-bit mode
> 	leaq next_code(%rip),%r8
>
>
> r~
>

Only one advantage is no fragmentation of MMU helpers, that is, we will 
still have the same helper prototypes.
In my opinion, the performance degradation of using jmp instead of push 
or something, is negligible because it is executed on slow path.

Patch

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 4952c05..51abe85 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -1001,6 +1001,17 @@  static const void *qemu_st_helpers[4] = {
     helper_stq_mmu,
 };

+static void add_qemu_ldst_label(TCGContext *s,
+                                int is_ld,
+                                int opc,
+                                int data_reg,
+                                int data_reg2,
+                                int addrlo_reg,
+                                int addrhi_reg,
+                                int mem_index,
+                                uint8_t *raddr,
+                                uint8_t **label_ptr);
+
 /* Perform the TLB load and compare.

    Inputs:
@@ -1059,19 +1070,19 @@  static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,

     tcg_out_mov(s, type, r0, addrlo);

-    /* jne label1 */
-    tcg_out8(s, OPC_JCC_short + JCC_JNE);
+    /* jne slow_path */
+    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
     label_ptr[0] = s->code_ptr;
-    s->code_ptr++;
+    s->code_ptr += 4;

     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
         /* cmp 4(r1), addrhi */
         tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4);

-        /* jne label1 */
-        tcg_out8(s, OPC_JCC_short + JCC_JNE);
+        /* jne slow_path */
+        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
         label_ptr[1] = s->code_ptr;
-        s->code_ptr++;
+        s->code_ptr += 4;
     }

     /* TLB Hit.  */
@@ -1169,12 +1180,7 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     int addrlo_idx;
 #if defined(CONFIG_SOFTMMU)
     int mem_index, s_bits;
-#if TCG_TARGET_REG_BITS == 64
-    int arg_idx;
-#else
-    int stack_adjust;
-#endif
-    uint8_t *label_ptr[3];
+    uint8_t *label_ptr[2];
 #endif

     data_reg = args[0];
@@ -1194,93 +1200,17 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     /* TLB Hit.  */
     tcg_out_qemu_ld_direct(s, data_reg, data_reg2, TCG_REG_L0, 0, opc);

-    /* jmp label2 */
-    tcg_out8(s, OPC_JMP_short);
-    label_ptr[2] = s->code_ptr;
-    s->code_ptr++;
-
-    /* TLB Miss.  */
-
-    /* label1: */
-    *label_ptr[0] = s->code_ptr - label_ptr[0] - 1;
-    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        *label_ptr[1] = s->code_ptr - label_ptr[1] - 1;
-    }
-
-    /* XXX: move that code at the end of the TB */
-#if TCG_TARGET_REG_BITS == 32
-    tcg_out_pushi(s, mem_index);
-    stack_adjust = 4;
-    if (TARGET_LONG_BITS == 64) {
-        tcg_out_push(s, args[addrlo_idx + 1]);
-        stack_adjust += 4;
-    }
-    tcg_out_push(s, args[addrlo_idx]);
-    stack_adjust += 4;
-    tcg_out_push(s, TCG_AREG0);
-    stack_adjust += 4;
-#else
-    /* The first argument is already loaded with addrlo.  */
-    arg_idx = 1;
-    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx],
-                 mem_index);
-    /* XXX/FIXME: suboptimal */
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3], TCG_REG_L2);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2], TCG_REG_L1);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1], TCG_REG_L0);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
-#endif
-
-    tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
-
-#if TCG_TARGET_REG_BITS == 32
-    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
-        /* Pop and discard.  This is 2 bytes smaller than the add.  */
-        tcg_out_pop(s, TCG_REG_ECX);
-    } else if (stack_adjust != 0) {
-        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
-    }
-#endif
-
-    switch(opc) {
-    case 0 | 4:
-        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
-        break;
-    case 1 | 4:
-        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
-        break;
-    case 0:
-        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
-        break;
-    case 1:
-        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
-        break;
-    case 2:
-        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-        break;
-#if TCG_TARGET_REG_BITS == 64
-    case 2 | 4:
-        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
-        break;
-#endif
-    case 3:
-        if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
-        } else if (data_reg == TCG_REG_EDX) {
-            /* xchg %edx, %eax */
-            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
-        }
-        break;
-    default:
-        tcg_abort();
-    }
-
-    /* label2: */
-    *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
+    /* Record the current context of a load into ldst label */
+    add_qemu_ldst_label(s,
+                        1,
+                        opc,
+                        data_reg,
+                        data_reg2,
+                        args[addrlo_idx],
+                        args[addrlo_idx + 1],
+                        mem_index,
+                        s->code_ptr,
+                        label_ptr);
 #else
     {
         int32_t offset = GUEST_BASE;
@@ -1372,8 +1302,7 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     int addrlo_idx;
 #if defined(CONFIG_SOFTMMU)
     int mem_index, s_bits;
-    int stack_adjust;
-    uint8_t *label_ptr[3];
+    uint8_t *label_ptr[2];
 #endif

     data_reg = args[0];
@@ -1393,20 +1322,225 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     /* TLB Hit.  */
     tcg_out_qemu_st_direct(s, data_reg, data_reg2, TCG_REG_L0, 0, opc);

-    /* jmp label2 */
+    /* Record the current context of a store into ldst label */
+    add_qemu_ldst_label(s,
+                        0,
+                        opc,
+                        data_reg,
+                        data_reg2,
+                        args[addrlo_idx],
+                        args[addrlo_idx + 1],
+                        mem_index,
+                        s->code_ptr,
+                        label_ptr);
+#else
+    {
+        int32_t offset = GUEST_BASE;
+        int base = args[addrlo_idx];
+
+        if (TCG_TARGET_REG_BITS == 64) {
+            /* ??? We assume all operations have left us with register
+               contents that are zero extended.  So far this appears to
+               be true.  If we want to enforce this, we can either do
+               an explicit zero-extension here, or (if GUEST_BASE == 0)
+               use the ADDR32 prefix.  For now, do nothing.  */
+
+            if (offset != GUEST_BASE) {
+                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L0, GUEST_BASE);
+                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L0, base);
+                base = TCG_REG_L0;
+                offset = 0;
+            }
+        }
+
+        tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, opc);
+    }
+#endif
+}
+
+#if defined(CONFIG_SOFTMMU)
+/*
+ * Record the context of a call to the out of line helper code for the slow path
+ * for a load or store, so that we can later generate the correct helper code
+ */
+static void add_qemu_ldst_label(TCGContext *s,
+                                int is_ld,
+                                int opc,
+                                int data_reg,
+                                int data_reg2,
+                                int addrlo_reg,
+                                int addrhi_reg,
+                                int mem_index,
+                                uint8_t *raddr,
+                                uint8_t **label_ptr)
+{
+    int idx;
+    TCGLabelQemuLdst *label;
+
+    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
+        tcg_abort();
+    }
+
+    idx = s->nb_qemu_ldst_labels++;
+    label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
+    label->is_ld = is_ld;
+    label->opc = opc;
+    label->datalo_reg = data_reg;
+    label->datahi_reg = data_reg2;
+    label->addrlo_reg = addrlo_reg;
+    label->addrhi_reg = addrhi_reg;
+    label->mem_index = mem_index;
+    label->raddr = raddr;
+    label->label_ptr[0] = label_ptr[0];
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        label->label_ptr[1] = label_ptr[1];
+    }
+}
+
+/*
+ * Generate code for the slow path for a load at the end of block
+ */
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+    int s_bits;
+    int opc = label->opc;
+    int mem_index = label->mem_index;
+#if TCG_TARGET_REG_BITS == 32
+    int stack_adjust;
+    int addrlo_reg = label->addrlo_reg;
+    int addrhi_reg = label->addrhi_reg;
+#endif
+    int data_reg = label->datalo_reg;
+    int data_reg2 = label->datahi_reg;
+    uint8_t *raddr = label->raddr;
+    uint8_t **label_ptr = &label->label_ptr[0];
+
+    s_bits = opc & 3;
+
+    /* resolve label address */
+    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+    }
+
+#if TCG_TARGET_REG_BITS == 32
+    tcg_out_pushi(s, mem_index);
+    stack_adjust = 4;
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out_push(s, addrhi_reg);
+        stack_adjust += 4;
+    }
+    tcg_out_push(s, addrlo_reg);
+    stack_adjust += 4;
+    tcg_out_push(s, TCG_AREG0);
+    stack_adjust += 4;
+#else
+    /* The first argument is already loaded with addrlo.  */
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[1],
+                 mem_index);
+    /* XXX/FIXME: suboptimal */
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3], TCG_REG_L2);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2], TCG_REG_L1);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1], TCG_REG_L0);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
+#endif
+
+    /* Code generation of qemu_ld/st's slow path calling MMU helper
+
+       PRE_PROC ...
+       call MMU helper
+       jmp POST_PROC (2b) : short forward jump <- GETRA()
+       jmp next_code (5b) : dummy long backward jump which is never executed
+       POST_PROC ... : do post-processing <- GETRA() + 7
+       jmp next_code : jump to the code corresponding to next IR of qemu_ld/st
+    */
+
+    tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
+
+    /* Jump to post-processing code */
     tcg_out8(s, OPC_JMP_short);
-    label_ptr[2] = s->code_ptr;
-    s->code_ptr++;
+    tcg_out8(s, 5);
+    /* Dummy backward jump having information of fast path'pc for MMU helpers */
+    tcg_out8(s, OPC_JMP_long);
+    *(int32_t *)s->code_ptr = (int32_t)(raddr - s->code_ptr - 4);
+    s->code_ptr += 4;

-    /* TLB Miss.  */
+#if TCG_TARGET_REG_BITS == 32
+    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
+        /* Pop and discard.  This is 2 bytes smaller than the add.  */
+        tcg_out_pop(s, TCG_REG_ECX);
+    } else if (stack_adjust != 0) {
+        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
+    }
+#endif

-    /* label1: */
-    *label_ptr[0] = s->code_ptr - label_ptr[0] - 1;
+    switch (opc) {
+    case 0 | 4:
+        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
+        break;
+    case 1 | 4:
+        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
+        break;
+    case 0:
+        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
+        break;
+    case 1:
+        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
+        break;
+    case 2:
+        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+        break;
+#if TCG_TARGET_REG_BITS == 64
+    case 2 | 4:
+        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
+        break;
+#endif
+    case 3:
+        if (TCG_TARGET_REG_BITS == 64) {
+            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
+        } else if (data_reg == TCG_REG_EDX) {
+            /* xchg %edx, %eax */
+            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
+        } else {
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
+        }
+        break;
+    default:
+        tcg_abort();
+    }
+
+    /* Jump to the code corresponding to next IR of qemu_st */
+    tcg_out_jmp(s, (tcg_target_long)raddr);
+}
+
+/*
+ * Generate code for the slow path for a store at the end of block
+ */
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+    int s_bits;
+    int stack_adjust;
+    int opc = label->opc;
+    int mem_index = label->mem_index;
+    int data_reg = label->datalo_reg;
+#if TCG_TARGET_REG_BITS == 32
+    int data_reg2 = label->datahi_reg;
+    int addrlo_reg = label->addrlo_reg;
+    int addrhi_reg = label->addrhi_reg;
+#endif
+    uint8_t *raddr = label->raddr;
+    uint8_t **label_ptr = &label->label_ptr[0];
+
+    s_bits = opc & 3;
+
+    /* resolve label address */
+    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        *label_ptr[1] = s->code_ptr - label_ptr[1] - 1;
+        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
     }

-    /* XXX: move that code at the end of the TB */
 #if TCG_TARGET_REG_BITS == 32
     tcg_out_pushi(s, mem_index);
     stack_adjust = 4;
@@ -1417,10 +1551,10 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     tcg_out_push(s, data_reg);
     stack_adjust += 4;
     if (TARGET_LONG_BITS == 64) {
-        tcg_out_push(s, args[addrlo_idx + 1]);
+        tcg_out_push(s, addrhi_reg);
         stack_adjust += 4;
     }
-    tcg_out_push(s, args[addrlo_idx]);
+    tcg_out_push(s, addrlo_reg);
     stack_adjust += 4;
     tcg_out_push(s, TCG_AREG0);
     stack_adjust += 4;
@@ -1436,8 +1570,26 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
 #endif

+    /* Code generation of qemu_ld/st's slow path calling MMU helper
+
+       PRE_PROC ...
+       call MMU helper
+       jmp POST_PROC (2b) : short forward jump <- GETRA()
+       jmp next_code (5b) : dummy long backward jump which is never executed
+       POST_PROC ... : do post-processing <- GETRA() + 7
+       jmp next_code : jump to the code corresponding to next IR of qemu_ld/st
+    */
+
     tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);

+    /* Jump to post-processing code */
+    tcg_out8(s, OPC_JMP_short);
+    tcg_out8(s, 5);
+    /* Dummy backward jump having information of fast path'pc for MMU helpers */
+    tcg_out8(s, OPC_JMP_long);
+    *(int32_t *)s->code_ptr = (int32_t)(raddr - s->code_ptr - 4);
+    s->code_ptr += 4;
+
     if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
         /* Pop and discard.  This is 2 bytes smaller than the add.  */
         tcg_out_pop(s, TCG_REG_ECX);
@@ -1445,32 +1597,29 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
         tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
     }

-    /* label2: */
-    *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
-#else
-    {
-        int32_t offset = GUEST_BASE;
-        int base = args[addrlo_idx];
-
-        if (TCG_TARGET_REG_BITS == 64) {
-            /* ??? We assume all operations have left us with register
-               contents that are zero extended.  So far this appears to
-               be true.  If we want to enforce this, we can either do
-               an explicit zero-extension here, or (if GUEST_BASE == 0)
-               use the ADDR32 prefix.  For now, do nothing.  */
+    /* Jump to the code corresponding to next IR of qemu_st */
+    tcg_out_jmp(s, (tcg_target_long)raddr);
+}

-            if (offset != GUEST_BASE) {
-                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L0, GUEST_BASE);
-                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L0, base);
-                base = TCG_REG_L0;
-                offset = 0;
-            }
+/*
+ * Generate TB finalization at the end of block
+ */
+void tcg_out_tb_finalize(TCGContext *s)
+{
+    int i;
+    TCGLabelQemuLdst *label;
+
+    /* qemu_ld/st slow paths */
+    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
+        label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[i];
+        if (label->is_ld) {
+            tcg_out_qemu_ld_slow_path(s, label);
+        } else {
+            tcg_out_qemu_st_slow_path(s, label);
         }
-
-        tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, opc);
     }
-#endif
 }
+#endif  /* CONFIG_SOFTMMU */

 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 32cd0c6..d224f61 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -303,6 +303,14 @@  void tcg_func_start(TCGContext *s)

     gen_opc_ptr = gen_opc_buf;
     gen_opparam_ptr = gen_opparam_buf;
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+    /* Initialize qemu_ld/st labels to assist code generation at the end of TB
+       for TLB miss cases at the end of TB */
+    s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) *
+                                     TCG_MAX_QEMU_LDST);
+    s->nb_qemu_ldst_labels = 0;
+#endif
 }

 static inline void tcg_temp_alloc(TCGContext *s, int n)
@@ -2164,6 +2172,10 @@  static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
 #endif
     }
  the_end:
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+    /* Generate TB finalization at the end of block */
+    tcg_out_tb_finalize(s);
+#endif
     return -1;
 }

diff --git a/tcg/tcg.h b/tcg/tcg.h
index 7bafe0e..c930d1a 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -188,6 +188,24 @@  typedef tcg_target_ulong TCGArg;
    are aliases for target_ulong and host pointer sized values respectively.
  */

+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+/* Macros/structures for qemu_ld/st IR code optimization:
+   TCG_MAX_HELPER_LABELS is defined as same as OPC_BUF_SIZE in exec-all.h. */
+#define TCG_MAX_QEMU_LDST       640
+
+typedef struct TCGLabelQemuLdst {
+    int is_ld:1;            /* qemu_ld: 1, qemu_st: 0 */
+    int opc:4;
+    int addrlo_reg;         /* reg index for low word of guest virtual addr */
+    int addrhi_reg;         /* reg index for high word of guest virtual addr */
+    int datalo_reg;         /* reg index for low word to be loaded or stored */
+    int datahi_reg;         /* reg index for high word to be loaded or stored */
+    int mem_index;          /* soft MMU memory index */
+    uint8_t *raddr;         /* gen code addr of the next IR of qemu_ld/st IR */
+    uint8_t *label_ptr[2];  /* label pointers to be updated */
+} TCGLabelQemuLdst;
+#endif
+
 #ifdef CONFIG_DEBUG_TCG
 #define DEBUG_TCGV 1
 #endif
@@ -422,6 +440,13 @@  struct TCGContext {
     int temps_in_use;
     int goto_tb_issue_mask;
 #endif
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+    /* labels info for qemu_ld/st IRs
+       The labels help to generate TLB miss case codes at the end of TB */
+    TCGLabelQemuLdst *qemu_ldst_labels;
+    int nb_qemu_ldst_labels;
+#endif
 };

 extern TCGContext tcg_ctx;
@@ -625,3 +650,8 @@  extern uint8_t code_gen_prologue[];
 #endif

 void tcg_register_jit(void *buf, size_t buf_size);
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+/* Generate TB finalization at the end of block */
+void tcg_out_tb_finalize(TCGContext *s);
+#endif