
[RFC,v4,3/3] tcg: Optimize qemu_ld/st by generating slow paths at the end of a block

Message ID 1343201734-12062-4-git-send-email-yeongkyoon.lee@samsung.com
State: New

Commit Message

YeongKyoon Lee July 25, 2012, 7:35 a.m. UTC
Add optimized TCG qemu_ld/st generation which places the TLB-miss case code
at the end of a block, after the other IRs have been generated.
Currently, this optimization supports only i386 and x86_64 hosts.

Signed-off-by: Yeongkyoon Lee <yeongkyoon.lee@samsung.com>
---
 tcg/i386/tcg-target.c |  475 +++++++++++++++++++++++++++++++------------------
 tcg/tcg.c             |   12 ++
 tcg/tcg.h             |   35 ++++
 3 files changed, 353 insertions(+), 169 deletions(-)
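
For orientation, the mechanism of the patch can be reduced to the following
self-contained sketch (simplified, standalone C; LdstLabel, emit_fast_path and
emit_slow_paths are illustrative names, not the patch's): while emitting the
fast path, record where the forward jne displacement lives; after the whole
block has been generated, emit every slow path and patch the recorded 32-bit
displacements to point at it. This corresponds to add_qemu_ldst_label() and
tcg_out_qemu_ldst_slow_path() in the patch below.

/* Minimal sketch of "slow paths at the end of the block" (not QEMU code). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_LDST 640                        /* mirrors TCG_MAX_QEMU_LDST */

typedef struct {
    uint8_t *label_ptr;                     /* address of the 4-byte jne rel32 field */
    uint8_t *raddr;                         /* fast-path address to resume at */
} LdstLabel;

static uint8_t buf[256], *code_ptr = buf;
static LdstLabel labels[MAX_LDST];
static int nb_labels;

static void emit_fast_path(void)
{
    /* ... TLB compare would be emitted here ... */
    *code_ptr++ = 0x0f;                     /* jne rel32 (OPC_JCC_long + JCC_JNE) */
    *code_ptr++ = 0x85;
    labels[nb_labels].label_ptr = code_ptr; /* displacement patched later */
    memset(code_ptr, 0, 4);
    code_ptr += 4;
    /* ... TLB-hit load/store would be emitted here ... */
    labels[nb_labels].raddr = code_ptr;     /* where the slow path jumps back to */
    nb_labels++;
}

static void emit_slow_paths(void)
{
    for (int i = 0; i < nb_labels; i++) {
        /* patch the jne displacement to point at the slow path emitted here */
        int32_t disp = (int32_t)(code_ptr - labels[i].label_ptr - 4);
        memcpy(labels[i].label_ptr, &disp, 4);
        /* ... call the MMU helper, then jump back to labels[i].raddr ... */
    }
}

int main(void)
{
    emit_fast_path();                       /* one guest load in the block */
    emit_slow_paths();                      /* all TLB-miss cases gathered at the end */
    printf("fast path: %td bytes, slow paths: %d\n",
           (ptrdiff_t)(labels[0].raddr - buf), nb_labels);
    return 0;
}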

Comments

Richard Henderson July 25, 2012, 2 p.m. UTC | #1
On 07/25/2012 12:35 AM, Yeongkyoon Lee wrote:
> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
> +/* Macros/structures for qemu_ld/st IR code optimization:
> +   TCG_MAX_HELPER_LABELS is defined as same as OPC_BUF_SIZE in exec-all.h. */
> +#define TCG_MAX_QEMU_LDST       640

Why statically size this ...

> +    /* labels info for qemu_ld/st IRs
> +       The labels help to generate TLB miss case codes at the end of TB */
> +    TCGLabelQemuLdst *qemu_ldst_labels;

... and then allocate the array dynamically?

> +    /* jne slow_path */
> +    /* XXX: How to avoid using OPC_JCC_long for peephole optimization? */
> +    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);

You can't, not and maintain the code-generate-until-address-reached
exception invariant.

> +#ifndef CONFIG_QEMU_LDST_OPTIMIZATION
>  uint8_t __ldb_mmu(target_ulong addr, int mmu_idx);
>  void __stb_mmu(target_ulong addr, uint8_t val, int mmu_idx);
>  uint16_t __ldw_mmu(target_ulong addr, int mmu_idx);
> @@ -28,6 +30,30 @@ void __stl_cmmu(target_ulong addr, uint32_t val, int mmu_idx);
>  uint64_t __ldq_cmmu(target_ulong addr, int mmu_idx);
>  void __stq_cmmu(target_ulong addr, uint64_t val, int mmu_idx);
>  #else
> +/* Extended versions of MMU helpers for qemu_ld/st optimization.
> +   The additional argument is a host code address accessing guest memory */
> +uint8_t ext_ldb_mmu(target_ulong addr, int mmu_idx, uintptr_t ra);

Don't tie LDST_OPTIMIZATION directly to the extended function calls.

For a host supporting predication, like ARM, the best code sequence
may look like

	(1) TLB check
	(2) If hit, load value from memory
	(3) If miss, call miss case (5)
	(4) ... next code
	...
	(5) Load call parameters
	(6) Tail call (aka jump) to MMU helper

so that (a) we need not explicitly load the address of (3) by hand
for your RA parameter and (b) the mmu helper returns directly to (4).


r~
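
For context on the RA parameter: the legacy helpers recover the host address
of the guest-memory access from their own return address (GETPC), which works
while the call sits inline right after the access. Once the call is moved into
an out-of-line slow path at the end of the TB, that return address points into
the slow path instead, so the patch passes the recorded fast-path address
explicitly. A minimal sketch of the difference (stand-in names, not QEMU's
helpers; GCC builtins assumed):

#include <stdint.h>
#include <stdio.h>

/* Legacy-style helper: it can only see its caller's address, which, for a
   call emitted in an out-of-line slow path, lies inside that slow path. */
static void __attribute__((noinline)) legacy_helper(void)
{
    printf("legacy helper: retaddr %p (inside the slow path)\n",
           __builtin_return_address(0));
}

/* Extended-style helper: the generated slow path passes the recorded
   fast-path address (label->raddr in the patch) as an extra argument. */
static void ext_helper(uintptr_t ra)
{
    printf("extended helper: guest-access site %p\n", (void *)ra);
}

int main(void)
{
    legacy_helper();
    ext_helper((uintptr_t)&legacy_helper);  /* any code address, standing in
                                               for the recorded raddr */
    return 0;
}

With the tail-call sequence above, the return address left by the call at (3)
already points right after the access, so no extra argument would need to be
materialized, which is point (a).
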
YeongKyoon Lee July 28, 2012, 3:39 p.m. UTC | #2
On 07/25/2012 23:00, Richard Henderson wrote:
> On 07/25/2012 12:35 AM, Yeongkyoon Lee wrote:
>> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
>> +/* Macros/structures for qemu_ld/st IR code optimization:
>> +   TCG_MAX_HELPER_LABELS is defined as same as OPC_BUF_SIZE in exec-all.h. */
>> +#define TCG_MAX_QEMU_LDST       640
> Why statically size this ...

This just followed the existing TCG code style, namely the allocation of the
"labels" array of TCGContext in tcg.c.


>
>> +    /* labels info for qemu_ld/st IRs
>> +       The labels help to generate TLB miss case codes at the end of TB */
>> +    TCGLabelQemuLdst *qemu_ldst_labels;
> ... and then allocate the array dynamically?

ditto.

>
>> +    /* jne slow_path */
>> +    /* XXX: How to avoid using OPC_JCC_long for peephole optimization? */
>> +    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
> You can't, not and maintain the code-generate-until-address-reached
> exception invariant.
>
>> +#ifndef CONFIG_QEMU_LDST_OPTIMIZATION
>>   uint8_t __ldb_mmu(target_ulong addr, int mmu_idx);
>>   void __stb_mmu(target_ulong addr, uint8_t val, int mmu_idx);
>>   uint16_t __ldw_mmu(target_ulong addr, int mmu_idx);
>> @@ -28,6 +30,30 @@ void __stl_cmmu(target_ulong addr, uint32_t val, int mmu_idx);
>>   uint64_t __ldq_cmmu(target_ulong addr, int mmu_idx);
>>   void __stq_cmmu(target_ulong addr, uint64_t val, int mmu_idx);
>>   #else
>> +/* Extended versions of MMU helpers for qemu_ld/st optimization.
>> +   The additional argument is a host code address accessing guest memory */
>> +uint8_t ext_ldb_mmu(target_ulong addr, int mmu_idx, uintptr_t ra);
> Don't tie LDST_OPTIMIZATION directly to the extended function calls.
>
> For a host supporting predication, like ARM, the best code sequence
> may look like
>
> 	(1) TLB check
> 	(2) If hit, load value from memory
> 	(3) If miss, call miss case (5)
> 	(4) ... next code
> 	...
> 	(5) Load call parameters
> 	(6) Tail call (aka jump) to MMU helper
>
> so that (a) we need not explicitly load the address of (3) by hand
> for your RA parameter and (b) the mmu helper returns directly to (4).
>
>
> r~

The difference between the current HEAD and the code sequence you describe
is, I think, code locality.
My LDST_OPTIMIZATION patches enhance the code locality and also remove
one jump.
It shows about a 4% rise in CoreMark performance on an x86 host which
supports predication like ARM.
The performance enhancement for AREG0 cases might be even larger.
I'm not sure yet where the performance enhancement comes from, and I'll
check it with some tests later.

In my humble opinion, there is nothing to lose in LDST_OPTIMIZATION except
for implicitly adding one argument to the MMU helpers, which doesn't look
so critical.
What is your opinion?

Thanks.
YeongKyoon Lee Aug. 27, 2012, 7:23 a.m. UTC | #3
On 07/29/2012 00:39, Yeongkyoon Lee wrote:
> On 07/25/2012 23:00, Richard Henderson wrote:
>> On 07/25/2012 12:35 AM, Yeongkyoon Lee wrote:
>>> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
>>> +/* Macros/structures for qemu_ld/st IR code optimization:
>>> +   TCG_MAX_HELPER_LABELS is defined as same as OPC_BUF_SIZE in 
>>> exec-all.h. */
>>> +#define TCG_MAX_QEMU_LDST       640
>> Why statically size this ...
>
> This just followed the other TCG's code style, the allocation of the 
> "labels" of "TCGContext" in tcg.c.
>
>
>>
>>> +    /* labels info for qemu_ld/st IRs
>>> +       The labels help to generate TLB miss case codes at the end 
>>> of TB */
>>> +    TCGLabelQemuLdst *qemu_ldst_labels;
>> ... and then allocate the array dynamically?
>
> ditto.
>
>>
>>> +    /* jne slow_path */
>>> +    /* XXX: How to avoid using OPC_JCC_long for peephole 
>>> optimization? */
>>> +    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
>> You can't, not and maintain the code-generate-until-address-reached
>> exception invariant.
>>
>>> +#ifndef CONFIG_QEMU_LDST_OPTIMIZATION
>>>   uint8_t __ldb_mmu(target_ulong addr, int mmu_idx);
>>>   void __stb_mmu(target_ulong addr, uint8_t val, int mmu_idx);
>>>   uint16_t __ldw_mmu(target_ulong addr, int mmu_idx);
>>> @@ -28,6 +30,30 @@ void __stl_cmmu(target_ulong addr, uint32_t val, 
>>> int mmu_idx);
>>>   uint64_t __ldq_cmmu(target_ulong addr, int mmu_idx);
>>>   void __stq_cmmu(target_ulong addr, uint64_t val, int mmu_idx);
>>>   #else
>>> +/* Extended versions of MMU helpers for qemu_ld/st optimization.
>>> +   The additional argument is a host code address accessing guest 
>>> memory */
>>> +uint8_t ext_ldb_mmu(target_ulong addr, int mmu_idx, uintptr_t ra);
>> Don't tie LDST_OPTIMIZATION directly to the extended function calls.
>>
>> For a host supporting predication, like ARM, the best code sequence
>> may look like
>>
>>     (1) TLB check
>>     (2) If hit, load value from memory
>>     (3) If miss, call miss case (5)
>>     (4) ... next code
>>     ...
>>     (5) Load call parameters
>>     (6) Tail call (aka jump) to MMU helper
>>
>> so that (a) we need not explicitly load the address of (3) by hand
>> for your RA parameter and (b) the mmu helper returns directly to (4).
>>
>>
>> r~
>
> The difference between current HEAD and the code sequence you said is, 
> I think, code locality.
> My LDST_OPTIMIZATION patches enhances the code locality and also 
> removes one jump.
> It shows about 4% rising of CoreMark performance on x86 host which 
> supports predication like ARM.
> Probably, the performance enhancement for AREG0 cases might get more 
> larger.
> I'm not sure where the performance enhancement came from now, and I'll 
> check it by some tests later.
>
> In my humble opinion, there are no things to lose in LDST_OPTIMIZATION 
> except
> for just adding one argument to MMU helper implicitly which doesn't 
> look so critical.
> How about your opinion?
>
> Thanks.
>

It's been a long time.

I've tested the performance impact of the one-jump difference in the fast
path of qemu_ld/st (TLB hit).
The result shows a 3.6% CoreMark improvement from removing one jump, with
slow paths generated at the end of the block in both cases.
That means that removing one jump accounts for the majority of the
performance enhancement from LDST_OPTIMIZATION.
As a result, the extended MMU helper functions are needed to attain that
performance gain, and those extended functions are only used implicitly.

BTW, who will finally confirm my patches?
I have sent four versions of my patches, in which I have applied all the
reasonable feedback from this community.
Currently, v4 is the final candidate, though it might need a rebase against
the latest HEAD because it was sent a month ago.

Thanks.
Blue Swirl Aug. 27, 2012, 6:24 p.m. UTC | #4
On Mon, Aug 27, 2012 at 7:23 AM, Yeongkyoon Lee
<yeongkyoon.lee@samsung.com> wrote:
> On 07/29/2012 00:39, Yeongkyoon Lee wrote:
>>
>> On 07/25/2012 23:00, Richard Henderson wrote:
>>>
>>> On 07/25/2012 12:35 AM, Yeongkyoon Lee wrote:
>>>>
>>>> +#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
>>>> +/* Macros/structures for qemu_ld/st IR code optimization:
>>>> +   TCG_MAX_HELPER_LABELS is defined as same as OPC_BUF_SIZE in
>>>> exec-all.h. */
>>>> +#define TCG_MAX_QEMU_LDST       640
>>>
>>> Why statically size this ...
>>
>>
>> This just followed the other TCG's code style, the allocation of the
>> "labels" of "TCGContext" in tcg.c.
>>
>>
>>>
>>>> +    /* labels info for qemu_ld/st IRs
>>>> +       The labels help to generate TLB miss case codes at the end of TB
>>>> */
>>>> +    TCGLabelQemuLdst *qemu_ldst_labels;
>>>
>>> ... and then allocate the array dynamically?
>>
>>
>> ditto.
>>
>>>
>>>> +    /* jne slow_path */
>>>> +    /* XXX: How to avoid using OPC_JCC_long for peephole optimization?
>>>> */
>>>> +    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
>>>
>>> You can't, not and maintain the code-generate-until-address-reached
>>> exception invariant.
>>>
>>>> +#ifndef CONFIG_QEMU_LDST_OPTIMIZATION
>>>>   uint8_t __ldb_mmu(target_ulong addr, int mmu_idx);
>>>>   void __stb_mmu(target_ulong addr, uint8_t val, int mmu_idx);
>>>>   uint16_t __ldw_mmu(target_ulong addr, int mmu_idx);
>>>> @@ -28,6 +30,30 @@ void __stl_cmmu(target_ulong addr, uint32_t val, int
>>>> mmu_idx);
>>>>   uint64_t __ldq_cmmu(target_ulong addr, int mmu_idx);
>>>>   void __stq_cmmu(target_ulong addr, uint64_t val, int mmu_idx);
>>>>   #else
>>>> +/* Extended versions of MMU helpers for qemu_ld/st optimization.
>>>> +   The additional argument is a host code address accessing guest
>>>> memory */
>>>> +uint8_t ext_ldb_mmu(target_ulong addr, int mmu_idx, uintptr_t ra);
>>>
>>> Don't tie LDST_OPTIMIZATION directly to the extended function calls.
>>>
>>> For a host supporting predication, like ARM, the best code sequence
>>> may look like
>>>
>>>     (1) TLB check
>>>     (2) If hit, load value from memory
>>>     (3) If miss, call miss case (5)
>>>     (4) ... next code
>>>     ...
>>>     (5) Load call parameters
>>>     (6) Tail call (aka jump) to MMU helper
>>>
>>> so that (a) we need not explicitly load the address of (3) by hand
>>> for your RA parameter and (b) the mmu helper returns directly to (4).
>>>
>>>
>>> r~
>>
>>
>> The difference between current HEAD and the code sequence you said is, I
>> think, code locality.
>> My LDST_OPTIMIZATION patches enhances the code locality and also removes
>> one jump.
>> It shows about 4% rising of CoreMark performance on x86 host which
>> supports predication like ARM.
>> Probably, the performance enhancement for AREG0 cases might get more
>> larger.
>> I'm not sure where the performance enhancement came from now, and I'll
>> check it by some tests later.
>>
>> In my humble opinion, there are no things to lose in LDST_OPTIMIZATION
>> except
>> for just adding one argument to MMU helper implicitly which doesn't look
>> so critical.
>> How about your opinion?
>>
>> Thanks.
>>
>
> It's been a long time.
>
> I've tested the performances of one jump difference when fast qemu_ld/st
> (TLB hit).
> The result shows 3.6% CoreMark enhancement when reducing one jump where slow
> paths are generated at the end of block as same for the both cases.
> That means reducing one jump dominates the majority of performance
> enhancement from LDST_OPTIMIZATION.
> As a result, it needs extended MMU helper functions for attaining that
> performance rising, and those extended functions are used only implicitly.
>
> BTW, who will finally confirm my patches?
> I have sent four version of my patches in which I have applied all the
> reasonable feedbacks from this community.
> Currently, v4 is the final candidate though it might need merge with latest
> HEAD because it was sent 1 month before.

I think the patches should be applied when 1.3 development opens.

>
> Thanks.
>
>
Peter Maydell Aug. 27, 2012, 6:31 p.m. UTC | #5
On 27 August 2012 08:23, Yeongkyoon Lee <yeongkyoon.lee@samsung.com> wrote:
> BTW, who will finally confirm my patches?
> I have sent four version of my patches in which I have applied all the
> reasonable feedbacks from this community.

If you'd like your patches committed you should not use the "[RFC]" tag
in the Subject, because "RFC" means "I would like feedback on this
patch but do not intend it to be committed to master".

-- PMM
YeongKyoon Lee Aug. 28, 2012, 6:38 a.m. UTC | #6
On 08/28/2012 03:31, Peter Maydell wrote:
> On 27 August 2012 08:23, Yeongkyoon Lee <yeongkyoon.lee@samsung.com> wrote:
>> BTW, who will finally confirm my patches?
>> I have sent four version of my patches in which I have applied all the
>> reasonable feedbacks from this community.
> If you'd like your patches committed you should not use the "[RFC]" tag
> in the Subject, because "RFC" means "I would like feedback on this
> patch but do not intend it to be committed to master".
>
> -- PMM
>

Thanks, that's very useful information!
YeongKyoon Lee Aug. 28, 2012, 6:52 a.m. UTC | #7
>> It's been a long time.
>>
>> I've tested the performances of one jump difference when fast qemu_ld/st
>> (TLB hit).
>> The result shows 3.6% CoreMark enhancement when reducing one jump where slow
>> paths are generated at the end of block as same for the both cases.
>> That means reducing one jump dominates the majority of performance
>> enhancement from LDST_OPTIMIZATION.
>> As a result, it needs extended MMU helper functions for attaining that
>> performance rising, and those extended functions are used only implicitly.
>>
>> BTW, who will finally confirm my patches?
>> I have sent four version of my patches in which I have applied all the
>> reasonable feedbacks from this community.
>> Currently, v4 is the final candidate though it might need merge with latest
>> HEAD because it was sent 1 month before.
> I think the patches should be applied when 1.3 development opens.
>

Thanks for your reply.
When do you estimate 1.3 development will open?
Blue Swirl Aug. 28, 2012, 4:58 p.m. UTC | #8
On Tue, Aug 28, 2012 at 6:52 AM, Yeongkyoon Lee
<yeongkyoon.lee@samsung.com> wrote:
>
>>> It's been a long time.
>>>
>>> I've tested the performances of one jump difference when fast qemu_ld/st
>>> (TLB hit).
>>> The result shows 3.6% CoreMark enhancement when reducing one jump where
>>> slow
>>> paths are generated at the end of block as same for the both cases.
>>> That means reducing one jump dominates the majority of performance
>>> enhancement from LDST_OPTIMIZATION.
>>> As a result, it needs extended MMU helper functions for attaining that
>>> performance rising, and those extended functions are used only
>>> implicitly.
>>>
>>> BTW, who will finally confirm my patches?
>>> I have sent four version of my patches in which I have applied all the
>>> reasonable feedbacks from this community.
>>> Currently, v4 is the final candidate though it might need merge with
>>> latest
>>> HEAD because it was sent 1 month before.
>>
>> I think the patches should be applied when 1.3 development opens.
>>
>
> Thanks for your reply.
> How do you estimate when 1.3 development is open?

2012-09-05 according to http://wiki.qemu.org/Planning/1.2
Andreas Färber Aug. 28, 2012, 5:18 p.m. UTC | #9
On 08/27/2012 20:31, Peter Maydell wrote:
> On 27 August 2012 08:23, Yeongkyoon Lee <yeongkyoon.lee@samsung.com> wrote:
>> BTW, who will finally confirm my patches?
>> I have sent four version of my patches in which I have applied all the
>> reasonable feedbacks from this community.
> 
> If you'd like your patches committed you should not use the "[RFC]" tag
> in the Subject, because "RFC" means "I would like feedback on this
> patch but do not intend it to be committed to master".

Literally, RFC means request for comments.

Personally I differentiate between [RFC n/m] and [PATCH RFC n/m], where
the lack of PATCH means "don't commit this version" and the latter means
"I'm not so sure if this is how we want to do it, but if people agree it
can go in". ;)

I'm not sure how [RFC][PATCH n/m] is intended here. If everyone adds RFC to
a regular PATCH, it loses meaning. In the course of review, when you feel
the patches are okay to be committed, RFC should disappear, as you may well
get comments without asking for them anyway. :)

HTE,
Andreas

Patch

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index da17bba..20c6ba5 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -966,43 +966,53 @@  static void tcg_out_jmp(TCGContext *s, tcg_target_long dest)
 #include "../../softmmu_defs.h"
 
 #ifdef CONFIG_TCG_PASS_AREG0
-/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
-   int mmu_idx) */
+/* extended helper signature: ext_helper_ld_mmu(CPUState *env,
+   target_ulong addr, int mmu_idx, uintptr_t raddr) */
 static const void *qemu_ld_helpers[4] = {
-    helper_ldb_mmu,
-    helper_ldw_mmu,
-    helper_ldl_mmu,
-    helper_ldq_mmu,
+    ext_helper_ldb_mmu,
+    ext_helper_ldw_mmu,
+    ext_helper_ldl_mmu,
+    ext_helper_ldq_mmu,
 };
 
-/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
-   uintxx_t val, int mmu_idx) */
+/* extended helper signature: ext_helper_st_mmu(CPUState *env,
+   target_ulong addr, uintxx_t val, int mmu_idx, uintptr_t raddr) */
 static const void *qemu_st_helpers[4] = {
-    helper_stb_mmu,
-    helper_stw_mmu,
-    helper_stl_mmu,
-    helper_stq_mmu,
+    ext_helper_stb_mmu,
+    ext_helper_stw_mmu,
+    ext_helper_stl_mmu,
+    ext_helper_stq_mmu,
 };
 #else
-/* legacy helper signature: __ld_mmu(target_ulong addr, int
-   mmu_idx) */
+/* extended legacy helper signature: ext_ld_mmu(target_ulong addr,
+   int mmu_idx, uintptr_t raddr) */
 static void *qemu_ld_helpers[4] = {
-    __ldb_mmu,
-    __ldw_mmu,
-    __ldl_mmu,
-    __ldq_mmu,
+    ext_ldb_mmu,
+    ext_ldw_mmu,
+    ext_ldl_mmu,
+    ext_ldq_mmu,
 };
 
-/* legacy helper signature: __st_mmu(target_ulong addr, uintxx_t val,
-   int mmu_idx) */
+/* extended legacy helper signature: ext_st_mmu(target_ulong addr,
+   uintxx_t val, int mmu_idx, uintptr_t raddr) */
 static void *qemu_st_helpers[4] = {
-    __stb_mmu,
-    __stw_mmu,
-    __stl_mmu,
-    __stq_mmu,
+    ext_stb_mmu,
+    ext_stw_mmu,
+    ext_stl_mmu,
+    ext_stq_mmu,
 };
 #endif
 
+static void add_qemu_ldst_label(TCGContext *s,
+                                int opc_ext,
+                                int data_reg,
+                                int data_reg2,
+                                int addrlo_reg,
+                                int addrhi_reg,
+                                int mem_index,
+                                uint8_t *raddr,
+                                uint8_t **label_ptr);
+
 /* Perform the TLB load and compare.
 
    Inputs:
@@ -1061,19 +1071,21 @@  static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,
 
     tcg_out_mov(s, type, r0, addrlo);
 
-    /* jne label1 */
-    tcg_out8(s, OPC_JCC_short + JCC_JNE);
+    /* jne slow_path */
+    /* XXX: How to avoid using OPC_JCC_long for peephole optimization? */
+    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
     label_ptr[0] = s->code_ptr;
-    s->code_ptr++;
+    s->code_ptr += 4;
 
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
         /* cmp 4(r1), addrhi */
         tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4);
 
-        /* jne label1 */
-        tcg_out8(s, OPC_JCC_short + JCC_JNE);
+        /* jne slow_path */
+        /* XXX: How to avoid using OPC_JCC_long for peephole optimization? */
+        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
         label_ptr[1] = s->code_ptr;
-        s->code_ptr++;
+        s->code_ptr += 4;
     }
 
     /* TLB Hit.  */
@@ -1171,12 +1183,7 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     int addrlo_idx;
 #if defined(CONFIG_SOFTMMU)
     int mem_index, s_bits;
-#if TCG_TARGET_REG_BITS == 64
-    int arg_idx;
-#else
-    int stack_adjust;
-#endif
-    uint8_t *label_ptr[3];
+    uint8_t *label_ptr[2];
 #endif
 
     data_reg = args[0];
@@ -1197,101 +1204,16 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     tcg_out_qemu_ld_direct(s, data_reg, data_reg2,
                            tcg_target_call_iarg_regs[0], 0, opc);
 
-    /* jmp label2 */
-    tcg_out8(s, OPC_JMP_short);
-    label_ptr[2] = s->code_ptr;
-    s->code_ptr++;
-
-    /* TLB Miss.  */
-
-    /* label1: */
-    *label_ptr[0] = s->code_ptr - label_ptr[0] - 1;
-    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        *label_ptr[1] = s->code_ptr - label_ptr[1] - 1;
-    }
-
-    /* XXX: move that code at the end of the TB */
-#if TCG_TARGET_REG_BITS == 32
-    tcg_out_pushi(s, mem_index);
-    stack_adjust = 4;
-    if (TARGET_LONG_BITS == 64) {
-        tcg_out_push(s, args[addrlo_idx + 1]);
-        stack_adjust += 4;
-    }
-    tcg_out_push(s, args[addrlo_idx]);
-    stack_adjust += 4;
-#ifdef CONFIG_TCG_PASS_AREG0
-    tcg_out_push(s, TCG_AREG0);
-    stack_adjust += 4;
-#endif
-#else
-    /* The first argument is already loaded with addrlo.  */
-    arg_idx = 1;
-    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx],
-                 mem_index);
-#ifdef CONFIG_TCG_PASS_AREG0
-    /* XXX/FIXME: suboptimal */
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
-                tcg_target_call_iarg_regs[2]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
-                tcg_target_call_iarg_regs[1]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
-                tcg_target_call_iarg_regs[0]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
-                TCG_AREG0);
-#endif
-#endif
-
-    tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
-
-#if TCG_TARGET_REG_BITS == 32
-    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
-        /* Pop and discard.  This is 2 bytes smaller than the add.  */
-        tcg_out_pop(s, TCG_REG_ECX);
-    } else if (stack_adjust != 0) {
-        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
-    }
-#endif
-
-    switch(opc) {
-    case 0 | 4:
-        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
-        break;
-    case 1 | 4:
-        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
-        break;
-    case 0:
-        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
-        break;
-    case 1:
-        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
-        break;
-    case 2:
-        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-        break;
-#if TCG_TARGET_REG_BITS == 64
-    case 2 | 4:
-        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
-        break;
-#endif
-    case 3:
-        if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
-        } else if (data_reg == TCG_REG_EDX) {
-            /* xchg %edx, %eax */
-            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
-        }
-        break;
-    default:
-        tcg_abort();
-    }
-
-    /* label2: */
-    *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
+    /* Record the current context of a load into ldst label */
+    add_qemu_ldst_label(s,
+                        opc,
+                        data_reg,
+                        data_reg2,
+                        args[addrlo_idx],
+                        args[addrlo_idx + 1],
+                        mem_index,
+                        s->code_ptr,
+                        label_ptr);
 #else
     {
         int32_t offset = GUEST_BASE;
@@ -1385,8 +1307,7 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     int addrlo_idx;
 #if defined(CONFIG_SOFTMMU)
     int mem_index, s_bits;
-    int stack_adjust;
-    uint8_t *label_ptr[3];
+    uint8_t *label_ptr[2];
 #endif
 
     data_reg = args[0];
@@ -1407,34 +1328,242 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     tcg_out_qemu_st_direct(s, data_reg, data_reg2,
                            tcg_target_call_iarg_regs[0], 0, opc);
 
-    /* jmp label2 */
-    tcg_out8(s, OPC_JMP_short);
-    label_ptr[2] = s->code_ptr;
-    s->code_ptr++;
+    /* Record the current context of a store into ldst label */
+    add_qemu_ldst_label(s,
+                        opc | HL_ST_MASK,
+                        data_reg,
+                        data_reg2,
+                        args[addrlo_idx],
+                        args[addrlo_idx + 1],
+                        mem_index,
+                        s->code_ptr,
+                        label_ptr);
+#else
+    {
+        int32_t offset = GUEST_BASE;
+        int base = args[addrlo_idx];
+
+        if (TCG_TARGET_REG_BITS == 64) {
+            /* ??? We assume all operations have left us with register
+               contents that are zero extended.  So far this appears to
+               be true.  If we want to enforce this, we can either do
+               an explicit zero-extension here, or (if GUEST_BASE == 0)
+               use the ADDR32 prefix.  For now, do nothing.  */
+
+            if (offset != GUEST_BASE) {
+                tcg_out_movi(s, TCG_TYPE_I64,
+                             tcg_target_call_iarg_regs[0], GUEST_BASE);
+                tgen_arithr(s, ARITH_ADD + P_REXW,
+                            tcg_target_call_iarg_regs[0], base);
+                base = tcg_target_call_iarg_regs[0];
+                offset = 0;
+            }
+        }
+
+        tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, opc);
+    }
+#endif
+}
+
+#if defined(CONFIG_SOFTMMU)
+/*
+ * Record the context of a call to the out of line helper code for the slow path
+ * for a load or store, so that we can later generate the correct helper code
+ */
+static void add_qemu_ldst_label(TCGContext *s,
+                                int opc_ext,
+                                int data_reg,
+                                int data_reg2,
+                                int addrlo_reg,
+                                int addrhi_reg,
+                                int mem_index,
+                                uint8_t *raddr,
+                                uint8_t **label_ptr)
+{
+    int idx;
+    TCGLabelQemuLdst *label;
 
-    /* TLB Miss.  */
+    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
+        tcg_abort();
+    }
 
-    /* label1: */
-    *label_ptr[0] = s->code_ptr - label_ptr[0] - 1;
+    idx = s->nb_qemu_ldst_labels++;
+    label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
+    label->opc_ext = opc_ext;
+    label->datalo_reg = data_reg;
+    label->datahi_reg = data_reg2;
+    label->addrlo_reg = addrlo_reg;
+    label->addrhi_reg = addrhi_reg;
+    label->mem_index = mem_index;
+    label->raddr = raddr;
+    label->label_ptr[0] = label_ptr[0];
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        *label_ptr[1] = s->code_ptr - label_ptr[1] - 1;
+        label->label_ptr[1] = label_ptr[1];
     }
+}
 
-    /* XXX: move that code at the end of the TB */
+/*
+ * Generate code for the slow path for a load at the end of block
+ */
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+    int s_bits;
+    int opc = label->opc_ext & HL_OPC_MASK;
+    int mem_index = label->mem_index;
 #if TCG_TARGET_REG_BITS == 32
-    tcg_out_pushi(s, mem_index);
+    int stack_adjust;
+    int addrlo_reg = label->addrlo_reg;
+    int addrhi_reg = label->addrhi_reg;
+#endif
+    int data_reg = label->datalo_reg;
+    int data_reg2 = label->datahi_reg;
+    uint8_t *raddr = label->raddr;
+    uint8_t **label_ptr = &label->label_ptr[0];
+
+    s_bits = opc & 3;
+
+    /* resolve label address */
+    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+    }
+
+    /* extended legacy helper signature (w/o CONFIG_TCG_PASS_AREG0):
+       ext_ld_mmu(target_ulong addr, int mmu_idx, uintptr_t raddr) */
+#if TCG_TARGET_REG_BITS == 32
+    tcg_out_pushi(s, (tcg_target_ulong)(raddr - 1)); /* return address */
     stack_adjust = 4;
+    tcg_out_pushi(s, mem_index);        /* mmu index */
+    stack_adjust += 4;
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out_push(s, addrhi_reg);
+        stack_adjust += 4;
+    }
+    tcg_out_push(s, addrlo_reg); /* guest addr */
+    stack_adjust += 4;
+#ifdef CONFIG_TCG_PASS_AREG0
+    /* extended helper signature (w/ CONFIG_TCG_PASS_AREG0):
+       ext_helper_ld_mmu(CPUState *env, target_ulong addr, int mmu_idx,
+       uintptr_t raddr) */
+    tcg_out_push(s, TCG_AREG0);
+    stack_adjust += 4;
+#endif
+#else
+    /* The first argument is already loaded with addrlo.  */
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[1],
+                 mem_index);
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2],
+                 (tcg_target_ulong)(raddr - 1)); /* return address */
+#ifdef CONFIG_TCG_PASS_AREG0
+    /* XXX/FIXME: suboptimal */
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+                tcg_target_call_iarg_regs[2]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
+                tcg_target_call_iarg_regs[1]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
+                tcg_target_call_iarg_regs[0]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
+                TCG_AREG0);
+#endif
+#endif
+
+    tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
+
+#if TCG_TARGET_REG_BITS == 32
+    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
+        /* Pop and discard.  This is 2 bytes smaller than the add.  */
+        tcg_out_pop(s, TCG_REG_ECX);
+    } else if (stack_adjust != 0) {
+        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
+    }
+#endif
+
+    switch (opc) {
+    case 0 | 4:
+        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
+        break;
+    case 1 | 4:
+        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
+        break;
+    case 0:
+        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
+        break;
+    case 1:
+        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
+        break;
+    case 2:
+        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+        break;
+#if TCG_TARGET_REG_BITS == 64
+    case 2 | 4:
+        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
+        break;
+#endif
+    case 3:
+        if (TCG_TARGET_REG_BITS == 64) {
+            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
+        } else if (data_reg == TCG_REG_EDX) {
+            /* xchg %edx, %eax */
+            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
+        } else {
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
+        }
+        break;
+    default:
+        tcg_abort();
+    }
+
+    /* Jump back to the original code accessing a guest memory */
+    tcg_out_jmp(s, (tcg_target_long) raddr);
+}
+
+/*
+ * Generate code for the slow path for a store at the end of block
+ */
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+    int s_bits;
+    int stack_adjust;
+    int opc = label->opc_ext & HL_OPC_MASK;
+    int mem_index = label->mem_index;
+    int data_reg = label->datalo_reg;
+#if TCG_TARGET_REG_BITS == 32
+    int data_reg2 = label->datahi_reg;
+    int addrlo_reg = label->addrlo_reg;
+    int addrhi_reg = label->addrhi_reg;
+#endif
+    uint8_t *raddr = label->raddr;
+    uint8_t **label_ptr = &label->label_ptr[0];
+
+    s_bits = opc & 3;
+
+    /* resolve label address */
+    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+    }
+
+    /* extended legacy helper signature (w/o CONFIG_TCG_PASS_AREG0):
+       ext_st_mmu(target_ulong addr, uintxx_t val, int mmu_idx,
+       uintptr_t raddr) */
+#if TCG_TARGET_REG_BITS == 32
+    tcg_out_pushi(s, (tcg_target_ulong)(raddr - 1)); /* return address */
+    stack_adjust = 4;
+    tcg_out_pushi(s, mem_index); /* mmu index */
+    stack_adjust += 4;
     if (opc == 3) {
         tcg_out_push(s, data_reg2);
         stack_adjust += 4;
     }
-    tcg_out_push(s, data_reg);
+    tcg_out_push(s, data_reg);   /* guest data */
     stack_adjust += 4;
     if (TARGET_LONG_BITS == 64) {
-        tcg_out_push(s, args[addrlo_idx + 1]);
+        tcg_out_push(s, addrhi_reg);
         stack_adjust += 4;
     }
-    tcg_out_push(s, args[addrlo_idx]);
+    tcg_out_push(s, addrlo_reg); /* guest addr */
     stack_adjust += 4;
 #ifdef CONFIG_TCG_PASS_AREG0
     tcg_out_push(s, TCG_AREG0);
@@ -1444,9 +1573,23 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
                 tcg_target_call_iarg_regs[1], data_reg);
     tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);
+#if defined(_WIN64)
+    tcg_out_pushi(s, (tcg_target_ulong)(raddr - 1)); /* return address */
+    stack_adjust += 8;
+#else
+    tcg_out_movi(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+                 (tcg_target_ulong)(raddr - 1)); /* return address */
     stack_adjust = 0;
+#endif
 #ifdef CONFIG_TCG_PASS_AREG0
+    /* extended helper signature (w/ CONFIG_TCG_PASS_AREG0):
+       ext_helper_st_mmu(CPUState *env, target_ulong addr, uintxx_t val,
+       int mmu_idx, uintptr_t raddr) */
     /* XXX/FIXME: suboptimal */
+#if !defined(_WIN64)
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[4],
+                tcg_target_call_iarg_regs[3]);
+#endif
     tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
                 tcg_target_call_iarg_regs[2]);
     tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
@@ -1467,34 +1610,28 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
         tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
     }
 
-    /* label2: */
-    *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
-#else
-    {
-        int32_t offset = GUEST_BASE;
-        int base = args[addrlo_idx];
+    /* Jump back to the original code accessing a guest memory */
+    tcg_out_jmp(s, (tcg_target_long) raddr);
+}
 
-        if (TCG_TARGET_REG_BITS == 64) {
-            /* ??? We assume all operations have left us with register
-               contents that are zero extended.  So far this appears to
-               be true.  If we want to enforce this, we can either do
-               an explicit zero-extension here, or (if GUEST_BASE == 0)
-               use the ADDR32 prefix.  For now, do nothing.  */
+/*
+ * Generate all of the slow paths of qemu_ld/st at the end of block
+ */
+void tcg_out_qemu_ldst_slow_path(TCGContext *s)
+{
+    int i;
+    TCGLabelQemuLdst *label;
 
-            if (offset != GUEST_BASE) {
-                tcg_out_movi(s, TCG_TYPE_I64,
-                             tcg_target_call_iarg_regs[0], GUEST_BASE);
-                tgen_arithr(s, ARITH_ADD + P_REXW,
-                            tcg_target_call_iarg_regs[0], base);
-                base = tcg_target_call_iarg_regs[0];
-                offset = 0;
+    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
+        label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[i];
+        if (IS_QEMU_LD_LABEL(label)) {
+            tcg_out_qemu_ld_slow_path(s, label);
+        } else {
+            tcg_out_qemu_st_slow_path(s, label);
             }
         }
-
-        tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, opc);
-    }
-#endif
 }
+#endif  /* CONFIG_SOFTMMU */
 
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 8386b70..346197f 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -301,6 +301,13 @@  void tcg_func_start(TCGContext *s)
 
     gen_opc_ptr = gen_opc_buf;
     gen_opparam_ptr = gen_opparam_buf;
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+    /* Initialize qemu_ld/st labels to assist code generation at the end of TB
+       for TLB miss cases at the end of TB */
+    s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) *
+                                     TCG_MAX_QEMU_LDST);
+    s->nb_qemu_ldst_labels = 0;
+#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION && CONFIG_SOFTMMU */
 }
 
 static inline void tcg_temp_alloc(TCGContext *s, int n)
@@ -2169,6 +2176,11 @@  static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
 #endif
     }
  the_end:
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+    /* Generate slow paths of qemu_ld/st IRs which call MMU helpers at
+       the end of block */
+    tcg_out_qemu_ldst_slow_path(s);
+#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION && CONFIG_SOFTMMU */
     return -1;
 }
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index d710694..a8454f8 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -187,6 +187,29 @@  typedef tcg_target_ulong TCGArg;
    are aliases for target_ulong and host pointer sized values respectively.
  */
 
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+/* Macros/structures for qemu_ld/st IR code optimization:
+   TCG_MAX_HELPER_LABELS is defined as same as OPC_BUF_SIZE in exec-all.h. */
+#define TCG_MAX_QEMU_LDST       640
+#define HL_LDST_SHIFT           4
+#define HL_LDST_MASK            (1 << HL_LDST_SHIFT)
+#define HL_ST_MASK              HL_LDST_MASK
+#define HL_OPC_MASK             (HL_LDST_MASK - 1)
+#define IS_QEMU_LD_LABEL(L)     (!((L)->opc_ext & HL_LDST_MASK))
+#define IS_QEMU_ST_LABEL(L)     ((L)->opc_ext & HL_LDST_MASK)
+
+typedef struct TCGLabelQemuLdst {
+    int opc_ext;            /* | 27bit(reserved) | 1bit(ld/st) | 4bit(opc) | */
+    int addrlo_reg;         /* reg index for low word of guest virtual addr */
+    int addrhi_reg;         /* reg index for high word of guest virtual addr */
+    int datalo_reg;         /* reg index for low word to be loaded or stored */
+    int datahi_reg;         /* reg index for high word to be loaded or stored */
+    int mem_index;          /* soft MMU memory index */
+    uint8_t *raddr;         /* gen code addr corresponding to qemu_ld/st IR */
+    uint8_t *label_ptr[2];  /* label pointers to be updated */
+} TCGLabelQemuLdst;
+#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION && CONFIG_SOFTMMU */
+
 #ifdef CONFIG_DEBUG_TCG
 #define DEBUG_TCGV 1
 #endif
@@ -389,6 +412,13 @@  struct TCGContext {
 #ifdef CONFIG_DEBUG_TCG
     int temps_in_use;
 #endif
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+    /* labels info for qemu_ld/st IRs
+       The labels help to generate TLB miss case codes at the end of TB */
+    TCGLabelQemuLdst *qemu_ldst_labels;
+    int nb_qemu_ldst_labels;
+#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION && CONFIG_SOFTMMU */
 };
 
 extern TCGContext tcg_ctx;
@@ -588,3 +618,8 @@  extern uint8_t code_gen_prologue[];
 #endif
 
 void tcg_register_jit(void *buf, size_t buf_size);
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+/* Generate all of the slow paths of qemu_ld/st at the end of block */
+void tcg_out_qemu_ldst_slow_path(TCGContext *s);
+#endif  /* CONFIG_QEMU_LDST_OPTIMIZATION && CONFIG_SOFTMMU */