Patchwork [PULL,7/7] tcg-i386: Use new return-argument ld/st helpers

login
register
mail settings
Submitter Richard Henderson
Date Aug. 26, 2013, 9 p.m.
Message ID <1377550812-908-8-git-send-email-rth@twiddle.net>
Download mbox | patch
Permalink /patch/269983/
State New
Headers show

Comments

Richard Henderson - Aug. 26, 2013, 9 p.m.
Discontinue the jump-around-jump-to-jump scheme, trading it for a single
immediate move instruction.  The two extra jumps always consume 7 bytes,
whereas the immediate move is either 5 or 7 bytes depending on where the
code_gen_buffer gets located.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 include/exec/exec-all.h |  13 +-----
 tcg/i386/tcg-target.c   | 103 ++++++++++++++++++++++--------------------------
 2 files changed, 49 insertions(+), 67 deletions(-)
Richard W.M. Jones - Aug. 28, 2013, 10:55 p.m.
On Mon, Aug 26, 2013 at 02:00:12PM -0700, Richard Henderson wrote:
> Discontinue the jump-around-jump-to-jump scheme, trading it for a single
> immediate move instruction.  The two extra jumps always consume 7 bytes,
> whereas the immediate move is either 5 or 7 bytes depending on where the
> code_gen_buffer gets located.

This seems to have broken qemu-system-ppc64:

https://bugs.launchpad.net/qemu/+bug/1218098

Rich.
Stefan Weil - May 27, 2014, 10:37 p.m.
Am 26.08.2013 23:00, schrieb Richard Henderson:
> Discontinue the jump-around-jump-to-jump scheme, trading it for a single
> immediate move instruction.  The two extra jumps always consume 7 bytes,
> whereas the immediate move is either 5 or 7 bytes depending on where the
> code_gen_buffer gets located.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  include/exec/exec-all.h |  13 +-----
>  tcg/i386/tcg-target.c   | 103 ++++++++++++++++++++++--------------------------
>  2 files changed, 49 insertions(+), 67 deletions(-)
>
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 5920f73..b70028a 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -326,18 +326,9 @@ extern uintptr_t tci_tb_ptr;
>     (6) jump to corresponding code of the next of fast path
>   */
>  # if defined(__i386__) || defined(__x86_64__)
> -/* To avoid broken disassembling, long jmp is used for embedding fast path pc,
> -   so that the destination is the next code of fast path, though this jmp is
> -   never executed.
> -
> -   call MMU helper
> -   jmp POST_PROC (2byte)    <- GETRA()
> -   jmp NEXT_CODE (5byte)
> -   POST_PROCESS ...         <- GETRA() + 7
> - */
>  #  define GETRA() ((uintptr_t)__builtin_return_address(0))
> -#  define GETPC_LDST() ((uintptr_t)(GETRA() + 7 + \
> -                                    *(int32_t *)((void *)GETRA() + 3) - 1))
> +/* The return address argument for ldst is passed directly.  */
> +#  define GETPC_LDST()  (abort(), 0)
>  # elif defined (_ARCH_PPC) && !defined (_ARCH_PPC64)
>  #  define GETRA() ((uintptr_t)__builtin_return_address(0))
>  #  define GETPC_LDST() ((uintptr_t) ((*(int32_t *)(GETRA() - 4)) - 1))
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index fba50f8..12a7ca3 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -190,11 +190,11 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
>          /* qemu_ld/st address constraint */
>      case 'L':
>          ct->ct |= TCG_CT_REG;
> -#if TCG_TARGET_REG_BITS == 64
> +        if (TCG_TARGET_REG_BITS == 64) {
>              tcg_regset_set32(ct->u.regs, 0, 0xffff);
> -#else
> +        } else {
>              tcg_regset_set32(ct->u.regs, 0, 0xff);
> -#endif
> +        }
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
>          break;
> @@ -1025,22 +1025,24 @@ static void tcg_out_jmp(TCGContext *s, tcg_target_long dest)
>  
>  #include "exec/softmmu_defs.h"
>  
> -/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
> -   int mmu_idx) */
> -static const void *qemu_ld_helpers[4] = {
> -    helper_ldb_mmu,
> -    helper_ldw_mmu,
> -    helper_ldl_mmu,
> -    helper_ldq_mmu,
> +/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
> + *                                     int mmu_idx, uintptr_t ra)
> + */
> +static const void * const qemu_ld_helpers[4] = {
> +    helper_ret_ldb_mmu,
> +    helper_ret_ldw_mmu,
> +    helper_ret_ldl_mmu,
> +    helper_ret_ldq_mmu,
>  };
>  
> -/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
> -   uintxx_t val, int mmu_idx) */
> -static const void *qemu_st_helpers[4] = {
> -    helper_stb_mmu,
> -    helper_stw_mmu,
> -    helper_stl_mmu,
> -    helper_stq_mmu,
> +/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
> + *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
> + */
> +static const void * const qemu_st_helpers[4] = {
> +    helper_ret_stb_mmu,
> +    helper_ret_stw_mmu,
> +    helper_ret_stl_mmu,
> +    helper_ret_stq_mmu,
>  };
>  
>  static void add_qemu_ldst_label(TCGContext *s,
> @@ -1468,6 +1470,12 @@ static void add_qemu_ldst_label(TCGContext *s,
>      }
>  }
>  
> +/* See the GETPC definition in include/exec/exec-all.h.  */
> +static inline uintptr_t do_getpc(uint8_t *raddr)
> +{
> +    return (uintptr_t)raddr - 1;
> +}
> +
>  /*
>   * Generate code for the slow path for a load at the end of block
>   */
> @@ -1499,33 +1507,20 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
>          }
>  
>          tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index);
> +        ofs += 4;
> +
> +        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, do_getpc(l->raddr));
>      } else {
> -        tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
> +        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
>          /* The second argument is already loaded with addrlo.  */
>          tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2],
>                       l->mem_index);
> +        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
> +                     do_getpc(l->raddr));
>      }
>  
> -    /* Code generation of qemu_ld/st's slow path calling MMU helper
> -
> -       PRE_PROC ...
> -       call MMU helper
> -       jmp POST_PROC (2b) : short forward jump <- GETRA()
> -       jmp next_code (5b) : dummy long backward jump which is never executed
> -       POST_PROC ... : do post-processing <- GETRA() + 7
> -       jmp next_code : jump to the code corresponding to next IR of qemu_ld/st
> -    */
> -
>      tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
>  
> -    /* Jump to post-processing code */
> -    tcg_out8(s, OPC_JMP_short);
> -    tcg_out8(s, 5);
> -    /* Dummy backward jump having information of fast path'pc for MMU helpers */
> -    tcg_out8(s, OPC_JMP_long);
> -    *(int32_t *)s->code_ptr = (int32_t)(l->raddr - s->code_ptr - 4);
> -    s->code_ptr += 4;
> -
>      data_reg = l->datalo_reg;
>      switch(opc) {
>      case 0 | 4:
> @@ -1606,36 +1601,32 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
>          }
>  
>          tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index);
> +        ofs += 4;
> +
> +        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, do_getpc(l->raddr));
>      } else {
> -        tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
> +        uintptr_t pc;
> +
> +        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
>          /* The second argument is already loaded with addrlo.  */
>          tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
>                      tcg_target_call_iarg_regs[2], l->datalo_reg);
>          tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3],
>                       l->mem_index);
> -    }
>  
> -    /* Code generation of qemu_ld/st's slow path calling MMU helper
> -
> -       PRE_PROC ...
> -       call MMU helper
> -       jmp POST_PROC (2b) : short forward jump <- GETRA()
> -       jmp next_code (5b) : dummy long backward jump which is never executed
> -       POST_PROC ... : do post-processing <- GETRA() + 7
> -       jmp next_code : jump to the code corresponding to next IR of qemu_ld/st
> -    */
> +        pc = do_getpc(l->raddr);
> +        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
> +            tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[4], pc);
> +        } else if (pc == (int32_t)pc) {
> +            tcg_out_sti(s, TCG_TYPE_PTR, TCG_REG_ESP, 0, pc);
> +        } else {
> +            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, pc);
> +            tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_RAX, TCG_REG_ESP, 0);
> +        }
> +    }
>  
>      tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);
>  
> -    /* Jump to post-processing code */
> -    tcg_out8(s, OPC_JMP_short);
> -    tcg_out8(s, 5);
> -    /* Dummy backward jump having information of fast path'pc for MMU helpers */
> -    tcg_out8(s, OPC_JMP_long);
> -    *(int32_t *)s->code_ptr = (int32_t)(l->raddr - s->code_ptr - 4);
> -    s->code_ptr += 4;
> -
> -    /* Jump to the code corresponding to next IR of qemu_st */
>      tcg_out_jmp(s, (tcg_target_long)l->raddr);
>  }

Hi Richard,

this patch has broken the 64 bit version of QEMU for Windows: a Linux
guest starts booting, but hangs after "Booting the kernel.". I got a bug
report from a user and did a "git bisect" with a Tiny Core Linux guest /
cross build with default options / cross test with wine64 and default
options. Git reported this commit:

401c227b0a1134245ec61c6c5a9997cfc963c8e4 is the first bad commit
commit 401c227b0a1134245ec61c6c5a9997cfc963c8e4
Author: Richard Henderson <rth@twiddle.net>
Date:   Thu Jul 25 07:16:52 2013 -1000

    tcg-i386: Use new return-argument ld/st helpers
   
    Discontinue the jump-around-jump-to-jump scheme, trading it for a single
    immediate move instruction.  The two extra jumps always consume 7 bytes,
    whereas the immediate move is either 5 or 7 bytes depending on where the
    code_gen_buffer gets located.
   
    Signed-off-by: Richard Henderson <rth@twiddle.net>

:040000 040000 dfd9a66c85713cd1886a3342de1e9ac95d7ea43f
df8673dea69bc89cc2cc979aa24415e3fea4ed53 M    include
:040000 040000 1f7cd5291f2c69b4126c63bd567c6b106eb332c9
87e7ece766168dda860b513dc97fe5af28ec2c4b M    tcg

32 bit versions of QEMU for Windows don't show this problem.

Regards
Stefan
Richard Henderson - May 28, 2014, 5:12 p.m.
On 05/27/2014 03:37 PM, Stefan Weil wrote:
> Hi Richard,
> 
> this patch has broken the 64 bit version of QEMU for Windows: a Linux
> guest starts booting, but hangs after "Booting the kernel.". I got a bug
> report from a user and did a "git bisect" with a Tiny Core Linux guest /
> cross build with default options / cross test with wine64 and default
> options. Git reported this commit:
> 
> 401c227b0a1134245ec61c6c5a9997cfc963c8e4 is the first bad commit
> commit 401c227b0a1134245ec61c6c5a9997cfc963c8e4
> Author: Richard Henderson <rth@twiddle.net>
> Date:   Thu Jul 25 07:16:52 2013 -1000
> 
>     tcg-i386: Use new return-argument ld/st helpers
>    
>     Discontinue the jump-around-jump-to-jump scheme, trading it for a single
>     immediate move instruction.  The two extra jumps always consume 7 bytes,
>     whereas the immediate move is either 5 or 7 bytes depending on where the
>     code_gen_buffer gets located.
>    
>     Signed-off-by: Richard Henderson <rth@twiddle.net>
> 
> :040000 040000 dfd9a66c85713cd1886a3342de1e9ac95d7ea43f
> df8673dea69bc89cc2cc979aa24415e3fea4ed53 M    include
> :040000 040000 1f7cd5291f2c69b4126c63bd567c6b106eb332c9
> 87e7ece766168dda860b513dc97fe5af28ec2c4b M    tcg
> 
> 32 bit versions of QEMU for Windows don't show this problem.

I'm having problems booting any iso with wine at the moment:

$ wine64 ./x86_64-softmmu/qemu-system-x86_64.exe -L ./pc-bios \
    -vnc :1 -cdrom ../../../Downloads/TinyCore-current.iso
Assertion failed!

Program: Z:\home\rth\work\qemu\bld-w64\x86_64-softmmu\qemu-system-x86_64.exe
File: /home/rth/work/qemu/qemu/qemu-coroutine-lock.c, Line 91

Expression: qemu_in_coroutine()

abnormal program termination

Naturally, this isn't happening with a native linux boot with the same arguments.

But I can boot an alpha rom:

$ wine64 ./alpha-softmmu/qemu-system-alpha.exe -L ./pc-bios/ -nographic
PCI: 00:00:0 class 0300 id 1013:00b8
PCI:   region 0: 10000000
PCI:   region 1: 12000000
PCI: 00:01:0 class 0200 id 8086:100e
PCI:   region 0: 12020000
PCI:   region 1: 0000c000
PCI: 00:02:0 class 0101 id 1095:0646
PCI:   region 0: 0000c040
PCI:   region 1: 0000c048
PCI:   region 3: 0000c04c
>>>

Which says to me that it's rather unlikely that this basic load/store patch
could be the problem.


r~
Paolo Bonzini - May 28, 2014, 5:16 p.m.
Il 28/05/2014 19:12, Richard Henderson ha scritto:
> $ wine64 ./x86_64-softmmu/qemu-system-x86_64.exe -L ./pc-bios \
>     -vnc :1 -cdrom ../../../Downloads/TinyCore-current.iso
> Assertion failed!
>
> Program: Z:\home\rth\work\qemu\bld-w64\x86_64-softmmu\qemu-system-x86_64.exe
> File: /home/rth/work/qemu/qemu/qemu-coroutine-lock.c, Line 91
>
> Expression: qemu_in_coroutine()
>
> abnormal program termination
>
> Naturally, this isn't happening with a native linux boot with the same arguments.

http://wiki.qemu.org/ChangeLog/2.0

"On Win32, QEMU must be compiled with --disable-coroutine-pool to work 
around a suspected compiler bug."

Paolo

Patch

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 5920f73..b70028a 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -326,18 +326,9 @@  extern uintptr_t tci_tb_ptr;
    (6) jump to corresponding code of the next of fast path
  */
 # if defined(__i386__) || defined(__x86_64__)
-/* To avoid broken disassembling, long jmp is used for embedding fast path pc,
-   so that the destination is the next code of fast path, though this jmp is
-   never executed.
-
-   call MMU helper
-   jmp POST_PROC (2byte)    <- GETRA()
-   jmp NEXT_CODE (5byte)
-   POST_PROCESS ...         <- GETRA() + 7
- */
 #  define GETRA() ((uintptr_t)__builtin_return_address(0))
-#  define GETPC_LDST() ((uintptr_t)(GETRA() + 7 + \
-                                    *(int32_t *)((void *)GETRA() + 3) - 1))
+/* The return address argument for ldst is passed directly.  */
+#  define GETPC_LDST()  (abort(), 0)
 # elif defined (_ARCH_PPC) && !defined (_ARCH_PPC64)
 #  define GETRA() ((uintptr_t)__builtin_return_address(0))
 #  define GETPC_LDST() ((uintptr_t) ((*(int32_t *)(GETRA() - 4)) - 1))
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index fba50f8..12a7ca3 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -190,11 +190,11 @@  static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         /* qemu_ld/st address constraint */
     case 'L':
         ct->ct |= TCG_CT_REG;
-#if TCG_TARGET_REG_BITS == 64
+        if (TCG_TARGET_REG_BITS == 64) {
             tcg_regset_set32(ct->u.regs, 0, 0xffff);
-#else
+        } else {
             tcg_regset_set32(ct->u.regs, 0, 0xff);
-#endif
+        }
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
         break;
@@ -1025,22 +1025,24 @@  static void tcg_out_jmp(TCGContext *s, tcg_target_long dest)
 
 #include "exec/softmmu_defs.h"
 
-/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
-   int mmu_idx) */
-static const void *qemu_ld_helpers[4] = {
-    helper_ldb_mmu,
-    helper_ldw_mmu,
-    helper_ldl_mmu,
-    helper_ldq_mmu,
+/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
+ *                                     int mmu_idx, uintptr_t ra)
+ */
+static const void * const qemu_ld_helpers[4] = {
+    helper_ret_ldb_mmu,
+    helper_ret_ldw_mmu,
+    helper_ret_ldl_mmu,
+    helper_ret_ldq_mmu,
 };
 
-/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
-   uintxx_t val, int mmu_idx) */
-static const void *qemu_st_helpers[4] = {
-    helper_stb_mmu,
-    helper_stw_mmu,
-    helper_stl_mmu,
-    helper_stq_mmu,
+/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
+ *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
+ */
+static const void * const qemu_st_helpers[4] = {
+    helper_ret_stb_mmu,
+    helper_ret_stw_mmu,
+    helper_ret_stl_mmu,
+    helper_ret_stq_mmu,
 };
 
 static void add_qemu_ldst_label(TCGContext *s,
@@ -1468,6 +1470,12 @@  static void add_qemu_ldst_label(TCGContext *s,
     }
 }
 
+/* See the GETPC definition in include/exec/exec-all.h.  */
+static inline uintptr_t do_getpc(uint8_t *raddr)
+{
+    return (uintptr_t)raddr - 1;
+}
+
 /*
  * Generate code for the slow path for a load at the end of block
  */
@@ -1499,33 +1507,20 @@  static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
         }
 
         tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index);
+        ofs += 4;
+
+        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, do_getpc(l->raddr));
     } else {
-        tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
+        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
         /* The second argument is already loaded with addrlo.  */
         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2],
                      l->mem_index);
+        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
+                     do_getpc(l->raddr));
     }
 
-    /* Code generation of qemu_ld/st's slow path calling MMU helper
-
-       PRE_PROC ...
-       call MMU helper
-       jmp POST_PROC (2b) : short forward jump <- GETRA()
-       jmp next_code (5b) : dummy long backward jump which is never executed
-       POST_PROC ... : do post-processing <- GETRA() + 7
-       jmp next_code : jump to the code corresponding to next IR of qemu_ld/st
-    */
-
     tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
 
-    /* Jump to post-processing code */
-    tcg_out8(s, OPC_JMP_short);
-    tcg_out8(s, 5);
-    /* Dummy backward jump having information of fast path'pc for MMU helpers */
-    tcg_out8(s, OPC_JMP_long);
-    *(int32_t *)s->code_ptr = (int32_t)(l->raddr - s->code_ptr - 4);
-    s->code_ptr += 4;
-
     data_reg = l->datalo_reg;
     switch(opc) {
     case 0 | 4:
@@ -1606,36 +1601,32 @@  static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
         }
 
         tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index);
+        ofs += 4;
+
+        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, do_getpc(l->raddr));
     } else {
-        tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0);
+        uintptr_t pc;
+
+        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
         /* The second argument is already loaded with addrlo.  */
         tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
                     tcg_target_call_iarg_regs[2], l->datalo_reg);
         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3],
                      l->mem_index);
-    }
 
-    /* Code generation of qemu_ld/st's slow path calling MMU helper
-
-       PRE_PROC ...
-       call MMU helper
-       jmp POST_PROC (2b) : short forward jump <- GETRA()
-       jmp next_code (5b) : dummy long backward jump which is never executed
-       POST_PROC ... : do post-processing <- GETRA() + 7
-       jmp next_code : jump to the code corresponding to next IR of qemu_ld/st
-    */
+        pc = do_getpc(l->raddr);
+        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
+            tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[4], pc);
+        } else if (pc == (int32_t)pc) {
+            tcg_out_sti(s, TCG_TYPE_PTR, TCG_REG_ESP, 0, pc);
+        } else {
+            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, pc);
+            tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_RAX, TCG_REG_ESP, 0);
+        }
+    }
 
     tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);
 
-    /* Jump to post-processing code */
-    tcg_out8(s, OPC_JMP_short);
-    tcg_out8(s, 5);
-    /* Dummy backward jump having information of fast path'pc for MMU helpers */
-    tcg_out8(s, OPC_JMP_long);
-    *(int32_t *)s->code_ptr = (int32_t)(l->raddr - s->code_ptr - 4);
-    s->code_ptr += 4;
-
-    /* Jump to the code corresponding to next IR of qemu_st */
     tcg_out_jmp(s, (tcg_target_long)l->raddr);
 }