Message ID | 1377550812-908-8-git-send-email-rth@twiddle.net |
---|---|
State | New |
Headers | show |
On Mon, Aug 26, 2013 at 02:00:12PM -0700, Richard Henderson wrote:
> Discontinue the jump-around-jump-to-jump scheme, trading it for a single
> immediate move instruction. The two extra jumps always consume 7 bytes,
> whereas the immediate move is either 5 or 7 bytes depending on where the
> code_gen_buffer gets located.

This seems to have broken qemu-system-ppc64:

https://bugs.launchpad.net/qemu/+bug/1218098

Rich.
Am 26.08.2013 23:00, schrieb Richard Henderson: > Discontinue the jump-around-jump-to-jump scheme, trading it for a single > immediate move instruction. The two extra jumps always consume 7 bytes, > whereas the immediate move is either 5 or 7 bytes depending on where the > code_gen_buffer gets located. > > Signed-off-by: Richard Henderson <rth@twiddle.net> > --- > include/exec/exec-all.h | 13 +----- > tcg/i386/tcg-target.c | 103 ++++++++++++++++++++++-------------------------- > 2 files changed, 49 insertions(+), 67 deletions(-) > > diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h > index 5920f73..b70028a 100644 > --- a/include/exec/exec-all.h > +++ b/include/exec/exec-all.h > @@ -326,18 +326,9 @@ extern uintptr_t tci_tb_ptr; > (6) jump to corresponding code of the next of fast path > */ > # if defined(__i386__) || defined(__x86_64__) > -/* To avoid broken disassembling, long jmp is used for embedding fast path pc, > - so that the destination is the next code of fast path, though this jmp is > - never executed. > - > - call MMU helper > - jmp POST_PROC (2byte) <- GETRA() > - jmp NEXT_CODE (5byte) > - POST_PROCESS ... <- GETRA() + 7 > - */ > # define GETRA() ((uintptr_t)__builtin_return_address(0)) > -# define GETPC_LDST() ((uintptr_t)(GETRA() + 7 + \ > - *(int32_t *)((void *)GETRA() + 3) - 1)) > +/* The return address argument for ldst is passed directly. 
*/ > +# define GETPC_LDST() (abort(), 0) > # elif defined (_ARCH_PPC) && !defined (_ARCH_PPC64) > # define GETRA() ((uintptr_t)__builtin_return_address(0)) > # define GETPC_LDST() ((uintptr_t) ((*(int32_t *)(GETRA() - 4)) - 1)) > diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c > index fba50f8..12a7ca3 100644 > --- a/tcg/i386/tcg-target.c > +++ b/tcg/i386/tcg-target.c > @@ -190,11 +190,11 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) > /* qemu_ld/st address constraint */ > case 'L': > ct->ct |= TCG_CT_REG; > -#if TCG_TARGET_REG_BITS == 64 > + if (TCG_TARGET_REG_BITS == 64) { > tcg_regset_set32(ct->u.regs, 0, 0xffff); > -#else > + } else { > tcg_regset_set32(ct->u.regs, 0, 0xff); > -#endif > + } > tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0); > tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1); > break; > @@ -1025,22 +1025,24 @@ static void tcg_out_jmp(TCGContext *s, tcg_target_long dest) > > #include "exec/softmmu_defs.h" > > -/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr, > - int mmu_idx) */ > -static const void *qemu_ld_helpers[4] = { > - helper_ldb_mmu, > - helper_ldw_mmu, > - helper_ldl_mmu, > - helper_ldq_mmu, > +/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr, > + * int mmu_idx, uintptr_t ra) > + */ > +static const void * const qemu_ld_helpers[4] = { > + helper_ret_ldb_mmu, > + helper_ret_ldw_mmu, > + helper_ret_ldl_mmu, > + helper_ret_ldq_mmu, > }; > > -/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr, > - uintxx_t val, int mmu_idx) */ > -static const void *qemu_st_helpers[4] = { > - helper_stb_mmu, > - helper_stw_mmu, > - helper_stl_mmu, > - helper_stq_mmu, > +/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr, > + * uintxx_t val, int mmu_idx, uintptr_t ra) > + */ > +static const void * const qemu_st_helpers[4] = { > + helper_ret_stb_mmu, > + helper_ret_stw_mmu, > + helper_ret_stl_mmu, > + helper_ret_stq_mmu, > }; > > static void 
add_qemu_ldst_label(TCGContext *s, > @@ -1468,6 +1470,12 @@ static void add_qemu_ldst_label(TCGContext *s, > } > } > > +/* See the GETPC definition in include/exec/exec-all.h. */ > +static inline uintptr_t do_getpc(uint8_t *raddr) > +{ > + return (uintptr_t)raddr - 1; > +} > + > /* > * Generate code for the slow path for a load at the end of block > */ > @@ -1499,33 +1507,20 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) > } > > tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index); > + ofs += 4; > + > + tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, do_getpc(l->raddr)); > } else { > - tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0); > + tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0); > /* The second argument is already loaded with addrlo. */ > tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], > l->mem_index); > + tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3], > + do_getpc(l->raddr)); > } > > - /* Code generation of qemu_ld/st's slow path calling MMU helper > - > - PRE_PROC ... > - call MMU helper > - jmp POST_PROC (2b) : short forward jump <- GETRA() > - jmp next_code (5b) : dummy long backward jump which is never executed > - POST_PROC ... 
: do post-processing <- GETRA() + 7 > - jmp next_code : jump to the code corresponding to next IR of qemu_ld/st > - */ > - > tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]); > > - /* Jump to post-processing code */ > - tcg_out8(s, OPC_JMP_short); > - tcg_out8(s, 5); > - /* Dummy backward jump having information of fast path'pc for MMU helpers */ > - tcg_out8(s, OPC_JMP_long); > - *(int32_t *)s->code_ptr = (int32_t)(l->raddr - s->code_ptr - 4); > - s->code_ptr += 4; > - > data_reg = l->datalo_reg; > switch(opc) { > case 0 | 4: > @@ -1606,36 +1601,32 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) > } > > tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index); > + ofs += 4; > + > + tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, do_getpc(l->raddr)); > } else { > - tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0); > + uintptr_t pc; > + > + tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0); > /* The second argument is already loaded with addrlo. */ > tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32), > tcg_target_call_iarg_regs[2], l->datalo_reg); > tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], > l->mem_index); > - } > > - /* Code generation of qemu_ld/st's slow path calling MMU helper > - > - PRE_PROC ... > - call MMU helper > - jmp POST_PROC (2b) : short forward jump <- GETRA() > - jmp next_code (5b) : dummy long backward jump which is never executed > - POST_PROC ... 
: do post-processing <- GETRA() + 7 > - jmp next_code : jump to the code corresponding to next IR of qemu_ld/st > - */ > + pc = do_getpc(l->raddr); > + if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) { > + tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[4], pc); > + } else if (pc == (int32_t)pc) { > + tcg_out_sti(s, TCG_TYPE_PTR, TCG_REG_ESP, 0, pc); > + } else { > + tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, pc); > + tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_RAX, TCG_REG_ESP, 0); > + } > + } > > tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]); > > - /* Jump to post-processing code */ > - tcg_out8(s, OPC_JMP_short); > - tcg_out8(s, 5); > - /* Dummy backward jump having information of fast path'pc for MMU helpers */ > - tcg_out8(s, OPC_JMP_long); > - *(int32_t *)s->code_ptr = (int32_t)(l->raddr - s->code_ptr - 4); > - s->code_ptr += 4; > - > - /* Jump to the code corresponding to next IR of qemu_st */ > tcg_out_jmp(s, (tcg_target_long)l->raddr); > } Hi Richard, this patch has broken the 64 bit version of QEMU for Windows: a Linux guest starts booting, but hangs after "Booting the kernel.". I got a bug report from a user and did a "git bisect" with a Tiny Core Linux guest / cross build with default options / cross test with wine64 and default options. Git reported this commit: 401c227b0a1134245ec61c6c5a9997cfc963c8e4 is the first bad commit commit 401c227b0a1134245ec61c6c5a9997cfc963c8e4 Author: Richard Henderson <rth@twiddle.net> Date: Thu Jul 25 07:16:52 2013 -1000 tcg-i386: Use new return-argument ld/st helpers Discontinue the jump-around-jump-to-jump scheme, trading it for a single immediate move instruction. The two extra jumps always consume 7 bytes, whereas the immediate move is either 5 or 7 bytes depending on where the code_gen_buffer gets located. 
Signed-off-by: Richard Henderson <rth@twiddle.net> :040000 040000 dfd9a66c85713cd1886a3342de1e9ac95d7ea43f df8673dea69bc89cc2cc979aa24415e3fea4ed53 M include :040000 040000 1f7cd5291f2c69b4126c63bd567c6b106eb332c9 87e7ece766168dda860b513dc97fe5af28ec2c4b M tcg 32 bit versions of QEMU for Windows don't show this problem. Regards Stefan
On 05/27/2014 03:37 PM, Stefan Weil wrote:
> Hi Richard,
>
> this patch has broken the 64 bit version of QEMU for Windows: a Linux
> guest starts booting, but hangs after "Booting the kernel.". I got a bug
> report from a user and did a "git bisect" with a Tiny Core Linux guest /
> cross build with default options / cross test with wine64 and default
> options. Git reported this commit:
>
> 401c227b0a1134245ec61c6c5a9997cfc963c8e4 is the first bad commit
> commit 401c227b0a1134245ec61c6c5a9997cfc963c8e4
> Author: Richard Henderson <rth@twiddle.net>
> Date: Thu Jul 25 07:16:52 2013 -1000
>
> tcg-i386: Use new return-argument ld/st helpers
>
> Discontinue the jump-around-jump-to-jump scheme, trading it for a single
> immediate move instruction. The two extra jumps always consume 7 bytes,
> whereas the immediate move is either 5 or 7 bytes depending on where the
> code_gen_buffer gets located.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
>
> :040000 040000 dfd9a66c85713cd1886a3342de1e9ac95d7ea43f
> df8673dea69bc89cc2cc979aa24415e3fea4ed53 M include
> :040000 040000 1f7cd5291f2c69b4126c63bd567c6b106eb332c9
> 87e7ece766168dda860b513dc97fe5af28ec2c4b M tcg
>
> 32 bit versions of QEMU for Windows don't show this problem.

I'm having problems booting any ISO with wine at the moment:

$ wine64 ./x86_64-softmmu/qemu-system-x86_64.exe -L ./pc-bios \
    -vnc :1 -cdrom ../../../Downloads/TinyCore-current.iso
Assertion failed!

Program: Z:\home\rth\work\qemu\bld-w64\x86_64-softmmu\qemu-system-x86_64.exe
File: /home/rth/work/qemu/qemu/qemu-coroutine-lock.c, Line 91

Expression: qemu_in_coroutine()

abnormal program termination

Naturally, this isn't happening with a native Linux boot with the same arguments.
But I can boot an alpha rom: $ wine64 ./alpha-softmmu/qemu-system-alpha.exe -L ./pc-bios/ -nographic PCI: 00:00:0 class 0300 id 1013:00b8 PCI: region 0: 10000000 PCI: region 1: 12000000 PCI: 00:01:0 class 0200 id 8086:100e PCI: region 0: 12020000 PCI: region 1: 0000c000 PCI: 00:02:0 class 0101 id 1095:0646 PCI: region 0: 0000c040 PCI: region 1: 0000c048 PCI: region 3: 0000c04c >>> Which says to me that it's rather unlikely that this basic load/store patch could be the problem. r~
Il 28/05/2014 19:12, Richard Henderson ha scritto:
> $ wine64 ./x86_64-softmmu/qemu-system-x86_64.exe -L ./pc-bios \
> -vnc :1 -cdrom ../../../Downloads/TinyCore-current.iso
> Assertion failed!
>
> Program: Z:\home\rth\work\qemu\bld-w64\x86_64-softmmu\qemu-system-x86_64.exe
> File: /home/rth/work/qemu/qemu/qemu-coroutine-lock.c, Line 91
>
> Expression: qemu_in_coroutine()
>
> abnormal program termination
>
> Naturally, this isn't happening with a native linux boot with the same arguments.

http://wiki.qemu.org/ChangeLog/2.0

"On Win32, QEMU must be compiled with --disable-coroutine-pool to work around a suspected compiler bug."

Paolo
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 5920f73..b70028a 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -326,18 +326,9 @@ extern uintptr_t tci_tb_ptr; (6) jump to corresponding code of the next of fast path */ # if defined(__i386__) || defined(__x86_64__) -/* To avoid broken disassembling, long jmp is used for embedding fast path pc, - so that the destination is the next code of fast path, though this jmp is - never executed. - - call MMU helper - jmp POST_PROC (2byte) <- GETRA() - jmp NEXT_CODE (5byte) - POST_PROCESS ... <- GETRA() + 7 - */ # define GETRA() ((uintptr_t)__builtin_return_address(0)) -# define GETPC_LDST() ((uintptr_t)(GETRA() + 7 + \ - *(int32_t *)((void *)GETRA() + 3) - 1)) +/* The return address argument for ldst is passed directly. */ +# define GETPC_LDST() (abort(), 0) # elif defined (_ARCH_PPC) && !defined (_ARCH_PPC64) # define GETRA() ((uintptr_t)__builtin_return_address(0)) # define GETPC_LDST() ((uintptr_t) ((*(int32_t *)(GETRA() - 4)) - 1)) diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c index fba50f8..12a7ca3 100644 --- a/tcg/i386/tcg-target.c +++ b/tcg/i386/tcg-target.c @@ -190,11 +190,11 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) /* qemu_ld/st address constraint */ case 'L': ct->ct |= TCG_CT_REG; -#if TCG_TARGET_REG_BITS == 64 + if (TCG_TARGET_REG_BITS == 64) { tcg_regset_set32(ct->u.regs, 0, 0xffff); -#else + } else { tcg_regset_set32(ct->u.regs, 0, 0xff); -#endif + } tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0); tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1); break; @@ -1025,22 +1025,24 @@ static void tcg_out_jmp(TCGContext *s, tcg_target_long dest) #include "exec/softmmu_defs.h" -/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr, - int mmu_idx) */ -static const void *qemu_ld_helpers[4] = { - helper_ldb_mmu, - helper_ldw_mmu, - helper_ldl_mmu, - helper_ldq_mmu, +/* helper signature: helper_ret_ld_mmu(CPUState *env, 
target_ulong addr, + * int mmu_idx, uintptr_t ra) + */ +static const void * const qemu_ld_helpers[4] = { + helper_ret_ldb_mmu, + helper_ret_ldw_mmu, + helper_ret_ldl_mmu, + helper_ret_ldq_mmu, }; -/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr, - uintxx_t val, int mmu_idx) */ -static const void *qemu_st_helpers[4] = { - helper_stb_mmu, - helper_stw_mmu, - helper_stl_mmu, - helper_stq_mmu, +/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr, + * uintxx_t val, int mmu_idx, uintptr_t ra) + */ +static const void * const qemu_st_helpers[4] = { + helper_ret_stb_mmu, + helper_ret_stw_mmu, + helper_ret_stl_mmu, + helper_ret_stq_mmu, }; static void add_qemu_ldst_label(TCGContext *s, @@ -1468,6 +1470,12 @@ static void add_qemu_ldst_label(TCGContext *s, } } +/* See the GETPC definition in include/exec/exec-all.h. */ +static inline uintptr_t do_getpc(uint8_t *raddr) +{ + return (uintptr_t)raddr - 1; +} + /* * Generate code for the slow path for a load at the end of block */ @@ -1499,33 +1507,20 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l) } tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index); + ofs += 4; + + tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, do_getpc(l->raddr)); } else { - tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0); + tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0); /* The second argument is already loaded with addrlo. */ tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], l->mem_index); + tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3], + do_getpc(l->raddr)); } - /* Code generation of qemu_ld/st's slow path calling MMU helper - - PRE_PROC ... - call MMU helper - jmp POST_PROC (2b) : short forward jump <- GETRA() - jmp next_code (5b) : dummy long backward jump which is never executed - POST_PROC ... 
: do post-processing <- GETRA() + 7 - jmp next_code : jump to the code corresponding to next IR of qemu_ld/st - */ - tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]); - /* Jump to post-processing code */ - tcg_out8(s, OPC_JMP_short); - tcg_out8(s, 5); - /* Dummy backward jump having information of fast path'pc for MMU helpers */ - tcg_out8(s, OPC_JMP_long); - *(int32_t *)s->code_ptr = (int32_t)(l->raddr - s->code_ptr - 4); - s->code_ptr += 4; - data_reg = l->datalo_reg; switch(opc) { case 0 | 4: @@ -1606,36 +1601,32 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) } tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index); + ofs += 4; + + tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, do_getpc(l->raddr)); } else { - tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0); + uintptr_t pc; + + tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0); /* The second argument is already loaded with addrlo. */ tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32), tcg_target_call_iarg_regs[2], l->datalo_reg); tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], l->mem_index); - } - /* Code generation of qemu_ld/st's slow path calling MMU helper - - PRE_PROC ... - call MMU helper - jmp POST_PROC (2b) : short forward jump <- GETRA() - jmp next_code (5b) : dummy long backward jump which is never executed - POST_PROC ... 
: do post-processing <- GETRA() + 7 - jmp next_code : jump to the code corresponding to next IR of qemu_ld/st - */ + pc = do_getpc(l->raddr); + if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) { + tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[4], pc); + } else if (pc == (int32_t)pc) { + tcg_out_sti(s, TCG_TYPE_PTR, TCG_REG_ESP, 0, pc); + } else { + tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, pc); + tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_RAX, TCG_REG_ESP, 0); + } + } tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]); - /* Jump to post-processing code */ - tcg_out8(s, OPC_JMP_short); - tcg_out8(s, 5); - /* Dummy backward jump having information of fast path'pc for MMU helpers */ - tcg_out8(s, OPC_JMP_long); - *(int32_t *)s->code_ptr = (int32_t)(l->raddr - s->code_ptr - 4); - s->code_ptr += 4; - - /* Jump to the code corresponding to next IR of qemu_st */ tcg_out_jmp(s, (tcg_target_long)l->raddr); }
Discontinue the jump-around-jump-to-jump scheme, trading it for a single immediate move instruction. The two extra jumps always consume 7 bytes, whereas the immediate move is either 5 or 7 bytes depending on where the code_gen_buffer gets located. Signed-off-by: Richard Henderson <rth@twiddle.net> --- include/exec/exec-all.h | 13 +----- tcg/i386/tcg-target.c | 103 ++++++++++++++++++++++-------------------------- 2 files changed, 49 insertions(+), 67 deletions(-)