| Field      | Value                                                |
|------------|------------------------------------------------------|
| Message ID | 1378369329-2187-3-git-send-email-pbonzini@redhat.com |
| State      | New                                                  |
| Headers    | show                                                 |
On 09/05/2013 01:22 AM, Paolo Bonzini wrote: > These use a 32-bit load-of-immediate to save a mflr+addi+mtlr sequence. > Tested with a Windows 98 guest (pretty much the most recent thing I > could run on my PPC machine) and kvm-unit-tests's sieve.flat. The > speed up for sieve.flat is as high as 10% for qemu-system-i386, 25% > (no kidding) for qemu-system-x86_64 on my PowerBook G4. See also the series beginning at http://lists.nongnu.org/archive/html/qemu-devel/2013-09/msg00025.html The major difference is that I use a conditional call out of the fast path, which lets me later just use one mflr to pass the parameter. I also, perhaps foolishly, got rid of the trampolines. E.g. 0xf57a1838: rlwinm r3,r15,24,20,27 0xf57a183c: rlwinm r0,r15,0,30,19 0xf57a1840: add r3,r3,r27 0xf57a1844: lwz r4,6436(r3) 0xf57a1848: cmpw cr7,r0,r4 0xf57a184c: lwz r3,6444(r3) 0xf57a1850: bnel- cr7,0xf57a1910 0xf57a1854: stwx r16,r3,r15 ... 0xf57a1910: mr r3,r27 0xf57a1914: mr r4,r15 0xf57a1918: mr r5,r16 0xf57a191c: li r6,1 0xf57a1920: mflr r7 0xf57a1924: lis r0,4120 0xf57a1928: ori r0,r0,45040 0xf57a192c: mtctr r0 0xf57a1930: bctrl 0xf57a1934: b 0xf57a1858 I don't see anything technically wrong with your patch. But I'd be interested to compare vs mine. r~
Il 05/09/2013 17:17, Richard Henderson ha scritto: > On 09/05/2013 01:22 AM, Paolo Bonzini wrote: >> These use a 32-bit load-of-immediate to save a mflr+addi+mtlr sequence. >> Tested with a Windows 98 guest (pretty much the most recent thing I >> could run on my PPC machine) and kvm-unit-tests's sieve.flat. The >> speed up for sieve.flat is as high as 10% for qemu-system-i386, 25% >> (no kidding) for qemu-system-x86_64 on my PowerBook G4. > > See also the series beginning at > > http://lists.nongnu.org/archive/html/qemu-devel/2013-09/msg00025.html > > The major difference is that I use a conditional call out of the fast > path, which lets me later just use one mflr to pass the parameter. I > also, perhaps foolishly, got rid of the trampolines. E.g. > > 0xf57a1838: rlwinm r3,r15,24,20,27 > 0xf57a183c: rlwinm r0,r15,0,30,19 > 0xf57a1840: add r3,r3,r27 > 0xf57a1844: lwz r4,6436(r3) > 0xf57a1848: cmpw cr7,r0,r4 > 0xf57a184c: lwz r3,6444(r3) > 0xf57a1850: bnel- cr7,0xf57a1910 > 0xf57a1854: stwx r16,r3,r15 > ... > 0xf57a1910: mr r3,r27 > 0xf57a1914: mr r4,r15 > 0xf57a1918: mr r5,r16 > 0xf57a191c: li r6,1 > 0xf57a1920: mflr r7 > 0xf57a1924: lis r0,4120 > 0xf57a1928: ori r0,r0,45040 > 0xf57a192c: mtctr r0 > 0xf57a1930: bctrl > 0xf57a1934: b 0xf57a1858 > > I don't see anything technically wrong with your patch. But I'd be > interested to compare vs mine. Sure, I'll give it a try tomorrow or in the weekend. The G4 in my computer must simply hate the mflr/add/mtlr sequence in the trampoline; there's no other explanation for such a huge performance improvement. So even though I suspect that there won't be much difference between our patches it's good to check what's better in case your sequences are triggering something as bad. The bnel/mflr is a nice trick to save one instruction, though! Regarding removal of the trampolines, the extra icache cost should be a wash now that they are half the size, but I'd still prefer it to be a separate patch. Paolo
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index beb4149..a81e805 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -324,9 +324,7 @@ extern uintptr_t tci_tb_ptr; In some implementations, we pass the "logical" return address manually; in others, we must infer the logical return from the true return. */ #if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU) -# if defined (_ARCH_PPC) && !defined (_ARCH_PPC64) -# define GETRA_LDST(RA) (*(int32_t *)((RA) - 4)) -# elif defined(__arm__) +# if defined(__arm__) /* We define two insns between the return address and the branch back to straight-line. Find and decode that branch insn. */ # define GETRA_LDST(RA) tcg_getra_ldst(RA) diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c index 204ffbe..24a8621 100644 --- a/tcg/ppc/tcg-target.c +++ b/tcg/ppc/tcg-target.c @@ -550,22 +550,24 @@ static void add_qemu_ldst_label (TCGContext *s, label->label_ptr[0] = label_ptr; } -/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr, - int mmu_idx) */ +/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr, + * int mmu_idx, uintptr_t ra) + */ static const void * const qemu_ld_helpers[4] = { - helper_ldb_mmu, - helper_ldw_mmu, - helper_ldl_mmu, - helper_ldq_mmu, + helper_ret_ldub_mmu, + helper_ret_lduw_mmu, + helper_ret_ldul_mmu, + helper_ret_ldq_mmu, }; -/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr, - uintxx_t val, int mmu_idx) */ +/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr, + * uintxx_t val, int mmu_idx, uintptr_t ra) + */ static const void * const qemu_st_helpers[4] = { - helper_stb_mmu, - helper_stw_mmu, - helper_stl_mmu, - helper_stq_mmu, + helper_ret_stb_mmu, + helper_ret_stw_mmu, + helper_ret_stl_mmu, + helper_ret_stq_mmu, }; static void *ld_trampolines[4]; @@ -860,9 +862,9 @@ static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label) tcg_out_mov (s, TCG_TYPE_I32, ir++, 
label->addrhi_reg); tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg); #endif - tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index); + tcg_out_movi (s, TCG_TYPE_I32, ir++, mem_index); + tcg_out_movi (s, TCG_TYPE_I32, ir, (tcg_target_long) raddr); tcg_out_b (s, LK, (tcg_target_long) ld_trampolines[s_bits]); - tcg_out32 (s, (tcg_target_long) raddr); switch (opc) { case 0|4: tcg_out32 (s, EXTSB | RA (data_reg) | RS (3)); @@ -954,10 +956,10 @@ static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label) } ir++; - tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index); - tcg_out_b (s, LK, (tcg_target_long) st_trampolines[opc]); - tcg_out32 (s, (tcg_target_long) raddr); - tcg_out_b (s, 0, (tcg_target_long) raddr); + tcg_out_movi (s, TCG_TYPE_I32, ir++, mem_index); + tcg_out_movi (s, TCG_TYPE_I32, ir, (tcg_target_long) raddr); + tcg_out32 (s, MTSPR | RS (ir) | LR); + tcg_out_b (s, 0, (tcg_target_long) st_trampolines[opc]); } void tcg_out_tb_finalize(TCGContext *s) @@ -981,9 +983,6 @@ void tcg_out_tb_finalize(TCGContext *s) #ifdef CONFIG_SOFTMMU static void emit_ldst_trampoline (TCGContext *s, const void *ptr) { - tcg_out32 (s, MFSPR | RT (3) | LR); - tcg_out32 (s, ADDI | RT (3) | RA (3) | 4); - tcg_out32 (s, MTSPR | RS (3) | LR); tcg_out_mov (s, TCG_TYPE_I32, 3, TCG_AREG0); tcg_out_call (s, (tcg_target_long) ptr, 1, 0); }
These use a 32-bit load-of-immediate to save a mflr+addi+mtlr sequence. Tested with a Windows 98 guest (pretty much the most recent thing I could run on my PPC machine) and kvm-unit-tests's sieve.flat. The speed up for sieve.flat is as high as 10% for qemu-system-i386, 25% (no kidding) for qemu-system-x86_64 on my PowerBook G4. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- include/exec/exec-all.h | 4 +--- tcg/ppc/tcg-target.c | 41 ++++++++++++++++++++--------------------- 2 files changed, 21 insertions(+), 24 deletions(-)