Message ID: 1377639991-16028-8-git-send-email-rth@twiddle.net
State: New
On 08/27/2013 02:46 PM, Richard Henderson wrote:
> -    { INDEX_op_qemu_ld8u, { "r", "L" } },
> -    { INDEX_op_qemu_ld8s, { "r", "L" } },
> -    { INDEX_op_qemu_ld16u, { "r", "L" } },
> -    { INDEX_op_qemu_ld16s, { "r", "L" } },
> -    { INDEX_op_qemu_ld32, { "r", "L" } },
> -    { INDEX_op_qemu_ld32u, { "r", "L" } },
> -    { INDEX_op_qemu_ld32s, { "r", "L" } },
> -    { INDEX_op_qemu_ld64, { "r", "L" } },
> +    { INDEX_op_qemu_ld8u, { "a", "L" } },
> +    { INDEX_op_qemu_ld8s, { "a", "L" } },
> +    { INDEX_op_qemu_ld16u, { "a", "L" } },
> +    { INDEX_op_qemu_ld16s, { "a", "L" } },
> +    { INDEX_op_qemu_ld32, { "a", "L" } },
> +    { INDEX_op_qemu_ld32u, { "a", "L" } },
> +    { INDEX_op_qemu_ld32s, { "a", "L" } },
> +    { INDEX_op_qemu_ld64, { "a", "L" } },

I should have invented a new constraint letter here.  While this works,
and is ideal for softmmu, this unnecessarily penalizes non-softmmu.

r~
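For readers without the backend in front of them: constraint letters for
tcg/i386 are decoded in target_parse_constraint() in tcg/i386/tcg-target.c.
A minimal sketch of the kind of dedicated letter suggested above — the
letter 'e' and the CONFIG_SOFTMMU split are illustrative assumptions, not
something posted in this thread:

/* Sketch only: a constraint letter that pins the qemu_ld output to EAX
 * for softmmu builds (where the slow path tail-calls a helper whose
 * result lands in EAX), while user-mode builds keep the full "r" set.
 * The letter 'e' is an arbitrary, hypothetical choice. */
static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
{
    const char *ct_str = *pct_str;

    switch (ct_str[0]) {
    case 'e':
        ct->ct |= TCG_CT_REG;
#if defined(CONFIG_SOFTMMU)
        /* Softmmu: constrain to the helper return register.  */
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
#else
        /* User mode: no helper call on this path, any register will do.  */
        tcg_regset_set32(ct->u.regs, 0,
                         TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff);
#endif
        break;
        /* ... all existing cases unchanged ... */
    default:
        return -1;
    }
    ct_str++;
    *pct_str = ct_str;
    return 0;
}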
On 27/08/2013 23:46, Richard Henderson wrote:
> This does require the fast path always load to the function return
> value register, but apparently the loaded value usually needs to be
> spilled back to its memory slot anyway so the change in register
> does not really change much.

Even for something like

   mov (%rdi), %rax
   add (%r8), %rax

?  Memory operands should avoid the need to spill anything.

Is this change really an advantage considering the additional icache
footprint of the new helpers?

Paolo
On 08/29/2013 08:31 AM, Paolo Bonzini wrote:
> On 27/08/2013 23:46, Richard Henderson wrote:
>> This does require the fast path always load to the function return
>> value register, but apparently the loaded value usually needs to be
>> spilled back to its memory slot anyway so the change in register
>> does not really change much.
>
> Even for something like
>
>    mov (%rdi), %rax
>    add (%r8), %rax
>
> ?  Memory operands should avoid the need to spill anything.

No, not that kind of spilling.  The kind in which subsequent operations in
the TB perform something that requires the register backing store within
env be up to date.  Thus a "spill" from register to the memory slot.  I
could have used better verbiage, I guess...

In theory, having the load go to a call-saved register instead of the
call-clobbered return value register would mean that (even with updating
the memory backing store) if we had a future read-only use of the register
within the basic block we could re-use the value in the register.  It's
just that I didn't see that ever happen in the couple of MB of dumps that
I saw.  Thus my conclusion that it's rare enough not to worry about it.

> Is this change really an advantage considering the additional icache
> footprint of the new helpers?

We *are* getting fractionally smaller TBs, for the low low price of three
functions like

> 00000000005f2df0 <helper_ret_ldsw_mmu>:
>   5f2df0:  48 83 ec 18             sub    $0x18,%rsp
>   5f2df4:  64 48 8b 04 25 28 00    mov    %fs:0x28,%rax
>   5f2dfb:  00 00
>   5f2dfd:  48 89 44 24 08          mov    %rax,0x8(%rsp)
>   5f2e02:  31 c0                   xor    %eax,%eax
>   5f2e04:  e8 27 fe ff ff          callq  5f2c30 <helper_ret_lduw_mmu>
>   5f2e09:  48 8b 54 24 08          mov    0x8(%rsp),%rdx
>   5f2e0e:  64 48 33 14 25 28 00    xor    %fs:0x28,%rdx
>   5f2e15:  00 00
>   5f2e17:  48 0f bf c0             movswq %ax,%rax
>   5f2e1b:  75 05                   jne    5f2e22 <helper_ret_ldsw_mmu+0x32>
>   5f2e1d:  48 83 c4 18             add    $0x18,%rsp
>   5f2e21:  c3                      retq
>   5f2e22:  e8 79 a7 e1 ff          callq  40d5a0 <__stack_chk_fail@plt>
>   5f2e27:  66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
>   5f2e2e:  00 00

Spend 64 * 3 bytes on new helpers, and save 10 * zillion bytes on the TBs.
In all likelihood we're saving icache rather than using additional.

Where I do think there's cause for treading carefully is wrt Aurelien's
statement "it's the slow path exception, the call-return stack doesn't
matter".  Alternately, given that it *is* the slow path, who cares if the
return from the helper immediately hits a branch, rather than tail-calling
back into the fast path, if the benefit is that the call-return stack is
still valid above the code_gen_buffer after a simple tlb miss?

As an aside, why oh why do we default to -fstack-protector-all?  Do we
really need checks in every single function, as opposed to those that
actually do something with arrays?  Switch to plain -fstack-protector so
we have

> 00000000005a1fd0 <helper_ret_ldsw_mmu>:
>   5a1fd0:  48 83 ec 08             sub    $0x8,%rsp
>   5a1fd4:  e8 57 fe ff ff          callq  5a1e30 <helper_ret_lduw_mmu>
>   5a1fd9:  48 83 c4 08             add    $0x8,%rsp
>   5a1fdd:  48 0f bf c0             movswq %ax,%rax
>   5a1fe1:  c3                      retq
>   5a1fe2:  66 66 66 66 66 2e 0f    data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
>   5a1fe9:  1f 84 00 00 00 00 00

and then let's talk about icache savings...

r~
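The three wrappers in question are essentially sign-extending shims around
the existing unsigned helpers.  A minimal sketch of one, matching the
disassembly above and assuming the helper signature quoted in the patch;
the return type and the exact definitions in the real series may differ:

/* Sketch: call the unsigned helper, then sign-extend the result,
 * i.e. the "callq helper_ret_lduw_mmu; movswq %ax,%rax" seen above. */
uint64_t helper_ret_ldsw_mmu(CPUArchState *env, target_ulong addr,
                             int mmu_idx, uintptr_t ra)
{
    return (int16_t)helper_ret_lduw_mmu(env, addr, mmu_idx, ra);
}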
On 29/08/2013 18:08, Richard Henderson wrote:
> Where I do think there's cause for treading carefully is wrt Aurelien's
> statement "it's the slow path exception, the call-return stack doesn't
> matter".  Alternately, given that it *is* the slow path, who cares if the
> return from the helper immediately hits a branch, rather than tail-calling
> back into the fast path, if the benefit is that the call-return stack is
> still valid above the code_gen_buffer after a simple tlb miss?

Aurelien's comment was that lea+push+jmp is smaller than lea+call+ret,
which I can buy.

I guess it depends mostly on the hardware implementation of return branch
prediction, and _how much_ the call-return stack is broken.  PPC's
mtlr+b+...+blr and x86's push+jmp+...+ret are quite similar in this
respect, and they raise the same question: after the blr/ret, is the
entire predictor state broken, or will the processor simply take a miss
and still keep the remainder of the stack valid?  (For x86 it could in
principle see that the stack pointer is lower and thus keep the entries
above it.  For PPC it's not that simple, since LR is a callee-save
register, but there are probably plenty of tricks and heuristics that can
be employed.)

> As an aside, why oh why do we default to -fstack-protector-all?  Do we
> really need checks in every single function, as opposed to those that
> actually do something with arrays?  Switch to plain -fstack-protector so
> we have
>
>> 00000000005a1fd0 <helper_ret_ldsw_mmu>:
>>   5a1fd0:  48 83 ec 08             sub    $0x8,%rsp
>>   5a1fd4:  e8 57 fe ff ff          callq  5a1e30 <helper_ret_lduw_mmu>
>>   5a1fd9:  48 83 c4 08             add    $0x8,%rsp
>>   5a1fdd:  48 0f bf c0             movswq %ax,%rax
>>   5a1fe1:  c3                      retq
>>   5a1fe2:  66 66 66 66 66 2e 0f    data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
>>   5a1fe9:  1f 84 00 00 00 00 00
>
> and then let's talk about icache savings...

I think it was simply paranoia + not knowing the difference.  Patch
welcome, I guess.  (And I admit I only skimmed the patches, so I didn't
know how small the wrappers were.)

Paolo
On Tue, Aug 27, 2013 at 02:46:31PM -0700, Richard Henderson wrote:
> This does require the fast path always load to the function return
> value register, but apparently the loaded value usually needs to be
> spilled back to its memory slot anyway so the change in register
> does not really change much.

My suggestion was to only do that for the store, not for the load.

Forcing a single call-clobbered register for the load operations means we
are going to generate a lot more code for spilling the currently used
value, but also to reload it later.  You might argue that if the value is
a global, it has to be saved back to memory anyway, but spilling it
instead of saving it back means it has to be reloaded later.

Here is an example, from the first few TBs of SeaBIOS (current code on
the left, code with this patch on the right):

0x7fd21c18d241:  mov    %ebp,%edi                 0x7fcade58f241:  mov    %ebp,%edi
0x7fd21c18d243:  mov    %ebp,%esi                 0x7fcade58f243:  mov    %ebp,%esi
0x7fd21c18d245:  shr    $0x7,%edi                 0x7fcade58f245:  shr    $0x7,%edi
0x7fd21c18d248:  and    $0xfffff001,%esi          0x7fcade58f248:  and    $0xfffff001,%esi
0x7fd21c18d24e:  and    $0x1fe0,%edi              0x7fcade58f24e:  and    $0x1fe0,%edi
0x7fd21c18d254:  lea    0x380(%r14,%rdi,1),%rdi   0x7fcade58f254:  lea    0x380(%r14,%rdi,1),%rdi
0x7fd21c18d25c:  cmp    (%rdi),%esi               0x7fcade58f25c:  cmp    (%rdi),%esi
0x7fd21c18d25e:  mov    %ebp,%esi                 0x7fcade58f25e:  mov    %ebp,%esi
0x7fd21c18d260:  jne    0x7fd21c18d384            0x7fcade58f260:  jne    0x7fcade58f3a0
0x7fd21c18d266:  add    0x10(%rdi),%rsi           0x7fcade58f266:  add    0x10(%rdi),%rsi
0x7fd21c18d26a:  movzwl (%rsi),%ebx               0x7fcade58f26a:  movzwl (%rsi),%eax
0x7fd21c18d26d:  add    $0x2,%ebp                 0x7fcade58f26d:  add    $0x2,%ebp
                                                  0x7fcade58f270:  mov    %eax,0x80(%rsp)

Here %eax needs to be spilled as the value is needed for the next qemu_ld
operation.

0x7fd21c18d270:  mov    %ebp,%edi                 0x7fcade58f277:  mov    %ebp,%edi
0x7fd21c18d272:  mov    %ebp,%esi                 0x7fcade58f279:  mov    %ebp,%esi
0x7fd21c18d274:  shr    $0x7,%edi                 0x7fcade58f27b:  shr    $0x7,%edi
0x7fd21c18d277:  and    $0xfffff003,%esi          0x7fcade58f27e:  and    $0xfffff003,%esi
0x7fd21c18d27d:  and    $0x1fe0,%edi              0x7fcade58f284:  and    $0x1fe0,%edi
0x7fd21c18d283:  lea    0x380(%r14,%rdi,1),%rdi   0x7fcade58f28a:  lea    0x380(%r14,%rdi,1),%rdi
0x7fd21c18d28b:  cmp    (%rdi),%esi               0x7fcade58f292:  cmp    (%rdi),%esi
0x7fd21c18d28d:  mov    %ebp,%esi                 0x7fcade58f294:  mov    %ebp,%esi
0x7fd21c18d28f:  jne    0x7fd21c18d39d            0x7fcade58f296:  jne    0x7fcade58f3b2
0x7fd21c18d295:  add    0x10(%rdi),%rsi           0x7fcade58f29c:  add    0x10(%rdi),%rsi
0x7fd21c18d299:  mov    (%rsi),%ebp               0x7fcade58f2a0:  mov    (%rsi),%eax
0x7fd21c18d29b:  and    $0xffffff,%ebp            0x7fcade58f2a2:  and    $0xffffff,%eax
0x7fd21c18d2a1:  mov    %ebp,0xd8(%r14)           0x7fcade58f2a8:  mov    %eax,0xd8(%r14)
                                                  0x7fcade58f2b6:  mov    %ebp,0xdc(%r14)

And here it has to be reloaded from memory.

0x7fd21c18d2af:  mov    0x58(%r14),%ebp           0x7fcade58f2af:  mov    0x80(%rsp),%ebp
0x7fd21c18d2a8:  mov    %ebx,0xdc(%r14)           0x7fcade58f2bd:  mov    0x58(%r14),%ebp

Overall it even makes the TB bigger.  As the fast path is really what is
important and executed a lot more often than the slow path, we should not
deoptimize the fast path to get a smaller slow path.
On 08/29/2013 10:06 AM, Aurelien Jarno wrote:
> On Tue, Aug 27, 2013 at 02:46:31PM -0700, Richard Henderson wrote:
>> This does require the fast path always load to the function return
>> value register, but apparently the loaded value usually needs to be
>> spilled back to its memory slot anyway so the change in register
>> does not really change much.
>
> My suggestion was to only do that for the store, not for the load.

Ah, ok.  Fair enough.  Shall I include the tail-call for the store only if
it saves code space, given any possible concern about the call-return
stack?

I'm thinking here of PPC, which is 100% space neutral on this, exchanging
one B for one MTLR in order to implement the tail call.

r~
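For concreteness, the store-only variant under discussion would presumably
just end tcg_out_qemu_st_slow_path() with the same push-and-jump trick this
patch uses for loads.  A sketch, assuming retaddr, s_bits and a
qemu_st_helpers[] array are set up on the store side the same way the load
side does it:

    /* Sketch only: tail of the i386 store slow path, mirroring the
     * load-side hunk in this patch.  retaddr already holds l->raddr. */
    tcg_out_push(s, retaddr);                  /* fake return address */
    tcg_out_jmp(s, (tcg_target_long)qemu_st_helpers[s_bits]);
                                               /* helper's RET lands on l->raddr */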
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 5aee0fa..b1d05b8 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -1025,11 +1025,20 @@ static void tcg_out_jmp(TCGContext *s, tcg_target_long dest)
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     int mmu_idx, uintptr_t ra)
  */
-static const void * const qemu_ld_helpers[4] = {
+static const void * const qemu_ld_helpers[8] = {
     helper_ret_ldub_mmu,
     helper_ret_lduw_mmu,
     helper_ret_ldul_mmu,
     helper_ret_ldq_mmu,
+
+    helper_ret_ldsb_mmu,
+    helper_ret_ldsw_mmu,
+#if TCG_TARGET_REG_BITS == 64
+    helper_ret_ldsl_mmu,
+#else
+    helper_ret_ldul_mmu,
+#endif
+    helper_ret_ldq_mmu,
 };
 
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
@@ -1473,9 +1482,8 @@ static void add_qemu_ldst_label(TCGContext *s,
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
     int opc = l->opc;
-    int s_bits = opc & 3;
-    TCGReg data_reg;
     uint8_t **label_ptr = &l->label_ptr[0];
+    TCGReg retaddr;
 
     /* resolve label address */
     *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
@@ -1500,58 +1508,21 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
         tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index);
         ofs += 4;
 
-        tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, (uintptr_t)l->raddr);
+        retaddr = TCG_REG_EAX;
+        tcg_out_movi(s, TCG_TYPE_I32, retaddr, (uintptr_t)l->raddr);
+        tcg_out_st(s, TCG_TYPE_I32, retaddr, TCG_REG_ESP, ofs);
     } else {
         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
         /* The second argument is already loaded with addrlo.  */
         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2],
                      l->mem_index);
-        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
-                     (uintptr_t)l->raddr);
-    }
-
-    tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
-
-    data_reg = l->datalo_reg;
-    switch(opc) {
-    case 0 | 4:
-        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
-        break;
-    case 1 | 4:
-        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
-        break;
-    case 0:
-        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
-        break;
-    case 1:
-        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
-        break;
-    case 2:
-        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-        break;
-#if TCG_TARGET_REG_BITS == 64
-    case 2 | 4:
-        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
-        break;
-#endif
-    case 3:
-        if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
-        } else if (data_reg == TCG_REG_EDX) {
-            /* xchg %edx, %eax */
-            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
-            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
-        }
-        break;
-    default:
-        tcg_abort();
+        retaddr = tcg_target_call_iarg_regs[3];
+        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
     }
 
-    /* Jump to the code corresponding to next IR of qemu_st */
-    tcg_out_jmp(s, (tcg_target_long)l->raddr);
+    /* "Tail call" to the helper, with the return address back inline.  */
+    tcg_out_push(s, retaddr);
+    tcg_out_jmp(s, (tcg_target_long)qemu_ld_helpers[opc]);
 }
 
 /*
@@ -2125,38 +2096,38 @@ static const TCGTargetOpDef x86_op_defs[] = {
 #endif
 
 #if TCG_TARGET_REG_BITS == 64
-    { INDEX_op_qemu_ld8u, { "r", "L" } },
-    { INDEX_op_qemu_ld8s, { "r", "L" } },
-    { INDEX_op_qemu_ld16u, { "r", "L" } },
-    { INDEX_op_qemu_ld16s, { "r", "L" } },
-    { INDEX_op_qemu_ld32, { "r", "L" } },
-    { INDEX_op_qemu_ld32u, { "r", "L" } },
-    { INDEX_op_qemu_ld32s, { "r", "L" } },
-    { INDEX_op_qemu_ld64, { "r", "L" } },
+    { INDEX_op_qemu_ld8u, { "a", "L" } },
+    { INDEX_op_qemu_ld8s, { "a", "L" } },
+    { INDEX_op_qemu_ld16u, { "a", "L" } },
+    { INDEX_op_qemu_ld16s, { "a", "L" } },
+    { INDEX_op_qemu_ld32, { "a", "L" } },
+    { INDEX_op_qemu_ld32u, { "a", "L" } },
+    { INDEX_op_qemu_ld32s, { "a", "L" } },
+    { INDEX_op_qemu_ld64, { "a", "L" } },
 
     { INDEX_op_qemu_st8, { "L", "L" } },
     { INDEX_op_qemu_st16, { "L", "L" } },
     { INDEX_op_qemu_st32, { "L", "L" } },
     { INDEX_op_qemu_st64, { "L", "L" } },
 #elif TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
-    { INDEX_op_qemu_ld8u, { "r", "L" } },
-    { INDEX_op_qemu_ld8s, { "r", "L" } },
-    { INDEX_op_qemu_ld16u, { "r", "L" } },
-    { INDEX_op_qemu_ld16s, { "r", "L" } },
-    { INDEX_op_qemu_ld32, { "r", "L" } },
-    { INDEX_op_qemu_ld64, { "r", "r", "L" } },
+    { INDEX_op_qemu_ld8u, { "a", "L" } },
+    { INDEX_op_qemu_ld8s, { "a", "L" } },
+    { INDEX_op_qemu_ld16u, { "a", "L" } },
+    { INDEX_op_qemu_ld16s, { "a", "L" } },
+    { INDEX_op_qemu_ld32, { "a", "L" } },
+    { INDEX_op_qemu_ld64, { "a", "d", "L" } },
 
     { INDEX_op_qemu_st8, { "cb", "L" } },
     { INDEX_op_qemu_st16, { "L", "L" } },
     { INDEX_op_qemu_st32, { "L", "L" } },
     { INDEX_op_qemu_st64, { "L", "L", "L" } },
 #else
-    { INDEX_op_qemu_ld8u, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld8s, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld16u, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld16s, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld32, { "r", "L", "L" } },
-    { INDEX_op_qemu_ld64, { "r", "r", "L", "L" } },
+    { INDEX_op_qemu_ld8u, { "a", "L", "L" } },
+    { INDEX_op_qemu_ld8s, { "a", "L", "L" } },
+    { INDEX_op_qemu_ld16u, { "a", "L", "L" } },
+    { INDEX_op_qemu_ld16s, { "a", "L", "L" } },
+    { INDEX_op_qemu_ld32, { "a", "L", "L" } },
+    { INDEX_op_qemu_ld64, { "a", "d", "L", "L" } },
 
     { INDEX_op_qemu_st8, { "cb", "L", "L" } },
     { INDEX_op_qemu_st16, { "L", "L", "L" } },
This does require the fast path always load to the function return
value register, but apparently the loaded value usually needs to be
spilled back to its memory slot anyway so the change in register
does not really change much.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c | 107 ++++++++++++++++++--------------------------------
 1 file changed, 39 insertions(+), 68 deletions(-)