Message ID | 1255696735-21396-2-git-send-email-uli@suse.de |
---|---|
State | New |
Headers | show |
On Fri, Oct 16, 2009 at 02:38:47PM +0200, Ulrich Hecht wrote: > sync allows concurrent accesses to locations in memory through different TCG > variables. This comes in handy when you are emulating CPU registers that can > be used as either 32 or 64 bit, as TCG doesn't know anything about aliases. > See the s390x target for an example. > > Fixed sync_i64 build failure on 32-bit targets. It don't really see the point of such a new op, especially the way it is used in the S390 target. If a global is "synced" before each load/store, it will be load/stored from/to memory each time it is used. This is exactly what tcg_gen_ld/st do, except it's only one op. The benefit of globals in TCG is to hold them as long as possible in host register and avoid costly memory load/store. tcg_gen_ld/st would probably be even more efficient, as it is one op instead of two, and also because mapping more globals means more time spent in the code looping over all globals. IMHO, the correct way to do it is to use the following code, assuming you want to use 64-bit TCG regs to hold 32-bit values (that's something that is not really clear in your next patch): - for register load: | static TCGv load_reg(int reg) | { | TCGv r = tcg_temp_new_i64(); | tcg_gen_ext32u_i64(r, tcgregs[reg]); | return r; | } | | static void store_reg32(int reg, TCGv v) | { | tcg_gen_ext32u_i64(v, v); /* may be optional */ | tcg_gen_andi_i64(tcgregs[reg], tcgregs[reg], 0xffffffff00000000ULL); | tcg_gen_or_i64(tcgregs[reg], tcgregs[reg], v); | } If you want to do the same using 32-bit TCG regs: | static TCGv_i32 load_reg(int reg) | { | TCGv_i32 r = tcg_temp_new_i32(); | tcg_gen_extu_i32_i64(r, tcgregs[reg]); | return r; | } | | static void store_reg32(int reg, TCGv_i32 v) | { | TCGv tmp = tcg_temp_new(); | tcg_gen_extu_i32_i64(tmp, v); | tcg_gen_andi_i64(tcgregs[reg], tcgregs[reg], 0xffffffff00000000ULL); | tcg_gen_or_i64(tcgregs[reg], tcgregs[reg], tmp); | tcg_temp_free(tmp); | } Regards, Aurelien > Signed-off-by: 
Ulrich Hecht <uli@suse.de> > --- > tcg/tcg-op.h | 12 ++++++++++++ > tcg/tcg-opc.h | 2 ++ > tcg/tcg.c | 6 ++++++ > 3 files changed, 20 insertions(+), 0 deletions(-) > > diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h > index faf2e8b..c1b4710 100644 > --- a/tcg/tcg-op.h > +++ b/tcg/tcg-op.h > @@ -316,6 +316,18 @@ static inline void tcg_gen_br(int label) > tcg_gen_op1i(INDEX_op_br, label); > } > > +static inline void tcg_gen_sync_i32(TCGv_i32 arg) > +{ > + tcg_gen_op1_i32(INDEX_op_sync_i32, arg); > +} > + > +#if TCG_TARGET_REG_BITS == 64 > +static inline void tcg_gen_sync_i64(TCGv_i64 arg) > +{ > + tcg_gen_op1_i64(INDEX_op_sync_i64, arg); > +} > +#endif > + > static inline void tcg_gen_mov_i32(TCGv_i32 ret, TCGv_i32 arg) > { > if (!TCGV_EQUAL_I32(ret, arg)) > diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h > index b7f3fd7..5dcdeba 100644 > --- a/tcg/tcg-opc.h > +++ b/tcg/tcg-opc.h > @@ -40,6 +40,7 @@ DEF2(call, 0, 1, 2, TCG_OPF_SIDE_EFFECTS) /* variable number of parameters */ > DEF2(jmp, 0, 1, 0, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS) > DEF2(br, 0, 0, 1, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS) > > +DEF2(sync_i32, 0, 1, 0, 0) > DEF2(mov_i32, 1, 1, 0, 0) > DEF2(movi_i32, 1, 0, 1, 0) > /* load/store */ > @@ -109,6 +110,7 @@ DEF2(neg_i32, 1, 1, 0, 0) > #endif > > #if TCG_TARGET_REG_BITS == 64 > +DEF2(sync_i64, 0, 1, 0, 0) > DEF2(mov_i64, 1, 1, 0, 0) > DEF2(movi_i64, 1, 0, 1, 0) > /* load/store */ > diff --git a/tcg/tcg.c b/tcg/tcg.c > index 3c0e296..8eb60f8 100644 > --- a/tcg/tcg.c > +++ b/tcg/tcg.c > @@ -1930,6 +1930,12 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf, > // dump_regs(s); > #endif > switch(opc) { > + case INDEX_op_sync_i32: > +#if TCG_TARGET_REG_BITS == 64 > + case INDEX_op_sync_i64: > +#endif > + temp_save(s, args[0], s->reserved_regs); > + break; > case INDEX_op_mov_i32: > #if TCG_TARGET_REG_BITS == 64 > case INDEX_op_mov_i64: > -- > 1.6.2.1 > > > >
On Friday 16 October 2009, Aurelien Jarno wrote: > IMHO, the correct way to do it is to use the following code, assuming > you want to use 64-bit TCG regs to hold 32-bit values (that's > something that is not really clear in your next patch): > > - for register load: > | static TCGv load_reg(int reg) > | { > | TCGv r = tcg_temp_new_i64(); > | tcg_gen_ext32u_i64(r, tcgregs[reg]); > | return r; > | } > | > | static void store_reg32(int reg, TCGv v) > | { > | tcg_gen_ext32u_i64(v, v); /* may be optional */ > | tcg_gen_andi_i64(tcgregs[reg], tcgregs[reg], > | 0xffffffff00000000ULL); tcg_gen_or_i64(tcgregs[reg], tcgregs[reg], > | v); > | } This is _extremely_ detrimental to performance. The point of the sync op is that in most cases it's a nop because registers are usually used with the same bitness again and again. The sign extension/masking stuff is done every time a register is accessed as 32 bits, which is the most common case. Compare the translation of the following sequence of instructions: IN: _dl_aux_init 0x0000000080044ff6: lhi %r4,0 0x0000000080044ffa: lhi %r5,0 0x0000000080044ffe: lhi %r0,0 with sync: OP: mov_i32 loc0,global_cc movi_i32 tmp1,$0x0 sync_i64 R4 mov_i32 r4,tmp1 movi_i32 tmp1,$0x0 sync_i64 R5 mov_i32 r5,tmp1 movi_i32 tmp1,$0x0 sync_i64 R0 mov_i32 r0,tmp1 mov_i32 global_cc,loc0 movi_i64 tmp2,$0x80045002 st_i64 tmp2,env,$0x158 exit_tb $0x0 OUT: [size=61] 0x6019a030: mov 0x160(%r14),%ebp 0x6019a037: mov %rbp,%rbx 0x6019a03a: mov $0x80045002,%r12d 0x6019a040: mov %r12,0x158(%r14) 0x6019a047: mov %ebp,0xd1a0(%r14) 0x6019a04e: mov %ebx,0x160(%r14) 0x6019a055: xor %ebp,%ebp 0x6019a057: mov %ebp,(%r14) 0x6019a05a: xor %ebp,%ebp 0x6019a05c: mov %ebp,0x20(%r14) 0x6019a060: xor %ebp,%ebp 0x6019a062: mov %ebp,0x28(%r14) 0x6019a066: xor %eax,%eax 0x6019a068: jmpq 0x621dc8ce with sign extension: OP: mov_i32 loc0,global_cc movi_i32 tmp1,$0x0 ext32u_i64 tmp1,tmp1 movi_i64 tmp2,$0xffffffff00000000 and_i64 R4,R4,tmp2 or_i64 R4,R4,tmp1 movi_i32 tmp1,$0x0 ext32u_i64 
tmp1,tmp1 movi_i64 tmp2,$0xffffffff00000000 and_i64 R5,R5,tmp2 or_i64 R5,R5,tmp1 movi_i32 tmp1,$0x0 ext32u_i64 tmp1,tmp1 movi_i64 tmp2,$0xffffffff00000000 and_i64 R0,R0,tmp2 or_i64 R0,R0,tmp1 mov_i32 global_cc,loc0 movi_i64 tmp2,$0x80045002 st_i64 tmp2,env,$0x158 exit_tb $0x0 OUT: [size=126] 0x6019af10: mov 0x160(%r14),%ebp 0x6019af17: xor %ebx,%ebx 0x6019af19: mov %ebx,%ebx 0x6019af1b: mov 0x20(%r14),%r12 0x6019af1f: mov $0xffffffff00000000,%r13 0x6019af29: and %r13,%r12 0x6019af2c: or %rbx,%r12 0x6019af2f: xor %ebx,%ebx 0x6019af31: mov %ebx,%ebx 0x6019af33: mov 0x28(%r14),%r13 0x6019af37: mov $0xffffffff00000000,%r15 0x6019af41: and %r15,%r13 0x6019af44: or %rbx,%r13 0x6019af47: xor %ebx,%ebx 0x6019af49: mov %ebx,%ebx 0x6019af4b: mov (%r14),%r15 0x6019af4e: mov $0xffffffff00000000,%r10 0x6019af58: and %r10,%r15 0x6019af5b: or %rbx,%r15 0x6019af5e: mov %rbp,%rbx 0x6019af61: mov $0x80045002,%r10d 0x6019af67: mov %r10,0x158(%r14) 0x6019af6e: mov %ebp,0xd1a0(%r14) 0x6019af75: mov %ebx,0x160(%r14) 0x6019af7c: mov %r15,(%r14) 0x6019af7f: mov %r12,0x20(%r14) 0x6019af83: mov %r13,0x28(%r14) 0x6019af87: xor %eax,%eax 0x6019af89: jmpq 0x621dd78e Its more than twice the size and has ten memory accesses instead of seven. CU Uli
On Fri, Oct 16, 2009 at 06:37:31PM +0200, Ulrich Hecht wrote: > On Friday 16 October 2009, Aurelien Jarno wrote: > > IMHO, the correct way to do it is to use the following code, assuming > > you want to use 64-bit TCG regs to hold 32-bit values (that's > > something that is not really clear in your next patch): > > > > - for register load: > > | static TCGv load_reg(int reg) > > | { > > | TCGv r = tcg_temp_new_i64(); > > | tcg_gen_ext32u_i64(r, tcgregs[reg]); > > | return r; > > | } > > | > > | static void store_reg32(int reg, TCGv v) > > | { > > | tcg_gen_ext32u_i64(v, v); /* may be optional */ > > | tcg_gen_andi_i64(tcgregs[reg], tcgregs[reg], > > | 0xffffffff00000000ULL); tcg_gen_or_i64(tcgregs[reg], tcgregs[reg], > > | v); > > | } > > This is _extremely_ detrimental to performance. The point of the sync op > is that in most cases it's a nop because registers are usually used with > the same bitness again and again. The sign extension/masking stuff is I don't really understand how it can simply be a nop, given it calls temp_save. It means if a register is used twice in a BB, it is fetched again from memory. > done every time a register is accessed as 32 bits, which is the most > common case. Compare the translation of the following sequence of > instructions: > > IN: _dl_aux_init > 0x0000000080044ff6: lhi %r4,0 > 0x0000000080044ffa: lhi %r5,0 > 0x0000000080044ffe: lhi %r0,0 This example is a bit biased, as registers are only saved, and never reused. Let's comment on it though. 
> with sync: > > OP: > mov_i32 loc0,global_cc > movi_i32 tmp1,$0x0 > sync_i64 R4 > mov_i32 r4,tmp1 > movi_i32 tmp1,$0x0 > sync_i64 R5 > mov_i32 r5,tmp1 > movi_i32 tmp1,$0x0 > sync_i64 R0 > mov_i32 r0,tmp1 > mov_i32 global_cc,loc0 > movi_i64 tmp2,$0x80045002 > st_i64 tmp2,env,$0x158 > exit_tb $0x0 > > OUT: [size=61] > 0x6019a030: mov 0x160(%r14),%ebp > 0x6019a037: mov %rbp,%rbx > 0x6019a03a: mov $0x80045002,%r12d > 0x6019a040: mov %r12,0x158(%r14) > 0x6019a047: mov %ebp,0xd1a0(%r14) > 0x6019a04e: mov %ebx,0x160(%r14) > 0x6019a055: xor %ebp,%ebp > 0x6019a057: mov %ebp,(%r14) > 0x6019a05a: xor %ebp,%ebp > 0x6019a05c: mov %ebp,0x20(%r14) > 0x6019a060: xor %ebp,%ebp > 0x6019a062: mov %ebp,0x28(%r14) > 0x6019a066: xor %eax,%eax > 0x6019a068: jmpq 0x621dc8ce > > > with sign extension: > > OP: > mov_i32 loc0,global_cc > movi_i32 tmp1,$0x0 > ext32u_i64 tmp1,tmp1 > movi_i64 tmp2,$0xffffffff00000000 > and_i64 R4,R4,tmp2 > or_i64 R4,R4,tmp1 > movi_i32 tmp1,$0x0 > ext32u_i64 tmp1,tmp1 > movi_i64 tmp2,$0xffffffff00000000 > and_i64 R5,R5,tmp2 > or_i64 R5,R5,tmp1 > movi_i32 tmp1,$0x0 > ext32u_i64 tmp1,tmp1 > movi_i64 tmp2,$0xffffffff00000000 > and_i64 R0,R0,tmp2 > or_i64 R0,R0,tmp1 > mov_i32 global_cc,loc0 > movi_i64 tmp2,$0x80045002 > st_i64 tmp2,env,$0x158 > exit_tb $0x0 > > OUT: [size=126] > 0x6019af10: mov 0x160(%r14),%ebp > 0x6019af17: xor %ebx,%ebx > 0x6019af19: mov %ebx,%ebx > 0x6019af1b: mov 0x20(%r14),%r12 > 0x6019af1f: mov $0xffffffff00000000,%r13 > 0x6019af29: and %r13,%r12 > 0x6019af2c: or %rbx,%r12 > 0x6019af2f: xor %ebx,%ebx > 0x6019af31: mov %ebx,%ebx > 0x6019af33: mov 0x28(%r14),%r13 > 0x6019af37: mov $0xffffffff00000000,%r15 > 0x6019af41: and %r15,%r13 > 0x6019af44: or %rbx,%r13 > 0x6019af47: xor %ebx,%ebx > 0x6019af49: mov %ebx,%ebx > 0x6019af4b: mov (%r14),%r15 > 0x6019af4e: mov $0xffffffff00000000,%r10 > 0x6019af58: and %r10,%r15 > 0x6019af5b: or %rbx,%r15 > 0x6019af5e: mov %rbp,%rbx > 0x6019af61: mov $0x80045002,%r10d > 0x6019af67: mov %r10,0x158(%r14) > 
0x6019af6e: mov %ebp,0xd1a0(%r14) > 0x6019af75: mov %ebx,0x160(%r14) > 0x6019af7c: mov %r15,(%r14) > 0x6019af7f: mov %r12,0x20(%r14) > 0x6019af83: mov %r13,0x28(%r14) > 0x6019af87: xor %eax,%eax > 0x6019af89: jmpq 0x621dd78e > > Its more than twice the size and has ten memory accesses instead of > seven. > There is clearly a huge impact for saving the globals, that I didn't expect. I still believe a sync op as implemented in your patch is in the opposite direction to TCG's philosophy, probably does not work with fixed registers, and I don't understand what is the gain compared to the use of tcg_gen_ld/st. Maybe you can post a dump of such a version so that we can see the benefit? I think instead of a sync op, we should think of a way to use only the low part of a global variable, maybe by adding new ops. That can also help to improve the concat_i32_i64 op on 64-bit hosts. Does someone have any idea? Aurelien
On Fri, Oct 16, 2009 at 05:52:21PM +0200, Aurelien Jarno wrote: > On Fri, Oct 16, 2009 at 02:38:47PM +0200, Ulrich Hecht wrote: > > sync allows concurrent accesses to locations in memory through different TCG > > variables. This comes in handy when you are emulating CPU registers that can > > be used as either 32 or 64 bit, as TCG doesn't know anything about aliases. > > See the s390x target for an example. > > > > Fixed sync_i64 build failure on 32-bit targets. > > It don't really see the point of such a new op, especially the way it is > used in the S390 target. Hi, I looked at the s390 patches and was also unsure about this sync op. I'm not convinced it's bad but my first feeling was as Aurelien points out that the translator should take care of it. Not sure though, it would be nice to hear what other people think. Another thing I noticed was the large amount of helpers. Without looking at the details my feeling was that you could probably do more at translation time. That kind of optimization can be done incrementally with follow-up patches though. Other than that the s390 series looks OK and should IMO get committed. Nice work! Cheers
On Friday 16 October 2009, Aurelien Jarno wrote: > This example is a bit biased, as registers are only saved, and never > reused. Let's comment on it though. Yeah, well, I searched from the top for the first case where it makes a difference. If it's of any help, I can upload a complete dump of both versions somewhere. > and I don't understand what is the gain compared to > the use of tcg_gen_ld/st. There are two sets of TCG values, tcgregs (which would arguably be better called tcgregs64) and tcgregs32. When doing a 32-bit access, tcgregs(64) is synced, which is a nop if tcgregs(64) hasn't been touched. When doing a 64-bit access, tcgregs32 is synced, which is a nop if tcgregs32 hasn't been touched. In practice, 32-bit accesses followed by 64-bit accesses and vice versa are very rare, so in most cases, sync is a nop. tcg_gen_ld/st is never a nop. That's the benefit. CU Uli
On Saturday 17 October 2009, Edgar E. Iglesias wrote: > I looked at the s390 patches and was also unsure about this sync op. > I'm not convinced it's bad but my first feeling was as Aurelien points > out that the translator should take care of it. Indeed. I would have expected it to, in fact. But it doesn't. The sync op is the simplest and quickest way to get what I want that I could come up with. I'd be perfectly happy if TCG could handle aliases on its own, but doing a lot of ALU operations on every register access is not an option. > Another thing I noticed was the large amount of helpers. Without > looking at the details my feeling was that you could probably do more > at translation time. My experience is that helper functions have an undeservedly bad image. A pure const helper call is very easy to optimize away for TCG. Random bit shifting, comparing and branching isn't. > Nice work! Thank you. CU Uli
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h index faf2e8b..c1b4710 100644 --- a/tcg/tcg-op.h +++ b/tcg/tcg-op.h @@ -316,6 +316,18 @@ static inline void tcg_gen_br(int label) tcg_gen_op1i(INDEX_op_br, label); } +static inline void tcg_gen_sync_i32(TCGv_i32 arg) +{ + tcg_gen_op1_i32(INDEX_op_sync_i32, arg); +} + +#if TCG_TARGET_REG_BITS == 64 +static inline void tcg_gen_sync_i64(TCGv_i64 arg) +{ + tcg_gen_op1_i64(INDEX_op_sync_i64, arg); +} +#endif + static inline void tcg_gen_mov_i32(TCGv_i32 ret, TCGv_i32 arg) { if (!TCGV_EQUAL_I32(ret, arg)) diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h index b7f3fd7..5dcdeba 100644 --- a/tcg/tcg-opc.h +++ b/tcg/tcg-opc.h @@ -40,6 +40,7 @@ DEF2(call, 0, 1, 2, TCG_OPF_SIDE_EFFECTS) /* variable number of parameters */ DEF2(jmp, 0, 1, 0, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS) DEF2(br, 0, 0, 1, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS) +DEF2(sync_i32, 0, 1, 0, 0) DEF2(mov_i32, 1, 1, 0, 0) DEF2(movi_i32, 1, 0, 1, 0) /* load/store */ @@ -109,6 +110,7 @@ DEF2(neg_i32, 1, 1, 0, 0) #endif #if TCG_TARGET_REG_BITS == 64 +DEF2(sync_i64, 0, 1, 0, 0) DEF2(mov_i64, 1, 1, 0, 0) DEF2(movi_i64, 1, 0, 1, 0) /* load/store */ diff --git a/tcg/tcg.c b/tcg/tcg.c index 3c0e296..8eb60f8 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -1930,6 +1930,12 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf, // dump_regs(s); #endif switch(opc) { + case INDEX_op_sync_i32: +#if TCG_TARGET_REG_BITS == 64 + case INDEX_op_sync_i64: +#endif + temp_save(s, args[0], s->reserved_regs); + break; case INDEX_op_mov_i32: #if TCG_TARGET_REG_BITS == 64 case INDEX_op_mov_i64:
sync allows concurrent accesses to locations in memory through different TCG variables. This comes in handy when you are emulating CPU registers that can be used as either 32 or 64 bit, as TCG doesn't know anything about aliases. See the s390x target for an example. Fixed sync_i64 build failure on 32-bit targets. Signed-off-by: Ulrich Hecht <uli@suse.de> --- tcg/tcg-op.h | 12 ++++++++++++ tcg/tcg-opc.h | 2 ++ tcg/tcg.c | 6 ++++++ 3 files changed, 20 insertions(+), 0 deletions(-)