Patchwork [1/9] TCG "sync" op

login
register
mail settings
Submitter Ulrich Hecht
Date Oct. 16, 2009, 12:38 p.m.
Message ID <1255696735-21396-2-git-send-email-uli@suse.de>
Download mbox | patch
Permalink /patch/36206/
State New
Headers show

Comments

Ulrich Hecht - Oct. 16, 2009, 12:38 p.m.
sync allows concurrent accesses to locations in memory through different TCG
variables. This comes in handy when you are emulating CPU registers that can
be used as either 32 or 64 bit, as TCG doesn't know anything about aliases.
See the s390x target for an example.

Fixed sync_i64 build failure on 32-bit targets.

Signed-off-by: Ulrich Hecht <uli@suse.de>
---
 tcg/tcg-op.h  |   12 ++++++++++++
 tcg/tcg-opc.h |    2 ++
 tcg/tcg.c     |    6 ++++++
 3 files changed, 20 insertions(+), 0 deletions(-)
Aurelien Jarno - Oct. 16, 2009, 3:52 p.m.
On Fri, Oct 16, 2009 at 02:38:47PM +0200, Ulrich Hecht wrote:
> sync allows concurrent accesses to locations in memory through different TCG
> variables. This comes in handy when you are emulating CPU registers that can
> be used as either 32 or 64 bit, as TCG doesn't know anything about aliases.
> See the s390x target for an example.
> 
> Fixed sync_i64 build failure on 32-bit targets.

It don't really see the point of such a new op, especially the way it is
used in the S390 target.

If a global is "synced" before each load/store, it will be load/stored 
from/to memory each time it is used. This is exactly what tcg_gen_ld/st
do, except it's only one op. The benefit of globals in TCG is to hold 
them as long as possible in host register and avoid costly memory 
load/store. tcg_gen_ld/st would probably be even more efficient, as 
it is one op instead of two, and also because mapping more globals 
means more time spent in the code looping over all globals.

IMHO, the correct way to do it is to use the following code, assuming 
you want to use 64-bit TCG regs to hold 32-bit values (that's something
that is not really clear in your next patch):

- for register load:

| static TCGv load_reg(int reg)
| {
|    TCGv r = tcg_temp_new_i64();
|    tcg_gen_ext32u_i64(r, tcgregs[reg]);
|    return r;
| }
|
| static void store_reg32(int reg, TCGv v)
| {
|    tcg_gen_ext32u_i64(v, v); /* may be optional */
|    tcg_gen_andi_i64(tcgregs[reg], tcgregs[reg], 0xffffffff00000000ULL);
|    tcg_gen_or_i64(tcgregs[reg], tcgregs[reg], v);
| }

If you want to do the same using 32-bit TCG regs:

| static TCGv_i32 load_reg(int reg)
| {
|    TCGv_i32 r = tcg_temp_new_i32();
|    tcg_gen_extu_i32_i64(r, tcgregs[reg]);
|    return r;
| }
|
| static void store_reg32(int reg, TCGv_i32 v)
| {
|    TCGv tmp = tcg_temp_new();
|    tcg_gen_extu_i32_i64(tmp, v);
|    tcg_gen_andi_i64(tcgregs[reg], tcgregs[reg], 0xffffffff00000000ULL);
|    tcg_gen_or_i64(tcgregs[reg], tcgregs[reg], tmp);
|    tcg_temp_free(tmp);
| }

Regards,
Aurelien

> Signed-off-by: Ulrich Hecht <uli@suse.de>
> ---
>  tcg/tcg-op.h  |   12 ++++++++++++
>  tcg/tcg-opc.h |    2 ++
>  tcg/tcg.c     |    6 ++++++
>  3 files changed, 20 insertions(+), 0 deletions(-)
> 
> diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
> index faf2e8b..c1b4710 100644
> --- a/tcg/tcg-op.h
> +++ b/tcg/tcg-op.h
> @@ -316,6 +316,18 @@ static inline void tcg_gen_br(int label)
>      tcg_gen_op1i(INDEX_op_br, label);
>  }
>  
> +static inline void tcg_gen_sync_i32(TCGv_i32 arg)
> +{
> +    tcg_gen_op1_i32(INDEX_op_sync_i32, arg);
> +}
> +
> +#if TCG_TARGET_REG_BITS == 64
> +static inline void tcg_gen_sync_i64(TCGv_i64 arg)
> +{
> +    tcg_gen_op1_i64(INDEX_op_sync_i64, arg);
> +}
> +#endif
> +
>  static inline void tcg_gen_mov_i32(TCGv_i32 ret, TCGv_i32 arg)
>  {
>      if (!TCGV_EQUAL_I32(ret, arg))
> diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
> index b7f3fd7..5dcdeba 100644
> --- a/tcg/tcg-opc.h
> +++ b/tcg/tcg-opc.h
> @@ -40,6 +40,7 @@ DEF2(call, 0, 1, 2, TCG_OPF_SIDE_EFFECTS) /* variable number of parameters */
>  DEF2(jmp, 0, 1, 0, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS)
>  DEF2(br, 0, 0, 1, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS)
>  
> +DEF2(sync_i32, 0, 1, 0, 0)
>  DEF2(mov_i32, 1, 1, 0, 0)
>  DEF2(movi_i32, 1, 0, 1, 0)
>  /* load/store */
> @@ -109,6 +110,7 @@ DEF2(neg_i32, 1, 1, 0, 0)
>  #endif
>  
>  #if TCG_TARGET_REG_BITS == 64
> +DEF2(sync_i64, 0, 1, 0, 0)
>  DEF2(mov_i64, 1, 1, 0, 0)
>  DEF2(movi_i64, 1, 0, 1, 0)
>  /* load/store */
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index 3c0e296..8eb60f8 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -1930,6 +1930,12 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
>          //        dump_regs(s);
>  #endif
>          switch(opc) {
> +        case INDEX_op_sync_i32:
> +#if TCG_TARGET_REG_BITS == 64
> +        case INDEX_op_sync_i64:
> +#endif
> +            temp_save(s, args[0], s->reserved_regs);
> +            break;
>          case INDEX_op_mov_i32:
>  #if TCG_TARGET_REG_BITS == 64
>          case INDEX_op_mov_i64:
> -- 
> 1.6.2.1
> 
> 
> 
>
Ulrich Hecht - Oct. 16, 2009, 4:37 p.m.
On Friday 16 October 2009, Aurelien Jarno wrote:
> IMHO, the correct way to do it is to use the following code, assuming
> you want to use 64-bit TCG regs to hold 32-bit values (that's
> something that is not really clear in your next patch):
>
> - for register load:
> | static TCGv load_reg(int reg)
> | {
> |    TCGv r = tcg_temp_new_i64();
> |    tcg_gen_ext32u_i64(r, tcgregs[reg]);
> |    return r;
> | }
> |
> | static void store_reg32(int reg, TCGv v)
> | {
> |    tcg_gen_ext32u_i64(v, v); /* may be optional */
> |    tcg_gen_andi_i64(tcgregs[reg], tcgregs[reg],
> | 0xffffffff00000000ULL); tcg_gen_or_i64(tcgregs[reg], tcgregs[reg],
> | v);
> | }

This is _extremely_ detrimental to performance. The point of the sync op 
is that in most cases it's a nop because registers are usually used with 
the same bitness again and again. The sign extension/masking stuff is 
done every time a register is accessed as 32 bits, which is the most 
common case. Compare the translation of the following sequence of 
instructions:

IN: _dl_aux_init
0x0000000080044ff6:  lhi        %r4,0
0x0000000080044ffa:  lhi        %r5,0
0x0000000080044ffe:  lhi        %r0,0

with sync:

OP:
 mov_i32 loc0,global_cc
 movi_i32 tmp1,$0x0
 sync_i64 R4
 mov_i32 r4,tmp1
 movi_i32 tmp1,$0x0
 sync_i64 R5
 mov_i32 r5,tmp1
 movi_i32 tmp1,$0x0
 sync_i64 R0
 mov_i32 r0,tmp1
 mov_i32 global_cc,loc0
 movi_i64 tmp2,$0x80045002
 st_i64 tmp2,env,$0x158
 exit_tb $0x0

OUT: [size=61]
0x6019a030:  mov    0x160(%r14),%ebp
0x6019a037:  mov    %rbp,%rbx
0x6019a03a:  mov    $0x80045002,%r12d
0x6019a040:  mov    %r12,0x158(%r14)
0x6019a047:  mov    %ebp,0xd1a0(%r14)
0x6019a04e:  mov    %ebx,0x160(%r14)
0x6019a055:  xor    %ebp,%ebp
0x6019a057:  mov    %ebp,(%r14)
0x6019a05a:  xor    %ebp,%ebp
0x6019a05c:  mov    %ebp,0x20(%r14)
0x6019a060:  xor    %ebp,%ebp
0x6019a062:  mov    %ebp,0x28(%r14)
0x6019a066:  xor    %eax,%eax
0x6019a068:  jmpq   0x621dc8ce


with sign extension:

OP:
 mov_i32 loc0,global_cc
 movi_i32 tmp1,$0x0
 ext32u_i64 tmp1,tmp1
 movi_i64 tmp2,$0xffffffff00000000
 and_i64 R4,R4,tmp2
 or_i64 R4,R4,tmp1
 movi_i32 tmp1,$0x0
 ext32u_i64 tmp1,tmp1
 movi_i64 tmp2,$0xffffffff00000000
 and_i64 R5,R5,tmp2
 or_i64 R5,R5,tmp1
 movi_i32 tmp1,$0x0
 ext32u_i64 tmp1,tmp1
 movi_i64 tmp2,$0xffffffff00000000
 and_i64 R0,R0,tmp2
 or_i64 R0,R0,tmp1
 mov_i32 global_cc,loc0
 movi_i64 tmp2,$0x80045002
 st_i64 tmp2,env,$0x158
 exit_tb $0x0

OUT: [size=126]
0x6019af10:  mov    0x160(%r14),%ebp
0x6019af17:  xor    %ebx,%ebx
0x6019af19:  mov    %ebx,%ebx
0x6019af1b:  mov    0x20(%r14),%r12
0x6019af1f:  mov    $0xffffffff00000000,%r13
0x6019af29:  and    %r13,%r12
0x6019af2c:  or     %rbx,%r12
0x6019af2f:  xor    %ebx,%ebx
0x6019af31:  mov    %ebx,%ebx
0x6019af33:  mov    0x28(%r14),%r13
0x6019af37:  mov    $0xffffffff00000000,%r15
0x6019af41:  and    %r15,%r13
0x6019af44:  or     %rbx,%r13
0x6019af47:  xor    %ebx,%ebx
0x6019af49:  mov    %ebx,%ebx
0x6019af4b:  mov    (%r14),%r15
0x6019af4e:  mov    $0xffffffff00000000,%r10
0x6019af58:  and    %r10,%r15
0x6019af5b:  or     %rbx,%r15
0x6019af5e:  mov    %rbp,%rbx
0x6019af61:  mov    $0x80045002,%r10d
0x6019af67:  mov    %r10,0x158(%r14)
0x6019af6e:  mov    %ebp,0xd1a0(%r14)
0x6019af75:  mov    %ebx,0x160(%r14)
0x6019af7c:  mov    %r15,(%r14)
0x6019af7f:  mov    %r12,0x20(%r14)
0x6019af83:  mov    %r13,0x28(%r14)
0x6019af87:  xor    %eax,%eax
0x6019af89:  jmpq   0x621dd78e

Its more than twice the size and has ten memory accesses instead of 
seven.

CU
Uli
Aurelien Jarno - Oct. 16, 2009, 5:29 p.m.
On Fri, Oct 16, 2009 at 06:37:31PM +0200, Ulrich Hecht wrote:
> On Friday 16 October 2009, Aurelien Jarno wrote:
> > IMHO, the correct way to do it is to use the following code, assuming
> > you want to use 64-bit TCG regs to hold 32-bit values (that's
> > something that is not really clear in your next patch):
> >
> > - for register load:
> > | static TCGv load_reg(int reg)
> > | {
> > |    TCGv r = tcg_temp_new_i64();
> > |    tcg_gen_ext32u_i64(r, tcgregs[reg]);
> > |    return r;
> > | }
> > |
> > | static void store_reg32(int reg, TCGv v)
> > | {
> > |    tcg_gen_ext32u_i64(v, v); /* may be optional */
> > |    tcg_gen_andi_i64(tcgregs[reg], tcgregs[reg],
> > | 0xffffffff00000000ULL); tcg_gen_or_i64(tcgregs[reg], tcgregs[reg],
> > | v);
> > | }
> 
> This is _extremely_ detrimental to performance. The point of the sync op 
> is that in most cases it's a nop because registers are usually used with 
> the same bitness again and again. The sign extension/masking stuff is 

I don't really understand how it can be simply be a nop, given it calls
temp_save. It means if a register is used twice in a BB, it is fetch
again from memory.

> done every time a register is accessed as 32 bits, which is the most 
> common case. Compare the translation of the following sequence of 
> instructions:
> 
> IN: _dl_aux_init
> 0x0000000080044ff6:  lhi        %r4,0
> 0x0000000080044ffa:  lhi        %r5,0
> 0x0000000080044ffe:  lhi        %r0,0

This example is a bit biased, as registers are only saved, and never
reused. Let's comment on it though.

> with sync:
> 
> OP:
>  mov_i32 loc0,global_cc
>  movi_i32 tmp1,$0x0
>  sync_i64 R4
>  mov_i32 r4,tmp1
>  movi_i32 tmp1,$0x0
>  sync_i64 R5
>  mov_i32 r5,tmp1
>  movi_i32 tmp1,$0x0
>  sync_i64 R0
>  mov_i32 r0,tmp1
>  mov_i32 global_cc,loc0
>  movi_i64 tmp2,$0x80045002
>  st_i64 tmp2,env,$0x158
>  exit_tb $0x0
> 
> OUT: [size=61]
> 0x6019a030:  mov    0x160(%r14),%ebp
> 0x6019a037:  mov    %rbp,%rbx
> 0x6019a03a:  mov    $0x80045002,%r12d
> 0x6019a040:  mov    %r12,0x158(%r14)
> 0x6019a047:  mov    %ebp,0xd1a0(%r14)
> 0x6019a04e:  mov    %ebx,0x160(%r14)
> 0x6019a055:  xor    %ebp,%ebp
> 0x6019a057:  mov    %ebp,(%r14)
> 0x6019a05a:  xor    %ebp,%ebp
> 0x6019a05c:  mov    %ebp,0x20(%r14)
> 0x6019a060:  xor    %ebp,%ebp
> 0x6019a062:  mov    %ebp,0x28(%r14)
> 0x6019a066:  xor    %eax,%eax
> 0x6019a068:  jmpq   0x621dc8ce
> 
> 
> with sign extension:
> 
> OP:
>  mov_i32 loc0,global_cc
>  movi_i32 tmp1,$0x0
>  ext32u_i64 tmp1,tmp1
>  movi_i64 tmp2,$0xffffffff00000000
>  and_i64 R4,R4,tmp2
>  or_i64 R4,R4,tmp1
>  movi_i32 tmp1,$0x0
>  ext32u_i64 tmp1,tmp1
>  movi_i64 tmp2,$0xffffffff00000000
>  and_i64 R5,R5,tmp2
>  or_i64 R5,R5,tmp1
>  movi_i32 tmp1,$0x0
>  ext32u_i64 tmp1,tmp1
>  movi_i64 tmp2,$0xffffffff00000000
>  and_i64 R0,R0,tmp2
>  or_i64 R0,R0,tmp1
>  mov_i32 global_cc,loc0
>  movi_i64 tmp2,$0x80045002
>  st_i64 tmp2,env,$0x158
>  exit_tb $0x0
> 
> OUT: [size=126]
> 0x6019af10:  mov    0x160(%r14),%ebp
> 0x6019af17:  xor    %ebx,%ebx
> 0x6019af19:  mov    %ebx,%ebx
> 0x6019af1b:  mov    0x20(%r14),%r12
> 0x6019af1f:  mov    $0xffffffff00000000,%r13
> 0x6019af29:  and    %r13,%r12
> 0x6019af2c:  or     %rbx,%r12
> 0x6019af2f:  xor    %ebx,%ebx
> 0x6019af31:  mov    %ebx,%ebx
> 0x6019af33:  mov    0x28(%r14),%r13
> 0x6019af37:  mov    $0xffffffff00000000,%r15
> 0x6019af41:  and    %r15,%r13
> 0x6019af44:  or     %rbx,%r13
> 0x6019af47:  xor    %ebx,%ebx
> 0x6019af49:  mov    %ebx,%ebx
> 0x6019af4b:  mov    (%r14),%r15
> 0x6019af4e:  mov    $0xffffffff00000000,%r10
> 0x6019af58:  and    %r10,%r15
> 0x6019af5b:  or     %rbx,%r15
> 0x6019af5e:  mov    %rbp,%rbx
> 0x6019af61:  mov    $0x80045002,%r10d
> 0x6019af67:  mov    %r10,0x158(%r14)
> 0x6019af6e:  mov    %ebp,0xd1a0(%r14)
> 0x6019af75:  mov    %ebx,0x160(%r14)
> 0x6019af7c:  mov    %r15,(%r14)
> 0x6019af7f:  mov    %r12,0x20(%r14)
> 0x6019af83:  mov    %r13,0x28(%r14)
> 0x6019af87:  xor    %eax,%eax
> 0x6019af89:  jmpq   0x621dd78e
> 
> Its more than twice the size and has ten memory accesses instead of 
> seven.
> 

There is clearly a huge impact for saving the globals, that I didn't
expect. I still believe a sync op as implemented in your patch is in
opposite direction to TCG's philosophy, probably does not work with
fixed registers, and I don't understand what is the gain compared to 
the use of tcg_gen_ld/st. Maybe you can post a dump of such a version 
so that we can see the benefit?

I think instead of a sync op, we should think of a way to use only the
low part of a global variable, maybe by adding new ops. That can also 
help to improve the concat_i32_i64 op on 64-bit hosts. Does someone has
any idea?

Aurelien
Edgar Iglesias - Oct. 17, 2009, 8:59 a.m.
On Fri, Oct 16, 2009 at 05:52:21PM +0200, Aurelien Jarno wrote:
> On Fri, Oct 16, 2009 at 02:38:47PM +0200, Ulrich Hecht wrote:
> > sync allows concurrent accesses to locations in memory through different TCG
> > variables. This comes in handy when you are emulating CPU registers that can
> > be used as either 32 or 64 bit, as TCG doesn't know anything about aliases.
> > See the s390x target for an example.
> > 
> > Fixed sync_i64 build failure on 32-bit targets.
> 
> It don't really see the point of such a new op, especially the way it is
> used in the S390 target.

Hi,

I looked at the s390 patches and was also unsure about this sync op.
I'm not convinced it's bad but my first feeling was as Aurelien points
out that the translator shoud take care of it. Not sure though, it would
be nice to hear what other ppl think.

Another thing I noticed was the large amount of helpers. Without looking
at the details my feeling was that you could probably do more at
translation time. That kind of optimization can be done incrementally
with follow-up patches though.

Other than that the s390 series looks OK and should IMO get committed.

Nice work!

Cheers
Ulrich Hecht - Oct. 19, 2009, 5:17 p.m.
On Friday 16 October 2009, Aurelien Jarno wrote:
> This example is a bit biased, as registers are only saved, and never
> reused. Let's comment on it though.

Yeah, well, I searched from the top for the first case where it makes a 
difference. If it's of any help, I can upload a complete dump of both 
versions somewhere.

> and I don't understand what is the gain compared to 
> the use of tcg_gen_ld/st.

There are two sets of TCG values, tcgregs (which would arguably better 
called tcgregs64) and tcgregs32. When doing a 32-bit access, tcgregs(64) 
is synced, which is a nop if tcgregs(64) hasn't been touched. When doing 
a 64-bit access, tcgregs32 is synced, which is a nop if tcgregs32 hasn't 
been touched. In practice, 32-bit accesses followed by 64-bit accesses 
and vice versa are very rare, so in most cases, sync is a nop. 
tcg_gen_ld/st is never a nop. That's the benefit.

CU
Uli
Ulrich Hecht - Oct. 19, 2009, 5:17 p.m.
On Saturday 17 October 2009, Edgar E. Iglesias wrote:
> I looked at the s390 patches and was also unsure about this sync op.
> I'm not convinced it's bad but my first feeling was as Aurelien points
> out that the translator shoud take care of it.

Indeed. I would have expected it to, in fact. But it doesn't. The sync op 
is the simplest and quickest way to get what I want that I could come up 
with. I'd be perfectly happy if TCG could handle aliases on its own, but 
doing a lot of ALU operations on every register access is not an option.

> Another thing I noticed was the large amount of helpers. Without
> looking at the details my feeling was that you could probably do more
> at translation time.

My experience is that helper functions have an undeservedly bad image. A 
pure const helper call is very easy to optimize away for TCG. Random bit 
shifting, comparing and branching isn't.

> Nice work!

Thank you.

CU
Uli

Patch

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index faf2e8b..c1b4710 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -316,6 +316,18 @@  static inline void tcg_gen_br(int label)
     tcg_gen_op1i(INDEX_op_br, label);
 }
 
+static inline void tcg_gen_sync_i32(TCGv_i32 arg)
+{
+    tcg_gen_op1_i32(INDEX_op_sync_i32, arg);
+}
+
+#if TCG_TARGET_REG_BITS == 64
+static inline void tcg_gen_sync_i64(TCGv_i64 arg)
+{
+    tcg_gen_op1_i64(INDEX_op_sync_i64, arg);
+}
+#endif
+
 static inline void tcg_gen_mov_i32(TCGv_i32 ret, TCGv_i32 arg)
 {
     if (!TCGV_EQUAL_I32(ret, arg))
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index b7f3fd7..5dcdeba 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -40,6 +40,7 @@  DEF2(call, 0, 1, 2, TCG_OPF_SIDE_EFFECTS) /* variable number of parameters */
 DEF2(jmp, 0, 1, 0, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS)
 DEF2(br, 0, 0, 1, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS)
 
+DEF2(sync_i32, 0, 1, 0, 0)
 DEF2(mov_i32, 1, 1, 0, 0)
 DEF2(movi_i32, 1, 0, 1, 0)
 /* load/store */
@@ -109,6 +110,7 @@  DEF2(neg_i32, 1, 1, 0, 0)
 #endif
 
 #if TCG_TARGET_REG_BITS == 64
+DEF2(sync_i64, 0, 1, 0, 0)
 DEF2(mov_i64, 1, 1, 0, 0)
 DEF2(movi_i64, 1, 0, 1, 0)
 /* load/store */
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 3c0e296..8eb60f8 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1930,6 +1930,12 @@  static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
         //        dump_regs(s);
 #endif
         switch(opc) {
+        case INDEX_op_sync_i32:
+#if TCG_TARGET_REG_BITS == 64
+        case INDEX_op_sync_i64:
+#endif
+            temp_save(s, args[0], s->reserved_regs);
+            break;
         case INDEX_op_mov_i32:
 #if TCG_TARGET_REG_BITS == 64
         case INDEX_op_mov_i64: