Patchwork TCG: fix copy propagation

login
register
mail settings
Submitter Blue Swirl
Date Aug. 6, 2011, 2:06 p.m.
Message ID <CAAu8pHvSJvL-ypbieMj=h3=xKpmYz3iuzi9BpwsZcrNiS2Svug@mail.gmail.com>
Download mbox | patch
Permalink /patch/108773/
State New
Headers show

Comments

Blue Swirl - Aug. 6, 2011, 2:06 p.m.
Copy propagation introduced in 22613af4a6d9602001e6d0e7b6d98aa40aa018dc
considered only global registers. However, register temps and stack
allocated locals must be handled differently because register temps
don't survive across brcond.

Fix by propagating only within same class of temps.

Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
---
 tcg/optimize.c |   13 +++++++------
 tcg/tcg.h      |    5 +++++
 2 files changed, 12 insertions(+), 6 deletions(-)

  * code which is not supposed to leak any TCG temporaries, then
Stefan Weil - Aug. 6, 2011, 8:13 p.m.
Am 06.08.2011 16:06, schrieb Blue Swirl:
> Copy propagation introduced in 22613af4a6d9602001e6d0e7b6d98aa40aa018dc
> considered only global registers. However, register temps and stack
> allocated locals must be handled differently because register temps
> don't survive across brcond.
>
> Fix by propagating only within same class of temps.
>
> Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
> ---
> tcg/optimize.c | 13 +++++++------
> tcg/tcg.h | 5 +++++
> 2 files changed, 12 insertions(+), 6 deletions(-)
>
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index a3bfa5e..748ecf9 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -185,12 +185,13 @@ static int op_to_movi(int op)
> }
> }
>
> -static void tcg_opt_gen_mov(TCGArg *gen_args, TCGArg dst, TCGArg src,
> - int nb_temps, int nb_globals)
> +static void tcg_opt_gen_mov(TCGContext *s, TCGArg *gen_args, TCGArg dst,
> + TCGArg src, int nb_temps, int nb_globals)
> {
> reset_temp(dst, nb_temps, nb_globals);
> assert(temps[src].state != TCG_TEMP_COPY);
> - if (src >= nb_globals) {
> + if (src >= nb_globals &&
> + tcg_arg_is_local(s, src) == tcg_arg_is_local(s, dst)) {
> assert(temps[src].state != TCG_TEMP_CONST);
[snip]

Hi Blue,

your patch fixes qemu-system-x86_64 which now seems to work on 32 bit 
hosts, too.

qemu-system-mips64(el) still fail with the same abort. They work when I 
remove the
if block in tcg_opt_gen_mov.

The Debian kernel for qemu-system-mips64 which I used for the test is 
available on
http://qemu.weilnetz.de/mips64/.

I could not reproduce the crash with qemu-system-ppc64 - neither with 
nor without
your patch.

Kind regards,
Stefan
Stefan Weil - Aug. 6, 2011, 8:33 p.m.
Am 06.08.2011 22:13, schrieb Stefan Weil:
> Am 06.08.2011 16:06, schrieb Blue Swirl:
>> Copy propagation introduced in 22613af4a6d9602001e6d0e7b6d98aa40aa018dc
>> considered only global registers. However, register temps and stack
>> allocated locals must be handled differently because register temps
>> don't survive across brcond.
>>
>> Fix by propagating only within same class of temps.
>>
>> Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
>> ---
>> tcg/optimize.c | 13 +++++++------
>> tcg/tcg.h | 5 +++++
>> 2 files changed, 12 insertions(+), 6 deletions(-)
>>
>> diff --git a/tcg/optimize.c b/tcg/optimize.c
>> index a3bfa5e..748ecf9 100644
>> --- a/tcg/optimize.c
>> +++ b/tcg/optimize.c
>> @@ -185,12 +185,13 @@ static int op_to_movi(int op)
>> }
>> }
>>
>> -static void tcg_opt_gen_mov(TCGArg *gen_args, TCGArg dst, TCGArg src,
>> - int nb_temps, int nb_globals)
>> +static void tcg_opt_gen_mov(TCGContext *s, TCGArg *gen_args, TCGArg 
>> dst,
>> + TCGArg src, int nb_temps, int nb_globals)
>> {
>> reset_temp(dst, nb_temps, nb_globals);
>> assert(temps[src].state != TCG_TEMP_COPY);
>> - if (src >= nb_globals) {
>> + if (src >= nb_globals &&
>> + tcg_arg_is_local(s, src) == tcg_arg_is_local(s, dst)) {
>> assert(temps[src].state != TCG_TEMP_CONST);
> [snip]
>
> Hi Blue,
>
> your patch fixes qemu-system-x86_64 which now seems to work on 32 bit 
> hosts, too.
>
> qemu-system-mips64(el) still fail with the same abort. They work when 
> I remove the
> if block in tcg_opt_gen_mov.
>
> The Debian kernel for qemu-system-mips64 which I used for the test is 
> available on
> http://qemu.weilnetz.de/mips64/.
>
> I could not reproduce the crash with qemu-system-ppc64 - neither with 
> nor without
> your patch.
>
> Kind regards,
> Stefan

The patch works with qemu-system-mips64(el) after a small modification:

if (src >= nb_globals &&
     tcg_arg_is_local(s, src) && tcg_arg_is_local(s, dst)) {

Cheers,
Stefan
Blue Swirl - Aug. 6, 2011, 9:07 p.m.
On Sat, Aug 6, 2011 at 8:33 PM, Stefan Weil <weil@mail.berlios.de> wrote:
> Am 06.08.2011 22:13, schrieb Stefan Weil:
>>
>> Am 06.08.2011 16:06, schrieb Blue Swirl:
>>>
>>> Copy propagation introduced in 22613af4a6d9602001e6d0e7b6d98aa40aa018dc
>>> considered only global registers. However, register temps and stack
>>> allocated locals must be handled differently because register temps
>>> don't survive across brcond.
>>>
>>> Fix by propagating only within same class of temps.
>>>
>>> Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
>>> ---
>>> tcg/optimize.c | 13 +++++++------
>>> tcg/tcg.h | 5 +++++
>>> 2 files changed, 12 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/tcg/optimize.c b/tcg/optimize.c
>>> index a3bfa5e..748ecf9 100644
>>> --- a/tcg/optimize.c
>>> +++ b/tcg/optimize.c
>>> @@ -185,12 +185,13 @@ static int op_to_movi(int op)
>>> }
>>> }
>>>
>>> -static void tcg_opt_gen_mov(TCGArg *gen_args, TCGArg dst, TCGArg src,
>>> - int nb_temps, int nb_globals)
>>> +static void tcg_opt_gen_mov(TCGContext *s, TCGArg *gen_args, TCGArg dst,
>>> + TCGArg src, int nb_temps, int nb_globals)
>>> {
>>> reset_temp(dst, nb_temps, nb_globals);
>>> assert(temps[src].state != TCG_TEMP_COPY);
>>> - if (src >= nb_globals) {
>>> + if (src >= nb_globals &&
>>> + tcg_arg_is_local(s, src) == tcg_arg_is_local(s, dst)) {
>>> assert(temps[src].state != TCG_TEMP_CONST);
>>
>> [snip]
>>
>> Hi Blue,
>>
>> your patch fixes qemu-system-x86_64 which now seems to work on 32 bit
>> hosts, too.
>>
>> qemu-system-mips64(el) still fail with the same abort. They work when I
>> remove the
>> if block in tcg_opt_gen_mov.
>>
>> The Debian kernel for qemu-system-mips64 which I used for the test is
>> available on
>> http://qemu.weilnetz.de/mips64/.
>>
>> I could not reproduce the crash with qemu-system-ppc64 - neither with nor
>> without
>> your patch.
>>
>> Kind regards,
>> Stefan
>
> The patch works with qemu-system-mips64(el) after a small modification:
>
> if (src >= nb_globals &&
>    tcg_arg_is_local(s, src) && tcg_arg_is_local(s, dst)) {

This removes the optimization completely for register temps. I think
there is a better solution.

The problem is similar to x86_64:

IN: free_area_init_nodes
0xffffffff806004d8:  sw v0,21664(s5)
0xffffffff806004dc:  lw v1,0(s0)
0xffffffff806004e0:  lui        v0,0x8062
0xffffffff806004e4:  sw v1,21676(v0)
0xffffffff806004e8:  sw v1,4(s8)
0xffffffff806004ec:  lw a0,4(s0)
0xffffffff806004f0:  move       a1,zero
0xffffffff806004f4:  sltu       v0,a0,v1
0xffffffff806004f8:  movz       v1,a0,v0
0xffffffff806004fc:  lui        v0,0x8062
0xffffffff80600500:  addiu      s4,v0,21676
0xffffffff80600504:  lui        v0,0x8062
0xffffffff80600508:  sw v1,4(s4)
0xffffffff8060050c:  addiu      a0,v0,21688
0xffffffff80600510:  li a2,4
0xffffffff80600514:  sw zero,8(s8)
0xffffffff80600518:  jal        0xffffffff80104000
0xffffffff8060051c:  sw zero,8(s4)

OP:
 ---- 0xffffffff806004d8
 movi_i32 tmp0,$0x54a0
 movi_i32 tmp1,$0x0
 add2_i32 tmp0,tmp1,s5_0,s5_1,tmp0,tmp1
 mov_i32 tmp2,v0_0
 mov_i32 tmp3,v0_1
 qemu_st32 tmp2,tmp0,tmp1,$0x0

 ---- 0xffffffff806004dc
 mov_i32 tmp2,s0_0
 mov_i32 tmp3,s0_1
 qemu_ld32 tmp2,tmp2,tmp3,$0x0
 movi_i32 tmp4,$0x1f
 sar_i32 tmp3,tmp2,tmp4
 mov_i32 v1_0,tmp2
 mov_i32 v1_1,tmp3

 ---- 0xffffffff806004e0
 movi_i32 v0_0,$0x80620000
 movi_i32 v0_1,$0xffffffff

 ---- 0xffffffff806004e4
 movi_i32 tmp0,$0x54ac
 movi_i32 tmp1,$0x0
 add2_i32 tmp0,tmp1,v0_0,v0_1,tmp0,tmp1
 mov_i32 tmp2,v1_0
 mov_i32 tmp3,v1_1
 qemu_st32 tmp2,tmp0,tmp1,$0x0

 ---- 0xffffffff806004e8
 movi_i32 tmp2,$0x4
 movi_i32 tmp3,$0x0
 add2_i32 tmp2,tmp3,s8_0,s8_1,tmp2,tmp3
 mov_i32 tmp0,v1_0
 mov_i32 tmp1,v1_1
 qemu_st32 tmp0,tmp2,tmp3,$0x0

 ---- 0xffffffff806004ec
 movi_i32 tmp0,$0x4
 movi_i32 tmp1,$0x0
 add2_i32 tmp0,tmp1,s0_0,s0_1,tmp0,tmp1
 qemu_ld32 tmp0,tmp0,tmp1,$0x0
 movi_i32 tmp4,$0x1f
 sar_i32 tmp1,tmp0,tmp4
 mov_i32 a0_0,tmp0
 mov_i32 a0_1,tmp1

 ---- 0xffffffff806004f0
 movi_i32 a1_0,$0x0
 movi_i32 a1_1,$0x0

 ---- 0xffffffff806004f4
 mov_i32 tmp2,a0_0
 mov_i32 tmp3,a0_1
 mov_i32 tmp0,v1_0
 mov_i32 tmp1,v1_1
 setcond2_i32 v0_0,tmp2,tmp3,tmp0,tmp1,ltu
 movi_i32 v0_1,$0x0

 ---- 0xffffffff806004f8
 movi_i32 tmp0,$0x0
 movi_i32 tmp1,$0x0
 brcond2_i32 v0_0,v0_1,tmp0,tmp1,ne,$0x0
 mov_i32 v1_0,a0_0
 mov_i32 v1_1,a0_1
 set_label $0x0

Here a0 locals are used after brcond.

 ---- 0xffffffff806004fc
 movi_i32 v0_0,$0x80620000
 movi_i32 v0_1,$0xffffffff

 ---- 0xffffffff80600500
 movi_i32 tmp0,$0x54ac
 movi_i32 tmp1,$0x0
 add2_i32 s4_0,s4_1,v0_0,v0_1,tmp0,tmp1
 movi_i32 tmp4,$0x1f
 sar_i32 s4_1,s4_0,tmp4

 ---- 0xffffffff80600504
 movi_i32 v0_0,$0x80620000
 movi_i32 v0_1,$0xffffffff

 ---- 0xffffffff80600508
 movi_i32 tmp0,$0x4
 movi_i32 tmp1,$0x0
 add2_i32 tmp0,tmp1,s4_0,s4_1,tmp0,tmp1
 mov_i32 tmp2,v1_0
 mov_i32 tmp3,v1_1
 qemu_st32 tmp2,tmp0,tmp1,$0x0

 ---- 0xffffffff8060050c
 movi_i32 tmp2,$0x54b8
 movi_i32 tmp3,$0x0
 add2_i32 a0_0,a0_1,v0_0,v0_1,tmp2,tmp3
 movi_i32 tmp4,$0x1f
 sar_i32 a0_1,a0_0,tmp4

 ---- 0xffffffff80600510
 movi_i32 a2_0,$0x4
 movi_i32 a2_1,$0x0

 ---- 0xffffffff80600514
 movi_i32 tmp2,$0x8
 movi_i32 tmp3,$0x0
 add2_i32 tmp2,tmp3,s8_0,s8_1,tmp2,tmp3
 movi_i32 tmp0,$0x0
 movi_i32 tmp1,$0x0
 qemu_st32 tmp0,tmp2,tmp3,$0x0

 ---- 0xffffffff80600518
 movi_i32 ra_0,$0x80600520
 movi_i32 ra_1,$0xffffffff

 ---- 0xffffffff8060051c
 movi_i32 tmp2,$0x8
 movi_i32 tmp3,$0x0
 add2_i32 tmp2,tmp3,s4_0,s4_1,tmp2,tmp3
 movi_i32 tmp0,$0x0
 movi_i32 tmp1,$0x0
 movi_i32 hflags,$0x10898
 movi_i32 btarget_0,$0x80104000
 movi_i32 btarget_1,$0xffffffff
 qemu_st32 tmp0,tmp2,tmp3,$0x0
 movi_i32 hflags,$0x98
 movi_i32 PC_0,$0x80104000
 movi_i32 PC_1,$0xffffffff
 exit_tb $0x0

OP after liveness analysis:
 ---- 0xffffffff806004d8
 movi_i32 tmp0,$0x54a0
 movi_i32 tmp1,$0x0
 add2_i32 tmp0,tmp1,s5_0,s5_1,tmp0,tmp1
 mov_i32 tmp2,v0_0
 nopn $0x2,$0x2
 qemu_st32 tmp2,tmp0,tmp1,$0x0

 ---- 0xffffffff806004dc
 mov_i32 tmp2,s0_0
 mov_i32 tmp3,s0_1
 qemu_ld32 tmp2,tmp2,tmp3,$0x0
 movi_i32 tmp4,$0x1f
 sar_i32 tmp3,tmp2,tmp4
 mov_i32 v1_0,tmp2
 mov_i32 v1_1,tmp3

 ---- 0xffffffff806004e0
 movi_i32 v0_0,$0x80620000
 movi_i32 v0_1,$0xffffffff

 ---- 0xffffffff806004e4
 movi_i32 tmp0,$0x54ac
 movi_i32 tmp1,$0x0
 add2_i32 tmp0,tmp1,v0_0,v0_1,tmp0,tmp1
 nop
 nop
 qemu_st32 tmp2,tmp0,tmp1,$0x0

 ---- 0xffffffff806004e8
 movi_i32 tmp2,$0x4
 movi_i32 tmp3,$0x0
 add2_i32 tmp2,tmp3,s8_0,s8_1,tmp2,tmp3
 mov_i32 tmp0,v1_0
 nopn $0x2,$0x2
 qemu_st32 tmp0,tmp2,tmp3,$0x0

 ---- 0xffffffff806004ec
 movi_i32 tmp0,$0x4
 movi_i32 tmp1,$0x0
 add2_i32 tmp0,tmp1,s0_0,s0_1,tmp0,tmp1
 qemu_ld32 tmp0,tmp0,tmp1,$0x0
 movi_i32 tmp4,$0x1f
 sar_i32 tmp1,tmp0,tmp4
 mov_i32 a0_0,tmp0
 mov_i32 a0_1,tmp1

 ---- 0xffffffff806004f0
 movi_i32 a1_0,$0x0
 movi_i32 a1_1,$0x0

 ---- 0xffffffff806004f4
 mov_i32 tmp2,tmp0
 mov_i32 tmp3,tmp1
 mov_i32 tmp0,v1_0
 mov_i32 tmp1,v1_1
 setcond2_i32 v0_0,tmp2,tmp3,tmp0,tmp1,ltu
 movi_i32 v0_1,$0x0

 ---- 0xffffffff806004f8
 movi_i32 tmp0,$0x0
 movi_i32 tmp1,$0x0
 brcond2_i32 v0_0,v0_1,tmp0,tmp1,ne,$0x0
 mov_i32 v1_0,tmp2
 mov_i32 v1_1,tmp3
 set_label $0x0

But here tmp2 and tmp3 which are dead after brcond. They are set to a0
values in 0xffffffff806004ec.

Maybe my patch doesn't work because tmp2&3 are register temps while
a0_0&1 are globals. So this should work:
if (src >= nb_globals && dst >= nb_globals &&
   tcg_arg_is_local(s, src) == tcg_arg_is_local(s, dst)) {

 ---- 0xffffffff806004fc
 movi_i32 v0_0,$0x80620000
 movi_i32 v0_1,$0xffffffff

 ---- 0xffffffff80600500
 movi_i32 tmp0,$0x54ac
 movi_i32 tmp1,$0x0
 add2_i32 s4_0,s4_1,v0_0,v0_1,tmp0,tmp1
 movi_i32 tmp4,$0x1f
 sar_i32 s4_1,s4_0,tmp4

 ---- 0xffffffff80600504
 movi_i32 v0_0,$0x80620000
 movi_i32 v0_1,$0xffffffff

 ---- 0xffffffff80600508
 movi_i32 tmp0,$0x4
 movi_i32 tmp1,$0x0
 add2_i32 tmp0,tmp1,s4_0,s4_1,tmp0,tmp1
 mov_i32 tmp2,v1_0
 nopn $0x2,$0x2
 qemu_st32 tmp2,tmp0,tmp1,$0x0

 ---- 0xffffffff8060050c
 movi_i32 tmp2,$0x54b8
 movi_i32 tmp3,$0x0
 add2_i32 a0_0,a0_1,v0_0,v0_1,tmp2,tmp3
 movi_i32 tmp4,$0x1f
 sar_i32 a0_1,a0_0,tmp4

 ---- 0xffffffff80600510
 movi_i32 a2_0,$0x4
 movi_i32 a2_1,$0x0

 ---- 0xffffffff80600514
 movi_i32 tmp2,$0x8
 movi_i32 tmp3,$0x0
 add2_i32 tmp2,tmp3,s8_0,s8_1,tmp2,tmp3
 movi_i32 tmp0,$0x0
 nopn $0x2,$0x2
 qemu_st32 tmp0,tmp2,tmp3,$0x0

 ---- 0xffffffff80600518
 movi_i32 ra_0,$0x80600520
 movi_i32 ra_1,$0xffffffff

 ---- 0xffffffff8060051c
 movi_i32 tmp2,$0x8
 movi_i32 tmp3,$0x0
 add2_i32 tmp2,tmp3,s4_0,s4_1,tmp2,tmp3
 movi_i32 tmp0,$0x0
 nopn $0x2,$0x2
 movi_i32 hflags,$0x10898
 movi_i32 btarget_0,$0x80104000
 movi_i32 btarget_1,$0xffffffff
 qemu_st32 tmp0,tmp2,tmp3,$0x0
 movi_i32 hflags,$0x98
 movi_i32 PC_0,$0x80104000
 movi_i32 PC_1,$0xffffffff
 exit_tb $0x0
 end

Patch

From f8f16733f469eebae8f6132abc70e7357e41cf7f Mon Sep 17 00:00:00 2001
Message-Id: <f8f16733f469eebae8f6132abc70e7357e41cf7f.1312639467.git.blauwirbel@gmail.com>
From: Blue Swirl <blauwirbel@gmail.com>
Date: Sat, 6 Aug 2011 13:58:47 +0000
Subject: [PATCH] TCG: fix copy propagation

Copy propagation introduced in 22613af4a6d9602001e6d0e7b6d98aa40aa018dc
considered only global registers. However, register temps and stack
allocated locals must be handled differently because register temps
don't survive across brcond.

Fix by propagating only within same class of temps.

Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
---
 tcg/optimize.c |   13 +++++++------
 tcg/tcg.h      |    5 +++++
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index a3bfa5e..748ecf9 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -185,12 +185,13 @@  static int op_to_movi(int op)
     }
 }
 
-static void tcg_opt_gen_mov(TCGArg *gen_args, TCGArg dst, TCGArg src,
-                            int nb_temps, int nb_globals)
+static void tcg_opt_gen_mov(TCGContext *s, TCGArg *gen_args, TCGArg dst,
+                            TCGArg src, int nb_temps, int nb_globals)
 {
         reset_temp(dst, nb_temps, nb_globals);
         assert(temps[src].state != TCG_TEMP_COPY);
-        if (src >= nb_globals) {
+        if (src >= nb_globals &&
+            tcg_arg_is_local(s, src) == tcg_arg_is_local(s, dst)) {
             assert(temps[src].state != TCG_TEMP_CONST);
             if (temps[src].state != TCG_TEMP_HAS_COPY) {
                 temps[src].state = TCG_TEMP_HAS_COPY;
@@ -474,7 +475,7 @@  static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                     gen_opc_buf[op_index] = INDEX_op_nop;
                 } else {
                     gen_opc_buf[op_index] = op_to_mov(op);
-                    tcg_opt_gen_mov(gen_args, args[0], args[1],
+                    tcg_opt_gen_mov(s, gen_args, args[0], args[1],
                                     nb_temps, nb_globals);
                     gen_args += 2;
                     args += 3;
@@ -500,7 +501,7 @@  static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                     gen_opc_buf[op_index] = INDEX_op_nop;
                 } else {
                     gen_opc_buf[op_index] = op_to_mov(op);
-                    tcg_opt_gen_mov(gen_args, args[0], args[1], nb_temps,
+                    tcg_opt_gen_mov(s, gen_args, args[0], args[1], nb_temps,
                                     nb_globals);
                     gen_args += 2;
                     args += 3;
@@ -523,7 +524,7 @@  static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                 break;
             }
             if (temps[args[1]].state != TCG_TEMP_CONST) {
-                tcg_opt_gen_mov(gen_args, args[0], args[1],
+                tcg_opt_gen_mov(s, gen_args, args[0], args[1],
                                 nb_temps, nb_globals);
                 gen_args += 2;
                 args += 2;
diff --git a/tcg/tcg.h b/tcg/tcg.h
index e76f9af..e2a7095 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -410,6 +410,11 @@  static inline TCGv_i64 tcg_temp_local_new_i64(void)
 void tcg_temp_free_i64(TCGv_i64 arg);
 char *tcg_get_arg_str_i64(TCGContext *s, char *buf, int buf_size, TCGv_i64 arg);
 
+static inline bool tcg_arg_is_local(TCGContext *s, TCGArg arg)
+{
+    return s->temps[arg].temp_local;
+}
+
 #if defined(CONFIG_DEBUG_TCG)
 /* If you call tcg_clear_temp_count() at the start of a section of
  * code which is not supposed to leak any TCG temporaries, then
-- 
1.7.2.5