diff mbox

[2/2] tcg/optimize: Remember garbage high bits for 32-bit ops

Message ID 1400871431-12655-3-git-send-email-rth@twiddle.net
State New
Headers show

Commit Message

Richard Henderson May 23, 2014, 6:57 p.m. UTC
For a 64-bit host, the high bits of a register after a 32-bit operation
are undefined.  Adjust the temps mask for all 32-bit ops to reflect that.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/optimize.c | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

Comments

Paolo Bonzini May 23, 2014, 7:46 p.m. UTC | #1
Il 23/05/2014 20:57, Richard Henderson ha scritto:
> For a 64-bit host, the high bits of a register after a 32-bit operation
> are undefined.  Adjust the temps mask for all 32-bit ops to reflect that.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/optimize.c | 33 ++++++++++++++++++++++++++-------
>  1 file changed, 26 insertions(+), 7 deletions(-)
>
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index 83e1387..19e4831 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -166,11 +166,18 @@ static void tcg_opt_gen_mov(TCGContext *s, int op_index, TCGArg *gen_args,
>                              TCGOpcode old_op, TCGArg dst, TCGArg src)
>  {
>      TCGOpcode new_op = op_to_mov(old_op);
> +    tcg_target_ulong mask;
>
>      s->gen_opc_buf[op_index] = new_op;
>
>      reset_temp(dst);
> -    temps[dst].mask = temps[src].mask;
> +    mask = temps[src].mask;
> +    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
> +        /* High bits of the destination are now garbage.  */

Or they are zero on x86_64... perhaps this could be some kind of TCG 
target hook.

> +        mask |= ~0xffffffffull;
> +    }
> +    temps[dst].mask = mask;
> +
>      assert(temps[src].state != TCG_TEMP_CONST);
>
>      if (s->temps[src].type == s->temps[dst].type) {
> @@ -194,13 +201,20 @@ static void tcg_opt_gen_movi(TCGContext *s, int op_index, TCGArg *gen_args,
>                               TCGOpcode old_op, TCGArg dst, TCGArg val)
>  {
>      TCGOpcode new_op = op_to_movi(old_op);
> +    tcg_target_ulong mask;
>
>      s->gen_opc_buf[op_index] = new_op;
>
>      reset_temp(dst);
>      temps[dst].state = TCG_TEMP_CONST;
>      temps[dst].val = val;
> -    temps[dst].mask = val;
> +    mask = val;
> +    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
> +        /* High bits of the destination are now garbage.  */
> +        mask |= ~0xffffffffull;
> +    }
> +    temps[dst].mask = mask;
> +
>      gen_args[0] = dst;
>      gen_args[1] = val;
>  }
> @@ -539,7 +553,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
>      for (op_index = 0; op_index < nb_ops; op_index++) {
>          TCGOpcode op = s->gen_opc_buf[op_index];
>          const TCGOpDef *def = &tcg_op_defs[op];
> -        tcg_target_ulong mask, affected;
> +        tcg_target_ulong mask, partmask, affected;
>          int nb_oargs, nb_iargs, nb_args, i;
>          TCGArg tmp;
>
> @@ -901,13 +915,18 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
>              break;
>          }
>
> -        /* 32-bit ops (non 64-bit ops and non load/store ops) generate 32-bit
> -           results */
> +        /* 32-bit ops (non 64-bit ops and non load/store ops) generate
> +           32-bit results.  For the result is zero test below, we can
> +           ignore high bits, but for further optimizations we need to
> +           record that the high bits contain garbage.  */
> +        partmask = mask;
>          if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_64BIT))) {
> -            mask &= 0xffffffffu;
> +            mask |= ~(tcg_target_ulong)0xffffffffu;
> +            partmask &= 0xffffffffu;
> +            affected &= 0xffffffffu;
>          }
>
> -        if (mask == 0) {
> +        if (partmask == 0) {
>              assert(nb_oargs == 1);
>              tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
>              args += nb_args;
>

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Richard Henderson May 23, 2014, 8:01 p.m. UTC | #2
On 05/23/2014 12:46 PM, Paolo Bonzini wrote:
>> @@ -166,11 +166,18 @@ static void tcg_opt_gen_mov(TCGContext *s, int
>> op_index, TCGArg *gen_args,
>>                              TCGOpcode old_op, TCGArg dst, TCGArg src)
>>  {
>>      TCGOpcode new_op = op_to_mov(old_op);
>> +    tcg_target_ulong mask;
>>
>>      s->gen_opc_buf[op_index] = new_op;
>>
>>      reset_temp(dst);
>> -    temps[dst].mask = temps[src].mask;
>> +    mask = temps[src].mask;
>> +    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
>> +        /* High bits of the destination are now garbage.  */
> 
> Or they are zero on x86_64... perhaps this could be some kind of TCG target hook.

Only if we actually issued a "mov" insn.

If the register allocator decided to adjust its tables instead, we won't issue
an insn, at which point we really do have garbage in the high bits.  I don't
think we should be thinking about the target at all at this point.


r~
Aurelien Jarno May 30, 2014, 11:48 p.m. UTC | #3
On Fri, May 23, 2014 at 11:57:11AM -0700, Richard Henderson wrote:
> For a 64-bit host, the high bits of a register after a 32-bit operation
> are undefined.  Adjust the temps mask for all 32-bit ops to reflect that.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/optimize.c | 33 ++++++++++++++++++++++++++-------
>  1 file changed, 26 insertions(+), 7 deletions(-)
> 
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index 83e1387..19e4831 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -166,11 +166,18 @@ static void tcg_opt_gen_mov(TCGContext *s, int op_index, TCGArg *gen_args,
>                              TCGOpcode old_op, TCGArg dst, TCGArg src)
>  {
>      TCGOpcode new_op = op_to_mov(old_op);
> +    tcg_target_ulong mask;
>  
>      s->gen_opc_buf[op_index] = new_op;
>  
>      reset_temp(dst);
> -    temps[dst].mask = temps[src].mask;
> +    mask = temps[src].mask;
> +    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
> +        /* High bits of the destination are now garbage.  */
> +        mask |= ~0xffffffffull;
> +    }
> +    temps[dst].mask = mask;
> +
>      assert(temps[src].state != TCG_TEMP_CONST);
>  
>      if (s->temps[src].type == s->temps[dst].type) {
> @@ -194,13 +201,20 @@ static void tcg_opt_gen_movi(TCGContext *s, int op_index, TCGArg *gen_args,
>                               TCGOpcode old_op, TCGArg dst, TCGArg val)
>  {
>      TCGOpcode new_op = op_to_movi(old_op);
> +    tcg_target_ulong mask;
>  
>      s->gen_opc_buf[op_index] = new_op;
>  
>      reset_temp(dst);
>      temps[dst].state = TCG_TEMP_CONST;
>      temps[dst].val = val;
> -    temps[dst].mask = val;
> +    mask = val;
> +    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
> +        /* High bits of the destination are now garbage.  */
> +        mask |= ~0xffffffffull;
> +    }
> +    temps[dst].mask = mask;
> +
>      gen_args[0] = dst;
>      gen_args[1] = val;
>  }
> @@ -539,7 +553,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
>      for (op_index = 0; op_index < nb_ops; op_index++) {
>          TCGOpcode op = s->gen_opc_buf[op_index];
>          const TCGOpDef *def = &tcg_op_defs[op];
> -        tcg_target_ulong mask, affected;
> +        tcg_target_ulong mask, partmask, affected;
>          int nb_oargs, nb_iargs, nb_args, i;
>          TCGArg tmp;
>  
> @@ -901,13 +915,18 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
>              break;
>          }
>  
> -        /* 32-bit ops (non 64-bit ops and non load/store ops) generate 32-bit
> -           results */
> +        /* 32-bit ops (non 64-bit ops and non load/store ops) generate
> +           32-bit results.  For the result is zero test below, we can
> +           ignore high bits, but for further optimizations we need to
> +           record that the high bits contain garbage.  */
> +        partmask = mask;
>          if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_64BIT))) {
> -            mask &= 0xffffffffu;
> +            mask |= ~(tcg_target_ulong)0xffffffffu;
> +            partmask &= 0xffffffffu;
> +            affected &= 0xffffffffu;
>          }
>  
> -        if (mask == 0) {
> +        if (partmask == 0) {
>              assert(nb_oargs == 1);
>              tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
>              args += nb_args;


I think the real problem is that, early on, the TCG code decided to
ignore the fact that a 32-bit zero/sign extension is different from a
move from a 32-bit to a 64-bit TCG temp. Your patch is basically
recreating this information by setting a flag when a 32-bit operation
occurs (that is, an operation on 32-bit TCG temps), and clearing it
when a 32-bit to 64-bit conversion occurs.

If we add real extu_i32_i64 and ext_i32_i64 ops for 64-bit hosts,
propagated down to each TCG target, we can use them as a "barrier" in
the propagation. That means for example that extu_i32_i64 will always
have to be executed, as the state of the 32-bit high part would be
unknown, while we know that after this op, the 32-bit high part would
be only zero. Thus we don't lose any possible optimization, while we
keep the information.

This also means we don't risk the copy propagation mixing 32-bit and
64-bit types (at least in this direction). IIRC we got bugs like that
in the past.

That said, fixing the real problem is probably quite intrusive, so in
the meantime this workaround looks fine to me:

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
diff mbox

Patch

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 83e1387..19e4831 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -166,11 +166,18 @@  static void tcg_opt_gen_mov(TCGContext *s, int op_index, TCGArg *gen_args,
                             TCGOpcode old_op, TCGArg dst, TCGArg src)
 {
     TCGOpcode new_op = op_to_mov(old_op);
+    tcg_target_ulong mask;
 
     s->gen_opc_buf[op_index] = new_op;
 
     reset_temp(dst);
-    temps[dst].mask = temps[src].mask;
+    mask = temps[src].mask;
+    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
+        /* High bits of the destination are now garbage.  */
+        mask |= ~0xffffffffull;
+    }
+    temps[dst].mask = mask;
+
     assert(temps[src].state != TCG_TEMP_CONST);
 
     if (s->temps[src].type == s->temps[dst].type) {
@@ -194,13 +201,20 @@  static void tcg_opt_gen_movi(TCGContext *s, int op_index, TCGArg *gen_args,
                              TCGOpcode old_op, TCGArg dst, TCGArg val)
 {
     TCGOpcode new_op = op_to_movi(old_op);
+    tcg_target_ulong mask;
 
     s->gen_opc_buf[op_index] = new_op;
 
     reset_temp(dst);
     temps[dst].state = TCG_TEMP_CONST;
     temps[dst].val = val;
-    temps[dst].mask = val;
+    mask = val;
+    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
+        /* High bits of the destination are now garbage.  */
+        mask |= ~0xffffffffull;
+    }
+    temps[dst].mask = mask;
+
     gen_args[0] = dst;
     gen_args[1] = val;
 }
@@ -539,7 +553,7 @@  static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
     for (op_index = 0; op_index < nb_ops; op_index++) {
         TCGOpcode op = s->gen_opc_buf[op_index];
         const TCGOpDef *def = &tcg_op_defs[op];
-        tcg_target_ulong mask, affected;
+        tcg_target_ulong mask, partmask, affected;
         int nb_oargs, nb_iargs, nb_args, i;
         TCGArg tmp;
 
@@ -901,13 +915,18 @@  static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             break;
         }
 
-        /* 32-bit ops (non 64-bit ops and non load/store ops) generate 32-bit
-           results */
+        /* 32-bit ops (non 64-bit ops and non load/store ops) generate
+           32-bit results.  For the result is zero test below, we can
+           ignore high bits, but for further optimizations we need to
+           record that the high bits contain garbage.  */
+        partmask = mask;
         if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_64BIT))) {
-            mask &= 0xffffffffu;
+            mask |= ~(tcg_target_ulong)0xffffffffu;
+            partmask &= 0xffffffffu;
+            affected &= 0xffffffffu;
         }
 
-        if (mask == 0) {
+        if (partmask == 0) {
             assert(nb_oargs == 1);
             tcg_opt_gen_movi(s, op_index, gen_args, op, args[0], 0);
             args += nb_args;