diff mbox

[v3,3/8] target-sh4: optimize addc using add2

Message ID 1432510638-21021-4-git-send-email-aurelien@aurel32.net
State New
Headers show

Commit Message

Aurelien Jarno May 24, 2015, 11:37 p.m. UTC
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-sh4/translate.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

Comments

Richard Henderson June 4, 2015, 5:03 a.m. UTC | #1
On 05/24/2015 04:37 PM, Aurelien Jarno wrote:
> -            TCGv t0, t1;
> -            t0 = tcg_temp_new();
> +            TCGv t0, t1, t2;
> +            t0 = tcg_const_tl(0);
>               t1 = tcg_temp_new();
> -            tcg_gen_add_i32(t0, REG(B7_4), REG(B11_8));
> -            tcg_gen_add_i32(t1, cpu_sr_t, t0);
> -            tcg_gen_setcond_i32(TCG_COND_GTU, cpu_sr_t, REG(B11_8), t0);
> -            tcg_gen_setcond_i32(TCG_COND_GTU, t0, t0, t1);
> -            tcg_gen_or_i32(cpu_sr_t, cpu_sr_t, t0);
> +            t2 = tcg_temp_new();
> +            tcg_gen_add2_i32(t1, t2, REG(B11_8), t0, REG(B7_4), t0);
> +            tcg_gen_add2_i32(REG(B11_8), cpu_sr_t, t1, t2, cpu_sr_t, t0);

Swap these two adds and you don't need t2.  You can consume sr_t immediately 
and start producing it in the same go.


r~
Paolo Bonzini June 4, 2015, 10:54 a.m. UTC | #2
On 04/06/2015 07:03, Richard Henderson wrote:
>> +            tcg_gen_add2_i32(t1, t2, REG(B11_8), t0, REG(B7_4), t0);
>> +            tcg_gen_add2_i32(REG(B11_8), cpu_sr_t, t1, t2, cpu_sr_t,
>> t0);
> 
> Swap these two adds and you don't need t2.  You can consume sr_t
> immediately and start producing it in the same go.

Could TCG do some kind of intra-basic-block live range splitting?  In
this case, the new sr_t could be allocated to a different register than
the old one, saving one instruction on 2-address targets.

The pseudocode below uses "dest, src" operand order:

   // add2(t1, cpu_sr_t, cpu_sr_t, t0, REG(B7_4), t0)
   add sr_t_in, B7_4    // instead of mov t1, sr_t; add t1, B7_4
   mov sr_t_out, 0
   adc sr_t_out, 0      // cout(B7_4 + sr_t_in)

   // add2(REG(B11_8), cpu_sr_t, t1, cpu_sr_t, REG(B11_8), t0)
   add B11_8, sr_t_in   // B11_8 + B7_4 + sr_t_in
   adc sr_t_out, 0      // cout(B11_8 + B7_4 + sr_t_in)

Paolo
Aurelien Jarno June 4, 2015, 4:08 p.m. UTC | #3
On 2015-06-04 12:54, Paolo Bonzini wrote:
> 
> 
> On 04/06/2015 07:03, Richard Henderson wrote:
> >> +            tcg_gen_add2_i32(t1, t2, REG(B11_8), t0, REG(B7_4), t0);
> >> +            tcg_gen_add2_i32(REG(B11_8), cpu_sr_t, t1, t2, cpu_sr_t,
> >> t0);
> > 
> > Swap these two adds and you don't need t2.  You can consume sr_t
> > immediately and start producing it in the same go.
> 
> Could TCG do some kind of intra-basic-block live range splitting?  In
> this case, the new sr_t could be allocated to a different register than
> the old one, saving one instruction on 2-address targets.

TCG doesn't assign a fixed register to a temp, so it's kind of difficult to
know, but let's say it more or less does that (see below). On the other
hand, it is really bad at handling the constant in that case.

> The pseudocode below uses "dest, src" operand order:
> 
>    // add2(t1, cpu_sr_t, cpu_sr_t, t0, REG(B7_4), t0)
>    add sr_t_in, B7_4    // instead of mov t1, sr_t; add t1, B7_4
>    mov sr_t_out, 0
>    adc sr_t_out, 0      // cout(B7_4 + sr_t_in)

The registers are allocated from left to right, starting with the
inputs.

- cpu_sr_t is already in a register, or is in memory and gets loaded to a
  register
- t0 is a constant, and the add2 op on x86_64 does not accept a constant
  there, so it is loaded to a register. However, it is aliased to the
  output and is not dead, as it is used again in the second add2
  instruction. It is therefore copied into another register.
- REG(B7_4) is already in a register, or is in memory and gets loaded to a
  register
- t0 appears again; it has already been loaded to a register and is
  therefore no longer a constant.

We therefore end up with (Intel operand order, i.e. destination first):

     xor %ebx, %ebx       // this is t0
     mov %r12d, %ebx      // a copy of t0
     add %r13d, %ebp      // %r13d contains B7_4 and %ebp contains sr_t
     adc %r12d, %ebx      // %r12d is the new sr_t  

>    // add2(REG(B11_8), cpu_sr_t, t1, cpu_sr_t, REG(B11_8), t0)
>    add B11_8, sr_t_in   // B11_8 + B7_4 + sr_t_in
>    adc sr_t_out, 0      // cout(B11_8 + B7_4 + sr_t_in)

     add %ebp, %r13d      // %ebp is now B11_8
     adc %ebx, %r12d      // %ebx is now cpu_sr_t
diff mbox

Patch

diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index f9bc24c..a7a8f39 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -642,17 +642,15 @@  static void _decode_opc(DisasContext * ctx)
 	return;
     case 0x300e:		/* addc Rm,Rn */
         {
-            TCGv t0, t1;
-            t0 = tcg_temp_new();
+            TCGv t0, t1, t2;
+            t0 = tcg_const_tl(0);
             t1 = tcg_temp_new();
-            tcg_gen_add_i32(t0, REG(B7_4), REG(B11_8));
-            tcg_gen_add_i32(t1, cpu_sr_t, t0);
-            tcg_gen_setcond_i32(TCG_COND_GTU, cpu_sr_t, REG(B11_8), t0);
-            tcg_gen_setcond_i32(TCG_COND_GTU, t0, t0, t1);
-            tcg_gen_or_i32(cpu_sr_t, cpu_sr_t, t0);
+            t2 = tcg_temp_new();
+            tcg_gen_add2_i32(t1, t2, REG(B11_8), t0, REG(B7_4), t0);
+            tcg_gen_add2_i32(REG(B11_8), cpu_sr_t, t1, t2, cpu_sr_t, t0);
             tcg_temp_free(t0);
-            tcg_gen_mov_i32(REG(B11_8), t1);
             tcg_temp_free(t1);
+            tcg_temp_free(t2);
         }
 	return;
     case 0x300f:		/* addv Rm,Rn */