Patchwork [2/3] target-sparc: Free instruction temporaries.

login
register
mail settings
Submitter Richard Henderson
Date April 16, 2010, 2:50 p.m.
Message ID <1271429444-900-2-git-send-email-rth@twiddle.net>
Download mbox | patch
Permalink /patch/50332/
State New
Headers show

Comments

Richard Henderson - April 16, 2010, 2:50 p.m.
Rather than creating new temporaries for constants, use the
ones created in disas_sparc_insn.  Remember the temps created
there so that they can be freed at the end of the function.

Profile data collected by TCG while booting sparc-test kernel:

-avg temps/TB    70.61 max=421
+avg temps/TB    62.75 max=66

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 target-sparc/translate.c |   52 +++++++++++++++++++++++----------------------
 1 files changed, 27 insertions(+), 25 deletions(-)
Blue Swirl - April 17, 2010, 4:41 p.m.
On 4/16/10, Richard Henderson <rth@twiddle.net> wrote:
> Rather than creating new temporaries for constants, use the
>  ones created in disas_sparc_insn.  Remember the temps created
>  there so that they can be freed at the end of the function.
>
>  Profile data collected by TCG while booting sparc-test kernel:
>
>  -avg temps/TB    70.61 max=421
>  +avg temps/TB    62.75 max=66
>
>  Signed-off-by: Richard Henderson <rth@twiddle.net>

Thanks, applied whole series.

About this patch: it's good that we now free the constants, but
constant handling is still not optimal and I think this series
actually may add extra 'movi' ops in the worst case. It would be nice
if we detected if constants are in play and call immediate versions
(addi, subi etc) automatically. This may need bigger refactoring,
though.

>  ---
>   target-sparc/translate.c |   52 +++++++++++++++++++++++----------------------
>   1 files changed, 27 insertions(+), 25 deletions(-)
>
>  diff --git a/target-sparc/translate.c b/target-sparc/translate.c
>  index 2c07385..2c833ab 100644
>  --- a/target-sparc/translate.c
>  +++ b/target-sparc/translate.c
>  @@ -49,7 +49,7 @@ static TCGv cpu_y;
>   #ifndef CONFIG_USER_ONLY
>   static TCGv cpu_tbr;
>   #endif
>  -static TCGv cpu_cond, cpu_src1, cpu_src2, cpu_dst, cpu_addr, cpu_val;
>  +static TCGv cpu_cond, cpu_dst, cpu_addr, cpu_val;
>   #ifdef TARGET_SPARC64
>   static TCGv_i32 cpu_xcc, cpu_asi, cpu_fprs;
>   static TCGv cpu_gsr;
>  @@ -1631,12 +1631,13 @@ static inline TCGv get_src1(unsigned int insn, TCGv def)
>      unsigned int rs1;
>
>      rs1 = GET_FIELD(insn, 13, 17);
>  -    if (rs1 == 0)
>  -        r_rs1 = tcg_const_tl(0); // XXX how to free?
>  -    else if (rs1 < 8)
>  +    if (rs1 == 0) {
>  +        tcg_gen_movi_tl(def, 0);
>  +    } else if (rs1 < 8) {
>          r_rs1 = cpu_gregs[rs1];
>  -    else
>  +    } else {
>          tcg_gen_ld_tl(def, cpu_regwptr, (rs1 - 8) * sizeof(target_ulong));
>  +    }
>      return r_rs1;
>   }
>
>  @@ -1645,20 +1646,17 @@ static inline TCGv get_src2(unsigned int insn, TCGv def)
>      TCGv r_rs2 = def;
>
>      if (IS_IMM) { /* immediate */
>  -        target_long simm;
>  -
>  -        simm = GET_FIELDs(insn, 19, 31);
>  -        r_rs2 = tcg_const_tl(simm); // XXX how to free?
>  +        target_long simm = GET_FIELDs(insn, 19, 31);
>  +        tcg_gen_movi_tl(def, simm);
>      } else { /* register */
>  -        unsigned int rs2;
>  -
>  -        rs2 = GET_FIELD(insn, 27, 31);
>  -        if (rs2 == 0)
>  -            r_rs2 = tcg_const_tl(0); // XXX how to free?
>  -        else if (rs2 < 8)
>  +        unsigned int rs2 = GET_FIELD(insn, 27, 31);
>  +        if (rs2 == 0) {
>  +            tcg_gen_movi_tl(def, 0);
>  +        } else if (rs2 < 8) {
>              r_rs2 = cpu_gregs[rs2];
>  -        else
>  +        } else {
>              tcg_gen_ld_tl(def, cpu_regwptr, (rs2 - 8) * sizeof(target_ulong));
>  +        }
>      }
>      return r_rs2;
>   }
>  @@ -1701,6 +1699,7 @@ static inline void gen_load_trap_state_at_tl(TCGv_ptr r_tsptr, TCGv_ptr cpu_env)
>   static void disas_sparc_insn(DisasContext * dc)
>   {
>      unsigned int insn, opc, rs1, rs2, rd;
>  +    TCGv cpu_src1, cpu_src2, cpu_tmp1, cpu_tmp2;
>      target_long simm;
>
>      if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)))
>  @@ -1710,8 +1709,8 @@ static void disas_sparc_insn(DisasContext * dc)
>
>      rd = GET_FIELD(insn, 2, 6);
>
>  -    cpu_src1 = tcg_temp_new(); // const
>  -    cpu_src2 = tcg_temp_new(); // const
>  +    cpu_tmp1 = cpu_src1 = tcg_temp_new();
>  +    cpu_tmp2 = cpu_src2 = tcg_temp_new();
>
>      switch (opc) {
>      case 0:                     /* branches/sethi */
>  @@ -4599,7 +4598,7 @@ static void disas_sparc_insn(DisasContext * dc)
>          dc->npc = dc->npc + 4;
>      }
>   jmp_insn:
>  -    return;
>  +    goto egress;
>   illegal_insn:
>      {
>          TCGv_i32 r_const;
>  @@ -4610,7 +4609,7 @@ static void disas_sparc_insn(DisasContext * dc)
>          tcg_temp_free_i32(r_const);
>          dc->is_br = 1;
>      }
>  -    return;
>  +    goto egress;
>   unimp_flush:
>      {
>          TCGv_i32 r_const;
>  @@ -4621,7 +4620,7 @@ static void disas_sparc_insn(DisasContext * dc)
>          tcg_temp_free_i32(r_const);
>          dc->is_br = 1;
>      }
>  -    return;
>  +    goto egress;
>   #if !defined(CONFIG_USER_ONLY)
>   priv_insn:
>      {
>  @@ -4633,19 +4632,19 @@ static void disas_sparc_insn(DisasContext * dc)
>          tcg_temp_free_i32(r_const);
>          dc->is_br = 1;
>      }
>  -    return;
>  +    goto egress;
>   #endif
>   nfpu_insn:
>      save_state(dc, cpu_cond);
>      gen_op_fpexception_im(FSR_FTT_UNIMPFPOP);
>      dc->is_br = 1;
>  -    return;
>  +    goto egress;
>   #if !defined(CONFIG_USER_ONLY) && !defined(TARGET_SPARC64)
>   nfq_insn:
>      save_state(dc, cpu_cond);
>      gen_op_fpexception_im(FSR_FTT_SEQ_ERROR);
>      dc->is_br = 1;
>  -    return;
>  +    goto egress;
>   #endif
>   #ifndef TARGET_SPARC64
>   ncp_insn:
>  @@ -4658,8 +4657,11 @@ static void disas_sparc_insn(DisasContext * dc)
>          tcg_temp_free(r_const);
>          dc->is_br = 1;
>      }
>  -    return;
>  +    goto egress;
>   #endif
>  + egress:
>  +    tcg_temp_free(cpu_tmp1);
>  +    tcg_temp_free(cpu_tmp2);
>   }
>
>   static inline void gen_intermediate_code_internal(TranslationBlock * tb,
>
> --
>  1.6.6.1
>
>
Richard Henderson - April 17, 2010, 5:49 p.m.
On 04/17/2010 11:41 AM, Blue Swirl wrote:
> About this patch: it's good that we now free the constants, but
> constant handling is still not optimal and I think this series
> actually may add extra 'movi' ops in the worst case. It would be nice
> if we detected if constants are in play and call immediate versions
> (addi, subi etc) automatically. This may need bigger refactoring,
> though.

No, that won't help, since the first thing that addi, subi, etc
do is to load the constant into a temporary.

What would *really* help though, is something along the lines of
Aurelien's constant propagation patch, followed by some mechanism
to refactor constants in the backend.

Aurelien's patch does a good job of building the full constant 
that the RISC instruction stream needed to use to generate the
full 32-bit or 64-bit constant.  If the host is x86, that's just
about all we need.  However, if the host is a RISC, we'll 
generally need to decompose the constant again.

I've got the outline of an idea by which TCG can remember which
constants are actually loaded into registers.  And it should be
designed so that the host backend can call into it to load other
constants.  In this way when we have a pair of constants like

  0xfff00011
  0xfff00022

the sparc backend can (if things go well with register allocation)
load the %hi(0xfff00000) just once, and form the full constants
with addition from there.


r~
Richard Henderson - April 17, 2010, 6 p.m.
On 04/17/2010 12:49 PM, Richard Henderson wrote:
> On 04/17/2010 11:41 AM, Blue Swirl wrote:
>> About this patch: it's good that we now free the constants, but
>> constant handling is still not optimal and I think this series
>> actually may add extra 'movi' ops in the worst case. It would be nice
>> if we detected if constants are in play and call immediate versions
>> (addi, subi etc) automatically. This may need bigger refactoring,
>> though.
> 
> No, that won't help, since the first thing that addi, subi, etc
> do is to load the constant into a temporary.
> 
> What would *really* help though, is something along the lines of
> Aurelien's constant propagation patch, followed by some mechanism
> to refactor constants in the backend.

... Actually, I forgot to mention that the biggest thing that would
help the Sparc target would be to eliminate the explicit loads/stores
of the windowed registers, such that the generic TCG propagation and
dead code elimination passes can do their job properly.

I've been meaning to try changing the windowing code on the sparc to
memcpy the registers into and out of fixed slots in the CPUState and
see what kind of effect that has on overall performance.  I have a 
feeling that it will be an improvement, since it should avoid some
of the myriad of redundant loads and stores in the generated code.


r~
Blue Swirl - April 17, 2010, 6:41 p.m.
On 4/17/10, Richard Henderson <rth@twiddle.net> wrote:
> On 04/17/2010 11:41 AM, Blue Swirl wrote:
>  > About this patch: it's good that we now free the constants, but
>  > constant handling is still not optimal and I think this series
>  > actually may add extra 'movi' ops in the worst case. It would be nice
>  > if we detected if constants are in play and call immediate versions
>  > (addi, subi etc) automatically. This may need bigger refactoring,
>  > though.
>
>
> No, that won't help, since the first thing that addi, subi, etc
>  do is to load the constant into a temporary.

Yes, but we would still gain the small optimizations for add by 0, and
with 0xffffffff etc. in tcg-op.h. Sparc QEMU target generates a lot of
those because of poor constant formation choices made by the guest
compilers.

>  What would *really* help though, is something along the lines of
>  Aurelien's constant propagation patch, followed by some mechanism
>  to refactor constants in the backend.
>
>  Aurelien's patch does a good job of building the full constant
>  that the RISC instruction stream needed to use to generate the
>  full 32-bit or 64-bit constant.  If the host is x86, that's just
>  about all we need.  However, if the host is a RISC, we'll
>  generally need to decompose the constant again.
>
>  I've got the outline of an idea by which TCG can remember which
>  constants are actually loaded into registers.  And it should be
>  designed so that the host backend can call into it to load other
>  constants.  In this way when we have a pair of constants like
>
>   0xfff00011
>   0xfff00022
>
>  the sparc backend can (if things go well with register allocation)
>  load the %hi(0xfff00000) just once, and form the full constants
>  with addition from there.

That should be interesting.

By the way, do you think a constant pool approach (put constants at the
end of TB) would be useful, especially for 64 bit constants?
Blue Swirl - April 17, 2010, 6:50 p.m.
On 4/17/10, Richard Henderson <rth@twiddle.net> wrote:
> On 04/17/2010 12:49 PM, Richard Henderson wrote:
>  > On 04/17/2010 11:41 AM, Blue Swirl wrote:
>  >> About this patch: it's good that we now free the constants, but
>  >> constant handling is still not optimal and I think this series
>  >> actually may add extra 'movi' ops in the worst case. It would be nice
>  >> if we detected if constants are in play and call immediate versions
>  >> (addi, subi etc) automatically. This may need bigger refactoring,
>  >> though.
>  >
>  > No, that won't help, since the first thing that addi, subi, etc
>  > do is to load the constant into a temporary.
>  >
>  > What would *really* help though, is something along the lines of
>  > Aurelien's constant propagation patch, followed by some mechanism
>  > to refactor constants in the backend.
>
>
> ... Actually, I forgot to mention that the biggest thing that would
>  help the Sparc target would be to eliminate the explicit loads/stores
>  of the windowed registers, such that the generic TCG propagation and
>  dead code elimination passes can do their job properly.

I had postponed that until AREG1/2 are freed, now we could take one
host register for regwptr.

>  I've been meaning to try changing the windowing code on the sparc to
>  memcpy the registers into and out of fixed slots in the CPUState and
>  see what kind of effect that has on overall performance.  I have a
>  feeling that it will be an improvement, since it should avoid some
>  of the myriad of redundant loads and stores in the generated code.

Maybe. Performance figures for that would be very interesting.

An alternative approach could be that, if we could rely on all hosts
(especially x86) having plenty of registers, we could even use
different regwptrs for %o, %l and %i sets.

Then there are host page mapping tricks (map the same page at two
locations to simulate wrapping) but that may be too tricky.
Richard Henderson - April 18, 2010, 3:02 p.m.
On 04/17/2010 01:41 PM, Blue Swirl wrote:
> Yes, but we would still gain the small optimizations for add by 0, and
> with 0xffffffff etc. in tcg-op.h. Sparc QEMU target generates a lot of
> those because of poor constant formation choices made by the guest
> compilers.

Another thing that gets fixed by Aurelien's constant prop patch.
Don't think that Sparc is alone in generating x+0 in a way that
gets past the tcg-op.h checks.

> By the way, do you think a constant pool approach (put constants at the
> end of TB) would be useful, especially for 64 bit constants?

Probably.

The support for that could probably be leveraged to move the TLB
miss code path out of line as well.


r~

Patch

diff --git a/target-sparc/translate.c b/target-sparc/translate.c
index 2c07385..2c833ab 100644
--- a/target-sparc/translate.c
+++ b/target-sparc/translate.c
@@ -49,7 +49,7 @@  static TCGv cpu_y;
 #ifndef CONFIG_USER_ONLY
 static TCGv cpu_tbr;
 #endif
-static TCGv cpu_cond, cpu_src1, cpu_src2, cpu_dst, cpu_addr, cpu_val;
+static TCGv cpu_cond, cpu_dst, cpu_addr, cpu_val;
 #ifdef TARGET_SPARC64
 static TCGv_i32 cpu_xcc, cpu_asi, cpu_fprs;
 static TCGv cpu_gsr;
@@ -1631,12 +1631,13 @@  static inline TCGv get_src1(unsigned int insn, TCGv def)
     unsigned int rs1;
 
     rs1 = GET_FIELD(insn, 13, 17);
-    if (rs1 == 0)
-        r_rs1 = tcg_const_tl(0); // XXX how to free?
-    else if (rs1 < 8)
+    if (rs1 == 0) {
+        tcg_gen_movi_tl(def, 0);
+    } else if (rs1 < 8) {
         r_rs1 = cpu_gregs[rs1];
-    else
+    } else {
         tcg_gen_ld_tl(def, cpu_regwptr, (rs1 - 8) * sizeof(target_ulong));
+    }
     return r_rs1;
 }
 
@@ -1645,20 +1646,17 @@  static inline TCGv get_src2(unsigned int insn, TCGv def)
     TCGv r_rs2 = def;
 
     if (IS_IMM) { /* immediate */
-        target_long simm;
-
-        simm = GET_FIELDs(insn, 19, 31);
-        r_rs2 = tcg_const_tl(simm); // XXX how to free?
+        target_long simm = GET_FIELDs(insn, 19, 31);
+        tcg_gen_movi_tl(def, simm);
     } else { /* register */
-        unsigned int rs2;
-
-        rs2 = GET_FIELD(insn, 27, 31);
-        if (rs2 == 0)
-            r_rs2 = tcg_const_tl(0); // XXX how to free?
-        else if (rs2 < 8)
+        unsigned int rs2 = GET_FIELD(insn, 27, 31);
+        if (rs2 == 0) {
+            tcg_gen_movi_tl(def, 0);
+        } else if (rs2 < 8) {
             r_rs2 = cpu_gregs[rs2];
-        else
+        } else {
             tcg_gen_ld_tl(def, cpu_regwptr, (rs2 - 8) * sizeof(target_ulong));
+        }
     }
     return r_rs2;
 }
@@ -1701,6 +1699,7 @@  static inline void gen_load_trap_state_at_tl(TCGv_ptr r_tsptr, TCGv_ptr cpu_env)
 static void disas_sparc_insn(DisasContext * dc)
 {
     unsigned int insn, opc, rs1, rs2, rd;
+    TCGv cpu_src1, cpu_src2, cpu_tmp1, cpu_tmp2;
     target_long simm;
 
     if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)))
@@ -1710,8 +1709,8 @@  static void disas_sparc_insn(DisasContext * dc)
 
     rd = GET_FIELD(insn, 2, 6);
 
-    cpu_src1 = tcg_temp_new(); // const
-    cpu_src2 = tcg_temp_new(); // const
+    cpu_tmp1 = cpu_src1 = tcg_temp_new();
+    cpu_tmp2 = cpu_src2 = tcg_temp_new();
 
     switch (opc) {
     case 0:                     /* branches/sethi */
@@ -4599,7 +4598,7 @@  static void disas_sparc_insn(DisasContext * dc)
         dc->npc = dc->npc + 4;
     }
  jmp_insn:
-    return;
+    goto egress;
  illegal_insn:
     {
         TCGv_i32 r_const;
@@ -4610,7 +4609,7 @@  static void disas_sparc_insn(DisasContext * dc)
         tcg_temp_free_i32(r_const);
         dc->is_br = 1;
     }
-    return;
+    goto egress;
  unimp_flush:
     {
         TCGv_i32 r_const;
@@ -4621,7 +4620,7 @@  static void disas_sparc_insn(DisasContext * dc)
         tcg_temp_free_i32(r_const);
         dc->is_br = 1;
     }
-    return;
+    goto egress;
 #if !defined(CONFIG_USER_ONLY)
  priv_insn:
     {
@@ -4633,19 +4632,19 @@  static void disas_sparc_insn(DisasContext * dc)
         tcg_temp_free_i32(r_const);
         dc->is_br = 1;
     }
-    return;
+    goto egress;
 #endif
  nfpu_insn:
     save_state(dc, cpu_cond);
     gen_op_fpexception_im(FSR_FTT_UNIMPFPOP);
     dc->is_br = 1;
-    return;
+    goto egress;
 #if !defined(CONFIG_USER_ONLY) && !defined(TARGET_SPARC64)
  nfq_insn:
     save_state(dc, cpu_cond);
     gen_op_fpexception_im(FSR_FTT_SEQ_ERROR);
     dc->is_br = 1;
-    return;
+    goto egress;
 #endif
 #ifndef TARGET_SPARC64
  ncp_insn:
@@ -4658,8 +4657,11 @@  static void disas_sparc_insn(DisasContext * dc)
         tcg_temp_free(r_const);
         dc->is_br = 1;
     }
-    return;
+    goto egress;
 #endif
+ egress:
+    tcg_temp_free(cpu_tmp1);
+    tcg_temp_free(cpu_tmp2);
 }
 
 static inline void gen_intermediate_code_internal(TranslationBlock * tb,