diff mbox

[RFC] costs and it's use in assign_reg_parm

Message ID 542D0D4D.3000709@arm.com
State New
Headers show

Commit Message

Ramana Radhakrishnan Oct. 2, 2014, 8:31 a.m. UTC
Hi,

    I've been digging into why on AArch64 we generate pretty bad code 
for the following testcase.

void g2(float, float, float, float, float, float, float, float);

void f2a(void)
{
    float x0 = 1.0, x1 = 2.0, x2 = 3.0, x3 = 4.0, x4 = 5.0, x5 = 6.0, x6 
= 7.0, x7 = 8.0;
    float x8 = 0.5, x9 = 1.5, x10 = 2.5, x11 = 0.25, x12 = 0.125, x13 = 
3.5, x14 = 0.75, x15 = 1.25;

    g2(x0, x1, x2, x3, x4, x5, x6, x7);
    g2(x8, x9, x10, x11, x12, x13, x14, x15);
    g2(x0, x1, x2, x3, x4, x5, x6, x7);
    g2(x8, x9, x10, x11, x12, x13, x14, x15);
}

And a couple of items caught my attention.

For one the backend doesn't set the costs of a reg-reg move to be the 
same as a reg-const move. In the AArch64 backend the approach appears to 
be in line with the documentation which is to set the costs of various 
instructions relative to a fast integer instruction. The hack to 
aarch64.c in the attached patch is for setting the cost properly for a 
reg-reg move of the appropriate mode and is only for demonstration 
purposes. I expect this to be replaced by an equivalent bit of code in 
the backend to achieve the same thing.

However the code in precompute_register_parameters assumes that the 
value is forced into a register if the set_src_cost of a constant is  > 
COSTS_N_INSNS(1). Now this appears to be checking the cost of a set of 
an FP immediate constant to a register and the backend not unreasonably 
sets it to an appropriate cost. Now to assume that this number should 
always be less than 1 is really not appropriate.

The same can be achieved with removing the fpconst case in 
aarch64.c:rtx_costs but ...

Instead of putting in what's effectively a lie in the backend, should we 
just be moving the midend to a world where all such numbers are compared 
to costs from the backend rather than relying on magic numbers. The 
costs comparison logic is similar to whats used in lower-subreg. The 
thought was to move this to a common area (Richard Sandiford suggested 
expmed.h in a private conversation) so that we had common APIs to check 
the cost of a SET in this form rather than relying on the rtx_cost 
interface.

Some of you might ask what's the impact on other backends, I still need 
to do the due diligence there with various test programs but my 
expectation based on reading the code is that a sample of backends (x86, 
mips, aarch64 and powerpc) handle the reg-reg move cost with a "set" 
already and I would expect the same to be in line with other costs. 
AArch32 does not but I'll take care of that in a follow up.

Longer term should we move towards cleaning up such magic numbers from 
the mid-end and that would make for a more maintainable compiler IMHO.

Thoughts ?

Lightly tested with the testcase and nothing more.


regards
Ramana



2a:
         fmov    s23, 8.0e+0
         fmov    s22, 7.0e+0
         fmov    s21, 6.0e+0
         fmov    s20, 5.0e+0
         fmov    s19, 4.0e+0
         fmov    s18, 3.0e+0
         fmov    s17, 2.0e+0
         fmov    s16, 1.0e+0
         stp     x29, x30, [sp, -112]!
         fmov    s7, s23
         fmov    s6, s22
         add     x29, sp, 0
         fmov    s5, s21
         fmov    s4, s20
         stp     d8, d9, [sp, 16]
         fmov    s3, s19
         stp     d10, d11, [sp, 32]
         fmov    s2, s18
         stp     d12, d13, [sp, 48]
         fmov    s1, s17
         stp     d14, d15, [sp, 64]
         fmov    s0, s16
         fmov    s15, 1.25e+0
         fmov    s14, 7.5e-1
         fmov    s13, 3.5e+0
         fmov    s12, 1.25e-1
         fmov    s11, 2.5e-1
         fmov    s10, 2.5e+0
         fmov    s9, 1.5e+0
         fmov    s8, 5.0e-1
         str     s23, [x29, 80]
         str     s22, [x29, 84]
         str     s21, [x29, 88]
         str     s20, [x29, 92]
         str     s19, [x29, 96]
         str     s18, [x29, 100]
         str     s17, [x29, 104]
         str     s16, [x29, 108]
         bl      g2
         fmov    s7, s15
         fmov    s6, s14
         fmov    s5, s13
         fmov    s4, s12
         fmov    s3, s11
         fmov    s2, s10
         fmov    s1, s9
         fmov    s0, s8
         bl      g2
         ldr     s23, [x29, 80]




TO

f2a:
         stp     x29, x30, [sp, -16]!
         fmov    s7, 8.0e+0
         add     x29, sp, 0
         fmov    s6, 7.0e+0
         fmov    s5, 6.0e+0
         fmov    s4, 5.0e+0
         fmov    s3, 4.0e+0
         fmov    s2, 3.0e+0
         fmov    s1, 2.0e+0
         fmov    s0, 1.0e+0
         bl      g2
         fmov    s7, 1.25e+0
         fmov    s6, 7.5e-1
         fmov    s5, 3.5e+0
         fmov    s4, 1.25e-1
         fmov    s3, 2.5e-1
         fmov    s2, 2.5e+0
         fmov    s1, 1.5e+0
         fmov    s0, 5.0e-1
         bl      g2
         fmov    s7, 8.0e+0
         fmov    s6, 7.0e+0
         fmov    s5, 6.0e+0
         fmov    s4, 5.0e+0
         fmov    s3, 4.0e+0
         fmov    s2, 3.0e+0
         fmov    s1, 2.0e+0
         fmov    s0, 1.0e+0
         bl      g2
         ldp     x29, x30, [sp], 16
         fmov    s7, 1.25e+0
         fmov    s6, 7.5e-1
         fmov    s5, 3.5e+0
         fmov    s4, 1.25e-1
         fmov    s3, 2.5e-1
         fmov    s2, 2.5e+0
         fmov    s1, 1.5e+0
         fmov    s0, 5.0e-1
         b       g2

Comments

Jeff Law Oct. 3, 2014, 6:36 p.m. UTC | #1
On 10/02/14 02:31, Ramana Radhakrishnan wrote:
>
> And a couple of items caught my attention.
>
> For one the backend doesn't set the costs of a reg-reg move to be the
> same as a reg-const move. In the AArch64 backend the approach appears to
> be in line with the documentation which is to set the costs of various
> instructions relative to a fast integer instruction. The hack to
> aarch64.c in the attached patch is for setting the cost properly for a
> reg-reg move of the appropriate mode and is only for demonstration
> purposes. I expect this to be replaced by an equivalent bit of code in
> the backend to achieve the same thing.
rtx_cost and its friends have always been a problem and it's always felt 
like a design problem.   The true cost of something is so context 
dependent I'm not really sure how to build an API to do this in a sane way.


Regardless, the first thing I see when I look at the aarch64 rtx costing 
bits is that it rarely looks at modes.  So it's entirely possible that 
if floats/doubles have a higher cost than integers that it needs a bit 
of hacking to bring the result into line for non-integer modes.



>
> However the code in precompute_register_parameters assumes that the
> value is forced into a register if the set_src_cost of a constant is  >
> COSTS_N_INSNS(1). Now this appears to be checking the cost of a set of
> an FP immediate constant to a register and the backend not unreasonably
> sets it to an appropriate cost. Now to assume that this number should
> always be less than 1 is really not appropriate.
Agreed. Also note there's a bit of a break in the model -- namely that 
we're comparing the cost of a particular operand vs the cost of insns. 
I've always found this to be, umm lame and a source of confusion when 
folks have been doing backend tuning.


> Instead of putting in what's effectively a lie in the backend, should we
> just be moving the midend to a world where all such numbers are compared
> to costs from the backend rather than relying on magic numbers. The
> costs comparison logic is similar to whats used in lower-subreg. The
> thought was to move this to a common area (Richard Sandiford suggested
> expmed.h in a private conversation) so that we had common APIs to check
> the cost of a SET in this form rather than relying on the rtx_cost
> interface.
Agreed.

Jeff
Segher Boessenkool Oct. 4, 2014, 2:54 a.m. UTC | #2
On Fri, Oct 03, 2014 at 12:36:37PM -0600, Jeff Law wrote:
> rtx_cost and its friends have always been a problem and it's always felt 
> like a design problem.   The true cost of something is so context 
> dependent I'm not really sure how to build an API to do this in a sane way.

In many places it already would help if we had an rtx_insn_cost that
gets a full insn as input.  Maybe an rtx_set_src_cost for the other
cases where "cost of an RTX" makes any sense at all.


Segher
Richard Sandiford Oct. 5, 2014, 9:32 a.m. UTC | #3
Ramana Radhakrishnan <ramana.radhakrishnan@arm.com> writes:
> Hi,
>
>     I've been digging into why on AArch64 we generate pretty bad code 
> for the following testcase.
>
> void g2(float, float, float, float, float, float, float, float);
>
> void f2a(void)
> {
>     float x0 = 1.0, x1 = 2.0, x2 = 3.0, x3 = 4.0, x4 = 5.0, x5 = 6.0, x6 
> = 7.0, x7 = 8.0;
>     float x8 = 0.5, x9 = 1.5, x10 = 2.5, x11 = 0.25, x12 = 0.125, x13 = 
> 3.5, x14 = 0.75, x15 = 1.25;
>
>     g2(x0, x1, x2, x3, x4, x5, x6, x7);
>     g2(x8, x9, x10, x11, x12, x13, x14, x15);
>     g2(x0, x1, x2, x3, x4, x5, x6, x7);
>     g2(x8, x9, x10, x11, x12, x13, x14, x15);
> }
>
> And a couple of items caught my attention.
>
> For one the backend doesn't set the costs of a reg-reg move to be the 
> same as a reg-const move. In the AArch64 backend the approach appears to 
> be in line with the documentation which is to set the costs of various 
> instructions relative to a fast integer instruction. The hack to 
> aarch64.c in the attached patch is for setting the cost properly for a 
> reg-reg move of the appropriate mode and is only for demonstration 
> purposes. I expect this to be replaced by an equivalent bit of code in 
> the backend to achieve the same thing.
>
> However the code in precompute_register_parameters assumes that the 
> value is forced into a register if the set_src_cost of a constant is  > 
> COSTS_N_INSNS(1). Now this appears to be checking the cost of a set of 
> an FP immediate constant to a register and the backend not unreasonably 
> sets it to an appropriate cost. Now to assume that this number should 
> always be less than 1 is really not appropriate.
>
> The same can be achieved with removing the fpconst case in 
> aarch64.c:rtx_costs but ...
>
> Instead of putting in what's effectively a lie in the backend, should we 
> just be moving the midend to a world where all such numbers are compared 
> to costs from the backend rather than relying on magic numbers.

Just to throw out what I said in a private conversation in case anyone
else feels the same way...

I think this is really a bug in the backend.  The backend is assigning a
cost of COSTS_N_INSNS (3) to a floating-point constant not because the
constant itself is expensive -- it's actually as cheap as a register
in this context -- but because the backend considers floating-point
moves to be 3 times more expensive than cheap integer moves.

The calls.c code is written the way it is because of:

  switch (code)
    {
    case REG:
      return 0;

in rtx_costs.  This cannot be overridden “by design” (not saying that
it's a good design).  So the backend is creating a situation where
the cost of an SFmode CONST_DOUBLE SET_SRC is COSTS_N_INSNS (3) more
expensive than an SFmode REG SET_SRC, whereas they actually have the
same cost.  And in Ramana's case we want them to have the same cost.
So I think the backend is just providing misleading information here.

That's what's leading calls.c to think that the constant is a lot
more expensive than a register.  It's not asking for the cost of
an operation.  It's asking for the cost of one rvalue object compared
to the cost of a register.  And when the cost of a register is pegged
at zero, it's reasonable to assume that a constant that comes back as
COSTS_N_INSNS (3) is much more espensive than a register.

lower-subreg.c uses the cost of (set ...) rtxes to calculate the cost
of a single move in a wide mode vs. the cost of multiple moves in
smaller modes.  IMO this is consistent with counting the cost of an
addition against the (plus ...) rtx, rather than against the operands
of the plus.  So the backend can use the SET costs to say that an SFmode
move is 3 times more expensive than an SImode move.  But once it's done
that, it shouldn't also say that a CONST_DOUBLE with outer code SET
is COSTS_N_INSNS (3), since that would be double-counting.  It can still
use COSTS_N_INSNS (3) in situations that don't allow constants, such as
outer code DIV (or probably just outer code != SET) since in that case
we are calculating the cost of a division plus a separate move.

I agree it would good in principle to completely overhaul the way
costs are calculated, but that discussion has been had several times
and it's not likely anyone will find time to do it.  But while we
keep the scheme that a REG has cost 0, anything that is as cheap
as a register in a given context (decided by outer code) should
have cost 0 too.

FWIW, this is what the MIPS backend does.  It seemed to work well
in practice and I think it's consistent with the current cost model
(although I know others disagree :-))

Thanks,
Richard
diff mbox

Patch

diff --git a/gcc/calls.c b/gcc/calls.c
index 345331f..a34da07 100644
--- a/gcc/calls.c
+++ b/gcc/calls.c
@@ -813,11 +813,26 @@  call_expr_flags (const_tree t)
 
    Set REG_PARM_SEEN if we encounter a register parameter.  */
 
+/* RTXes used while computing costs.  */
+struct cost_rtxes {
+  /* Source and target registers.  */
+  rtx source;
+  rtx target;
+  /* A SET of TARGET.  */
+  rtx set;
+};
+
+
 static void
 precompute_register_parameters (int num_actuals, struct arg_data *args,
 				int *reg_parm_seen)
 {
   int i;
+  static struct cost_rtxes cost_rtx;
+
+  cost_rtx.target = gen_rtx_REG (word_mode, FIRST_PSEUDO_REGISTER);
+  cost_rtx.source = gen_rtx_REG (word_mode, FIRST_PSEUDO_REGISTER + 1);
+  cost_rtx.set = gen_rtx_SET (VOIDmode, cost_rtx.target, cost_rtx.source);
 
   *reg_parm_seen = 0;
 
@@ -825,6 +840,8 @@  precompute_register_parameters (int num_actuals, struct arg_data *args,
     if (args[i].reg != 0 && ! args[i].pass_on_stack)
       {
 	*reg_parm_seen = 1;
+	PUT_MODE (cost_rtx.target, args[i].mode);
+	PUT_MODE (cost_rtx.source, args[i].mode);
 
 	if (args[i].value == 0)
 	  {
@@ -872,8 +889,13 @@  precompute_register_parameters (int num_actuals, struct arg_data *args,
 		     || (GET_CODE (args[i].value) == SUBREG
 			 && REG_P (SUBREG_REG (args[i].value)))))
 		 && args[i].mode != BLKmode
-		 && set_src_cost (args[i].value, optimize_insn_for_speed_p ())
-		    > COSTS_N_INSNS (1)
+		 && (args[i].mode != VOIDmode
+		     && optimize
+		     && CONSTANT_P (args[i].value)
+		     && (set_src_cost (args[i].value, 
+				       optimize_insn_for_speed_p ())
+			 > set_src_cost (cost_rtx.set, 
+					 optimize_insn_for_speed_p ())))
 		 && ((*reg_parm_seen
 		      && targetm.small_register_classes_for_mode_p (args[i].mode))
 		     || optimize))
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4e0cba8..91ece7b 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -5083,6 +5083,7 @@  aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
 			      / UNITS_PER_WORD;
               *cost = COSTS_N_INSNS (n_minus_1 + 1);
+	      *cost = COSTS_N_INSNS (3);
             }
           else
 	    /* Cost is just the cost of the RHS of the set.  */