diff mbox

RFD: rtx_cost changes

Message ID 4CA1A622.4060408@codesourcery.com
State New
Headers show

Commit Message

Bernd Schmidt Sept. 28, 2010, 8:24 a.m. UTC
A while ago I posted this:
> Our rtx_costs infrastructure is somewhat inadequate.  Consider
> reload_cse_move2add, which converts constant loads into add
> instructions.  We have the following problems:
> 
> 1. There's no way to distinguish between a 3-operand operation and a
>    2-operand operation in rtx_costs.  On Thumb-2,
>    movs r2, #100
>    adds r2, r2, #8
>    adds r2, r3, #7
>    are 2-byte instructions, while
>    movs r8, #100
>    movw r2, #3095
>    add r2, r2, #1234
>    add r2, r3, #8
>    take four bytes.  If we only have (plus (r3) (const_int 8)) or
>    (const_int 100), we cannot meaningfully return a cost for it.  We
>    need to know the SET_DEST.
> 2. When optimizing for speed, we should prefer faster instructions, but
>    when choosing between equally fast alternatives, we should pick the
>    smaller one.  The same applies vice versa when optimizing for size.
> 3. We have no way of estimating costs for a strict_low_part load of a
>    constant (one optimization done in reload_cse_move2add).
> 4. rtx_costs returns zero for REG, even when outer_code == SET.  This is
>    almost certainly wrong everywhere; COSTS_N_INSNS (1) seems like a
>    better choice.  This isn't addressed yet in the patch, as trying to
>    change it introduces lots of changes in code generation.
> 5. In postreload, we have apples-to-oranges comparisons of the form
>    rtx_cost (x, PLUS) < rtx_cost (y, SET)
>    which makes no sense.  A constant may well be free inside a PLUS,
>    but the insn still has a cost which we'd underestimate in such a
>    case.  We should compare both in terms of outer_code == SET.
> 6. On some machines, using constants is better than using registers.
>    reload_cse_simplify_operands doesn't have a plausible cost check
>    when substituting constants, and reload_cse_move2add should also
>    do the substitution only when costs improve.  It's likely that
>    further optimization passes can more easily deal with a constant
>    load rather than an add, and on OOO machines it may reduce
>    dependencies between instructions and reduce the number of
>    register accesses in a cycle (documented as a limitation of PPro
>    and its derivatives such as Core 2).

The previous patch was unnecessarily large.  I realized that the problem
can be solved without a rtx_cost interface change if the callers pass
whole SETs to rtx_costs, so I've modified code in postreload to do
exactly this.  I still think we'll want to think about the interface at
some point.

The patch below addresses problems #1, #2 and #5 (and #6 as far as
move2add is concerned).  I've bootstrapped and regression tested this on
i686-linux and verified that the new ARM testcase passes.  I've removed
ARM-specific costs changes and for now since they caused some
undesirable effects even though they were more correct than the current
code.

Ok for the rtl.h/rtlanal.c bits?


Bernd
* postreload.c (move2add_use_add2_insn): Use full_costs for
	comparison.
	(move2add_use_add3_insn): Likewise.
	(reload_cse_move2add): Likewise.
	* rtlanal.c (get_full_rtx_cost): New function.
	* rtl.h (struct full_rtx_costs): New.
	(init_costs_to_max, init_costs_to_zero, costs_lt_p,
	costs_add_n_insns): New inline functions.
	(get_full_rtx_cost): Declare.

	* gcc.target/arm/pr40457-3.c: New test.

Comments

Mark Mitchell Sept. 28, 2010, 2:19 p.m. UTC | #1
On 9/28/2010 1:24 AM, Bernd Schmidt wrote:

> The patch below addresses problems #1, #2 and #5 (and #6 as far as
> move2add is concerned).  I've bootstrapped and regression tested this on
> i686-linux and verified that the new ARM testcase passes.  I've removed
> ARM-specific costs changes and for now since they caused some
> undesirable effects even though they were more correct than the current
> code.
> 
> Ok for the rtl.h/rtlanal.c bits?

Yes.
diff mbox

Patch

Index: postreload.c
===================================================================
--- postreload.c	(revision 164476)
+++ postreload.c	(working copy)
@@ -1645,39 +1645,45 @@  move2add_use_add2_insn (rtx reg, rtx sym
       if (INTVAL (off) == reg_offset [regno])
 	changed = validate_change (insn, &SET_SRC (pat), reg, 0);
     }
-  else if (rtx_cost (new_src, PLUS, speed) < rtx_cost (src, SET, speed)
-	   && have_add2_insn (reg, new_src))
+  else
     {
+      struct full_rtx_costs oldcst, newcst;
       rtx tem = gen_rtx_PLUS (GET_MODE (reg), reg, new_src);
-      changed = validate_change (insn, &SET_SRC (pat), tem, 0);
-    }
-  else if (sym == NULL_RTX && GET_MODE (reg) != BImode)
-    {
-      enum machine_mode narrow_mode;
-      for (narrow_mode = GET_CLASS_NARROWEST_MODE (MODE_INT);
-	   narrow_mode != VOIDmode
-	     && narrow_mode != GET_MODE (reg);
-	   narrow_mode = GET_MODE_WIDER_MODE (narrow_mode))
+
+      get_full_rtx_cost (pat, SET, &oldcst);
+      SET_SRC (pat) = tem;
+      get_full_rtx_cost (pat, SET, &newcst);
+      SET_SRC (pat) = src;
+
+      if (costs_lt_p (&newcst, &oldcst, speed)
+	  && have_add2_insn (reg, new_src))
+	changed = validate_change (insn, &SET_SRC (pat), tem, 0);	
+      else if (sym == NULL_RTX && GET_MODE (reg) != BImode)
 	{
-	  if (have_insn_for (STRICT_LOW_PART, narrow_mode)
-	      && ((reg_offset[regno]
-		   & ~GET_MODE_MASK (narrow_mode))
-		  == (INTVAL (off)
-		      & ~GET_MODE_MASK (narrow_mode))))
+	  enum machine_mode narrow_mode;
+	  for (narrow_mode = GET_CLASS_NARROWEST_MODE (MODE_INT);
+	       narrow_mode != VOIDmode
+		 && narrow_mode != GET_MODE (reg);
+	       narrow_mode = GET_MODE_WIDER_MODE (narrow_mode))
 	    {
-	      rtx narrow_reg = gen_rtx_REG (narrow_mode,
-					    REGNO (reg));
-	      rtx narrow_src = gen_int_mode (INTVAL (off),
-					     narrow_mode);
-	      rtx new_set =
-		gen_rtx_SET (VOIDmode,
-			     gen_rtx_STRICT_LOW_PART (VOIDmode,
-						      narrow_reg),
-			     narrow_src);
-	      changed = validate_change (insn, &PATTERN (insn),
-					 new_set, 0);
-	      if (changed)
-		break;
+	      if (have_insn_for (STRICT_LOW_PART, narrow_mode)
+		  && ((reg_offset[regno] & ~GET_MODE_MASK (narrow_mode))
+		      == (INTVAL (off) & ~GET_MODE_MASK (narrow_mode))))
+		{
+		  rtx narrow_reg = gen_rtx_REG (narrow_mode,
+						REGNO (reg));
+		  rtx narrow_src = gen_int_mode (INTVAL (off),
+						 narrow_mode);
+		  rtx new_set
+		    = gen_rtx_SET (VOIDmode,
+				   gen_rtx_STRICT_LOW_PART (VOIDmode,
+							    narrow_reg),
+				   narrow_src);
+		  changed = validate_change (insn, &PATTERN (insn),
+					     new_set, 0);
+		  if (changed)
+		    break;
+		}
 	    }
 	}
     }
@@ -1705,11 +1711,18 @@  move2add_use_add3_insn (rtx reg, rtx sym
   rtx pat = PATTERN (insn);
   rtx src = SET_SRC (pat);
   int regno = REGNO (reg);
-  int min_cost = INT_MAX;
   int min_regno = 0;
   bool speed = optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn));
   int i;
   bool changed = false;
+  struct full_rtx_costs oldcst, newcst, mincst;
+  rtx plus_expr;
+
+  init_costs_to_max (&mincst);
+  get_full_rtx_cost (pat, SET, &oldcst);
+
+  plus_expr = gen_rtx_PLUS (GET_MODE (reg), reg, const0_rtx);
+  SET_SRC (pat) = plus_expr;
 
   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
     if (reg_set_luid[i] > move2add_last_label_luid
@@ -1728,22 +1741,25 @@  move2add_use_add3_insn (rtx reg, rtx sym
 	   no-op moves.  */
 	if (new_src == const0_rtx)
 	  {
-	    min_cost = 0;
+	    init_costs_to_zero (&mincst);
 	    min_regno = i;
 	    break;
 	  }
 	else
 	  {
-	    int cost = rtx_cost (new_src, PLUS, speed);
-	    if (cost < min_cost)
+	    XEXP (plus_expr, 1) = new_src;
+	    get_full_rtx_cost (pat, SET, &newcst);
+
+	    if (costs_lt_p (&newcst, &mincst, speed))
 	      {
-		min_cost = cost;
+		mincst = newcst;
 		min_regno = i;
 	      }
 	  }
       }
+  SET_SRC (pat) = src;
 
-  if (min_cost < rtx_cost (src, SET, speed))
+  if (costs_lt_p (&mincst, &oldcst, speed))
     {
       rtx tem;
 
@@ -1879,18 +1895,26 @@  reload_cse_move2add (rtx first)
 			/* See above why we create (set (reg) (reg)) here.  */
 			success
 			  = validate_change (next, &SET_SRC (set), reg, 0);
-		      else if ((rtx_cost (new_src, PLUS, speed)
-				< COSTS_N_INSNS (1) + rtx_cost (src3, SET, speed))
-			       && have_add2_insn (reg, new_src))
+		      else
 			{
-			  rtx newpat = gen_rtx_SET (VOIDmode,
-						    reg,
-						    gen_rtx_PLUS (GET_MODE (reg),
-						 		  reg,
-								  new_src));
-			  success
-			    = validate_change (next, &PATTERN (next),
-					       newpat, 0);
+			  rtx old_src = SET_SRC (set);
+			  struct full_rtx_costs oldcst, newcst;
+			  rtx tem = gen_rtx_PLUS (GET_MODE (reg), reg, new_src);
+
+			  get_full_rtx_cost (set, SET, &oldcst);
+			  SET_SRC (set) = tem;
+			  get_full_rtx_cost (tem, SET, &newcst);
+			  SET_SRC (set) = old_src;
+			  costs_add_n_insns (&oldcst, 1);
+
+			  if (costs_lt_p (&newcst, &oldcst, speed)
+			      && have_add2_insn (reg, new_src))
+			    {
+			      rtx newpat = gen_rtx_SET (VOIDmode, reg, tem);
+			      success
+				= validate_change (next, &PATTERN (next),
+						   newpat, 0);
+			    }
 			}
 		      if (success)
 			delete_insn (insn);
Index: rtlanal.c
===================================================================
--- rtlanal.c	(revision 164476)
+++ rtlanal.c	(working copy)
@@ -3589,6 +3589,17 @@  rtx_cost (rtx x, enum rtx_code outer_cod
 
   return total;
 }
+
+/* Fill in the structure C with information about both speed and size rtx
+   costs for X, with outer code OUTER.  */
+
+void
+get_full_rtx_cost (rtx x, enum rtx_code outer, struct full_rtx_costs *c)
+{
+  c->speed = rtx_cost (x, outer, true);
+  c->size = rtx_cost (x, outer, false);
+}
+
 
 /* Return cost of address expression X.
    Expect that X is properly formed address reference.
Index: rtl.h
===================================================================
--- rtl.h	(revision 164476)
+++ rtl.h	(working copy)
@@ -1123,9 +1123,57 @@  rhs_regno (const_rtx x)
    not to use an rtx with this cost under any circumstances.  */
 #define MAX_COST INT_MAX
 
+/* A structure to hold all available cost information about an rtl
+   expression.  */
+struct full_rtx_costs
+{
+  int speed;
+  int size;
+};
+
+/* Initialize a full_rtx_costs structure C to the maximum cost.  */
+static inline void
+init_costs_to_max (struct full_rtx_costs *c)
+{
+  c->speed = MAX_COST;
+  c->size = MAX_COST;
+}
+
+/* Initialize a full_rtx_costs structure C to zero cost.  */
+static inline void
+init_costs_to_zero (struct full_rtx_costs *c)
+{
+  c->speed = 0;
+  c->size = 0;
+}
+
+/* Compare two full_rtx_costs structures A and B, returning true
+   if A < B when optimizing for speed.  */
+static inline bool
+costs_lt_p (struct full_rtx_costs *a, struct full_rtx_costs *b,
+	    bool speed)
+{
+  if (speed)
+    return (a->speed < b->speed
+	    || (a->speed == b->speed && a->size < b->size));
+  else
+    return (a->size < b->size
+	    || (a->size == b->size && a->speed < b->speed));
+}
+
+/* Increase both members of the full_rtx_costs structure C by the
+   cost of N insns.  */
+static inline void
+costs_add_n_insns (struct full_rtx_costs *c, int n)
+{
+  c->speed += COSTS_N_INSNS (n);
+  c->size += COSTS_N_INSNS (n);
+}
+
 extern void init_rtlanal (void);
 extern int rtx_cost (rtx, enum rtx_code, bool);
 extern int address_cost (rtx, enum machine_mode, addr_space_t, bool);
+extern void get_full_rtx_cost (rtx, enum rtx_code, struct full_rtx_costs *);
 extern unsigned int subreg_lsb (const_rtx);
 extern unsigned int subreg_lsb_1 (enum machine_mode, enum machine_mode,
 				  unsigned int);
Index: testsuite/gcc.target/arm/pr40457-3.c
===================================================================
--- testsuite/gcc.target/arm/pr40457-3.c	(revision 0)
+++ testsuite/gcc.target/arm/pr40457-3.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-options "-Os" }  */
+/* { dg-do compile } */
+
+void foo(int* p)
+{
+  p[0] = 1;
+  p[1] = 0;
+}
+
+/* { dg-final { scan-assembler "stm" } } */