Index: fwprop.c
===================================================================
--- fwprop.c	(revision 162823)
+++ fwprop.c	(working copy)
@@ -413,7 +413,8 @@ should_replace_address (rtx old_rtx, rtx
      eliminating the most insns without additional costs, and it
      is the same that cse.c used to do.  */
   if (gain == 0)
-    gain = rtx_cost (new_rtx, SET, speed) - rtx_cost (old_rtx, SET, speed);
+    gain = (rtx_cost2 (new_rtx, SET, NULL_RTX, speed)
+	    - rtx_cost2 (old_rtx, SET, NULL_RTX, speed));
 
   return (gain > 0);
 }
@@ -962,7 +963,7 @@ try_fwprop_subst (df_ref use, rtx *loc, 
      multiple sets.  If so, assume the cost of the new instruction is
      not greater than the old one.  */
   if (set)
-    old_cost = rtx_cost (SET_SRC (set), SET, speed);
+    old_cost = rtx_cost2 (SET_SRC (set), SET, SET_DEST (set), speed);
   if (dump_file)
     {
       fprintf (dump_file, "\nIn insn %d, replacing\n ", INSN_UID (insn));
@@ -983,7 +984,7 @@ try_fwprop_subst (df_ref use, rtx *loc, 
 
   else if (DF_REF_TYPE (use) == DF_REF_REG_USE
 	   && set
-	   && rtx_cost (SET_SRC (set), SET, speed) > old_cost)
+	   && rtx_cost2 (SET_SRC (set), SET, SET_DEST (set), speed) > old_cost)
     {
       if (dump_file)
 	fprintf (dump_file, "Changes to insn %d not profitable\n",
Index: doc/tm.texi
===================================================================
--- doc/tm.texi	(revision 162823)
+++ doc/tm.texi	(working copy)
@@ -6373,14 +6373,17 @@ Define this macro if a non-short-circuit
 @code{BRANCH_COST} is greater than or equal to the value 2.
 @end defmac
 
-@deftypefn {Target Hook} bool TARGET_RTX_COSTS (rtx @var{x}, int @var{code}, int @var{outer_code}, int *@var{total}, bool @var{speed})
+@deftypefn {Target Hook} bool TARGET_RTX_COSTS2 (rtx @var{x}, int @var{code}, int @var{outer_code}, rtx @var{set_lhs}, int *@var{total}, bool @var{speed})
 This target hook describes the relative costs of RTL expressions.
 
 The cost may depend on the precise form of the expression, which is
 available for examination in @var{x}, and the rtx code of the expression
 in which it is contained, found in @var{outer_code}.  @var{code} is the
 expression code---redundant, since it can be obtained with
-@code{GET_CODE (@var{x})}.
+@code{GET_CODE (@var{x})}.  @var{set_lhs} may be @code{NULL_RTX}, otherwise
+it contains the @code{SET_DEST} of a @code{SET} for which the costs are
+being computed.  In the latter case, @var{x} is known to occur somewhere
+in the @code{SET_SRC}.
 
 In implementing this hook, you can use the construct
 @code{COSTS_N_INSNS (@var{n})} to specify a cost equal to @var{n} fast
@@ -6400,10 +6403,15 @@ The hook returns true when all subexpres
 processed, and false when @code{rtx_cost} should recurse.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_RTX_COSTS (rtx @var{x}, int @var{code}, int @var{outer_code}, int *@var{total}, bool @var{speed})
+This target hook is an old form of @code{TARGET_RTX_COSTS2}, which omits
+the @var{set_lhs} argument.  It should not be used in new code.
+@end deftypefn
+
 @deftypefn {Target Hook} int TARGET_ADDRESS_COST (rtx @var{address}, bool @var{speed})
 This hook computes the cost of an addressing mode that contains
 @var{address}.  If not defined, the cost is computed from
-the @var{address} expression and the @code{TARGET_RTX_COST} hook.
+the @var{address} expression and the @code{TARGET_RTX_COSTS2} hook.
 
 For most CISC machines, the default cost is a good approximation of the
 true cost of the addressing mode.  However, on RISC machines, all
Index: doc/tm.texi.in
===================================================================
--- doc/tm.texi.in	(revision 162823)
+++ doc/tm.texi.in	(working copy)
@@ -6373,14 +6373,17 @@ Define this macro if a non-short-circuit
 @code{BRANCH_COST} is greater than or equal to the value 2.
 @end defmac
 
-@hook TARGET_RTX_COSTS
+@hook TARGET_RTX_COSTS2
 This target hook describes the relative costs of RTL expressions.
 
 The cost may depend on the precise form of the expression, which is
 available for examination in @var{x}, and the rtx code of the expression
 in which it is contained, found in @var{outer_code}.  @var{code} is the
 expression code---redundant, since it can be obtained with
-@code{GET_CODE (@var{x})}.
+@code{GET_CODE (@var{x})}.  @var{set_lhs} may be @code{NULL_RTX}, otherwise
+it contains the @code{SET_DEST} of a @code{SET} for which the costs are
+being computed.  In the latter case, @var{x} is known to occur somewhere
+in the @code{SET_SRC}.
 
 In implementing this hook, you can use the construct
 @code{COSTS_N_INSNS (@var{n})} to specify a cost equal to @var{n} fast
@@ -6400,10 +6403,15 @@ The hook returns true when all subexpres
 processed, and false when @code{rtx_cost} should recurse.
 @end deftypefn
 
+@hook TARGET_RTX_COSTS
+This target hook is an old form of @code{TARGET_RTX_COSTS2}, which omits
+the @var{set_lhs} argument.  It should not be used in new code.
+@end deftypefn
+
 @hook TARGET_ADDRESS_COST
 This hook computes the cost of an addressing mode that contains
 @var{address}.  If not defined, the cost is computed from
-the @var{address} expression and the @code{TARGET_RTX_COST} hook.
+the @var{address} expression and the @code{TARGET_RTX_COSTS2} hook.
 
 For most CISC machines, the default cost is a good approximation of the
 true cost of the addressing mode.  However, on RISC machines, all
Index: targhooks.c
===================================================================
--- targhooks.c	(revision 162823)
+++ targhooks.c	(working copy)
@@ -1198,6 +1198,16 @@ default_register_move_cost (enum machine
 }
 
+/* Default implementation of the rtx_costs2 target hook: ignore SET_LHS
+   and defer to the older rtx_costs hook, which takes no such argument.  */
+
 bool
+default_rtx_costs2 (rtx x, int code, int outer_code,
+		    rtx set_lhs ATTRIBUTE_UNUSED, int *total, bool speed)
+{
+  return targetm.rtx_costs (x, code, outer_code, total, speed);
+}
+
+bool
 default_profile_before_prologue (void)
 {
 #ifdef PROFILE_BEFORE_PROLOGUE
Index: targhooks.h
===================================================================
--- targhooks.h	(revision 162823)
+++ targhooks.h	(working copy)
@@ -146,6 +146,7 @@ extern bool default_addr_space_subset_p 
 extern rtx default_addr_space_convert (rtx, tree, tree);
 extern unsigned int default_case_values_threshold (void);
 extern bool default_have_conditional_execution (void);
+extern bool default_rtx_costs2 (rtx, int, int, rtx, int *, bool);
 extern int default_memory_move_cost (enum machine_mode, reg_class_t, bool);
 extern int default_register_move_cost (enum machine_mode, reg_class_t,
 				       reg_class_t);
Index: target.def
===================================================================
--- target.def	(revision 162823)
+++ target.def	(working copy)
@@ -1408,10 +1408,20 @@ DEFHOOK
 
 /* Compute a (partial) cost for rtx X.  Return true if the complete
    cost has been computed, and false if subexpressions should be
-   scanned.  In either case, *TOTAL contains the cost result.  */
+   scanned.  In either case, *TOTAL contains the cost result.
+   If SET_LHS is nonnull, it means we're computing the costs of the
+   right-hand-side expression of a SET whose destination is SET_LHS.  */
 /* Note that CODE and OUTER_CODE ought to be RTX_CODE, but that's
    not necessarily defined at this point.  */
 DEFHOOK
+(rtx_costs2,
+ "",
+ bool, (rtx x, int code, int outer_code, rtx set_lhs, int *total, bool speed),
+ default_rtx_costs2)
+
+/* The old form of rtx_costs2, without the SET_LHS argument.  This should not
+   be used in new code.  */
+DEFHOOK
 (rtx_costs,
  "",
  bool, (rtx x, int code, int outer_code, int *total, bool speed),
Index: optabs.c
===================================================================
--- optabs.c	(revision 162823)
+++ optabs.c	(working copy)
@@ -1364,7 +1364,8 @@ avoid_expensive_constant (enum machine_m
   if (mode != VOIDmode
       && optimize
       && CONSTANT_P (x)
-      && rtx_cost (x, binoptab->code, speed) > rtx_cost (x, SET, speed))
+      && (rtx_cost2 (x, binoptab->code, NULL_RTX, speed)
+	  > rtx_cost2 (x, SET, NULL_RTX, speed)))
     {
       if (CONST_INT_P (x))
 	{
@@ -3997,12 +3998,12 @@ prepare_cmp_insn (rtx x, rtx y, enum rtx
 
   /* If we are optimizing, force expensive constants into a register.  */
   if (CONSTANT_P (x) && optimize
-      && (rtx_cost (x, COMPARE, optimize_insn_for_speed_p ())
+      && (rtx_cost2 (x, COMPARE, NULL_RTX, optimize_insn_for_speed_p ())
           > COSTS_N_INSNS (1)))
     x = force_reg (mode, x);
 
   if (CONSTANT_P (y) && optimize
-      && (rtx_cost (y, COMPARE, optimize_insn_for_speed_p ())
+      && (rtx_cost2 (y, COMPARE, NULL_RTX, optimize_insn_for_speed_p ())
           > COSTS_N_INSNS (1)))
     y = force_reg (mode, y);
 
Index: postreload.c
===================================================================
--- postreload.c	(revision 162823)
+++ postreload.c	(working copy)
@@ -231,7 +231,7 @@ reload_cse_simplify_set (rtx set, rtx in
 {
   int did_change = 0;
   int dreg;
-  rtx src;
+  rtx src, dest;
   enum reg_class dclass;
   int old_cost;
   cselib_val *val;
@@ -241,7 +241,8 @@ reload_cse_simplify_set (rtx set, rtx in
 #endif
   bool speed = optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn));
 
-  dreg = true_regnum (SET_DEST (set));
+  dest = SET_DEST (set);
+  dreg = true_regnum (dest);
   if (dreg < 0)
     return 0;
 
@@ -274,7 +275,7 @@ reload_cse_simplify_set (rtx set, rtx in
     old_cost = register_move_cost (GET_MODE (src),
 				   REGNO_REG_CLASS (REGNO (src)), dclass);
   else
-    old_cost = rtx_cost (src, SET, speed);
+    old_cost = rtx_cost2 (src, SET, dest, speed);
 
   for (l = val->locs; l; l = l->next)
     {
@@ -309,7 +310,7 @@ reload_cse_simplify_set (rtx set, rtx in
 	      this_rtx = GEN_INT (this_val);
 	    }
 #endif
-	  this_cost = rtx_cost (this_rtx, SET, speed);
+	  this_cost = rtx_cost2 (this_rtx, SET, dest, speed);
 	}
       else if (REG_P (this_rtx))
 	{
@@ -317,7 +318,7 @@ reload_cse_simplify_set (rtx set, rtx in
 	  if (extend_op != UNKNOWN)
 	    {
 	      this_rtx = gen_rtx_fmt_e (extend_op, word_mode, this_rtx);
-	      this_cost = rtx_cost (this_rtx, SET, speed);
+	      this_cost = rtx_cost2 (this_rtx, SET, dest, speed);
 	    }
 	  else
 #endif
@@ -373,11 +374,11 @@ reload_cse_simplify_set (rtx set, rtx in
 static int
 reload_cse_simplify_operands (rtx insn, rtx testreg)
 {
+  bool speed = optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn));
   int i, j;
 
   /* For each operand, all registers that are equivalent to it.  */
   HARD_REG_SET equiv_regs[MAX_RECOG_OPERANDS];
-
   const char *constraints[MAX_RECOG_OPERANDS];
 
   /* Vector recording how bad an alternative is.  */
@@ -392,6 +393,9 @@ reload_cse_simplify_operands (rtx insn, 
   /* Array of alternatives, sorted in order of decreasing desirability.  */
   int *alternative_order;
 
+  rtx set;
+  struct full_rtx_costs oldcst, newcst;
+
   extract_insn (insn);
 
   if (recog_data.n_alternatives == 0 || recog_data.n_operands == 0)
@@ -407,6 +411,12 @@ reload_cse_simplify_operands (rtx insn, 
   memset (alternative_reject, 0, recog_data.n_alternatives * sizeof (int));
   memset (alternative_nregs, 0, recog_data.n_alternatives * sizeof (int));
 
+  init_costs_to_zero (&oldcst);
+  init_costs_to_zero (&newcst);
+  set = single_set (insn);
+  if (set)
+    get_full_rtx_cost (SET_SRC (set), SET, SET_DEST (set), &oldcst);
+
   /* For each operand, find out which regs are equivalent.  */
   for (i = 0; i < recog_data.n_operands; i++)
     {
@@ -570,16 +580,12 @@ reload_cse_simplify_operands (rtx insn, 
 		case ',': case '\0':
 		  /* See if REGNO fits this alternative, and set it up as the
 		     replacement register if we don't have one for this
-		     alternative yet and the operand being replaced is not
-		     a cheap CONST_INT.  */
+		     alternative yet.  If the operand is a CONST_INT, do this only
+		     if we are able to calculate costs.  */
 		  if (op_alt_regno[i][j] == -1
 		      && recog_data.alternative_enabled_p[j]
 		      && reg_fits_class_p (testreg, rclass, 0, mode)
-		      && (!CONST_INT_P (recog_data.operand[i])
-			  || (rtx_cost (recog_data.operand[i], SET,
-			  		optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn)))
-			      > rtx_cost (testreg, SET,
-			  		optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))))))
+		      && (!CONST_INT_P (recog_data.operand[i]) || set))
 		    {
 		      alternative_nregs[j]++;
 		      op_alt_regno[i][j] = regno;
@@ -657,6 +663,15 @@ reload_cse_simplify_operands (rtx insn, 
 		       gen_rtx_REG (mode, op_alt_regno[op][j]), 1);
     }
 
+  /* Back out the substitutions if they made the insn more expensive.  */
+  if (set)
+    get_full_rtx_cost (SET_SRC (set), SET, SET_DEST (set), &newcst);
+  if (set && costs_lt_p (&oldcst, &newcst, speed))
+    {
+      cancel_changes (0);
+      return 0;
+    }
+
   return apply_change_group ();
 }
 
@@ -913,12 +928,13 @@ try_replace_in_use (struct reg_use *use,
 	  && CONSTANT_P (XEXP (SET_SRC (new_set), 1)))
 	{
 	  rtx new_src;
-	  int old_cost = rtx_cost (SET_SRC (new_set), SET, speed);
+	  int old_cost = rtx_cost2 (SET_SRC (new_set), SET, SET_DEST (new_set),
+				    speed);
 
 	  gcc_assert (rtx_equal_p (XEXP (SET_SRC (new_set), 0), reg));
 	  new_src = simplify_replace_rtx (SET_SRC (new_set), reg, src);
 
-	  if (rtx_cost (new_src, SET, speed) <= old_cost
+	  if (rtx_cost2 (new_src, SET, SET_DEST (new_set), speed) <= old_cost
 	      && validate_change (use_insn, &SET_SRC (new_set),
 				  new_src, 0))
 	    return true;
@@ -1645,39 +1661,43 @@ move2add_use_add2_insn (rtx reg, rtx sym
       if (INTVAL (off) == reg_offset [regno])
 	changed = validate_change (insn, &SET_SRC (pat), reg, 0);
     }
-  else if (rtx_cost (new_src, PLUS, speed) < rtx_cost (src, SET, speed)
-	   && have_add2_insn (reg, new_src))
+  else
     {
+      struct full_rtx_costs oldcst, newcst;
       rtx tem = gen_rtx_PLUS (GET_MODE (reg), reg, new_src);
-      changed = validate_change (insn, &SET_SRC (pat), tem, 0);
-    }
-  else if (sym == NULL_RTX && GET_MODE (reg) != BImode)
-    {
-      enum machine_mode narrow_mode;
-      for (narrow_mode = GET_CLASS_NARROWEST_MODE (MODE_INT);
-	   narrow_mode != VOIDmode
-	     && narrow_mode != GET_MODE (reg);
-	   narrow_mode = GET_MODE_WIDER_MODE (narrow_mode))
+
+      get_full_rtx_cost (src, SET, SET_DEST (pat), &oldcst);
+      get_full_rtx_cost (tem, SET, SET_DEST (pat), &newcst);
+
+      if (costs_lt_p (&newcst, &oldcst, speed)
+	  && have_add2_insn (reg, new_src))
+	changed = validate_change (insn, &SET_SRC (pat), tem, 0);
+      else if (sym == NULL_RTX && GET_MODE (reg) != BImode)
 	{
-	  if (have_insn_for (STRICT_LOW_PART, narrow_mode)
-	      && ((reg_offset[regno]
-		   & ~GET_MODE_MASK (narrow_mode))
-		  == (INTVAL (off)
-		      & ~GET_MODE_MASK (narrow_mode))))
+	  enum machine_mode narrow_mode;
+	  for (narrow_mode = GET_CLASS_NARROWEST_MODE (MODE_INT);
+	       narrow_mode != VOIDmode
+		 && narrow_mode != GET_MODE (reg);
+	       narrow_mode = GET_MODE_WIDER_MODE (narrow_mode))
 	    {
-	      rtx narrow_reg = gen_rtx_REG (narrow_mode,
-					    REGNO (reg));
-	      rtx narrow_src = gen_int_mode (INTVAL (off),
-					     narrow_mode);
-	      rtx new_set =
-		gen_rtx_SET (VOIDmode,
-			     gen_rtx_STRICT_LOW_PART (VOIDmode,
-						      narrow_reg),
-			     narrow_src);
-	      changed = validate_change (insn, &PATTERN (insn),
-					 new_set, 0);
-	      if (changed)
-		break;
+	      if (have_insn_for (STRICT_LOW_PART, narrow_mode)
+		  && ((reg_offset[regno] & ~GET_MODE_MASK (narrow_mode))
+		      == (INTVAL (off) & ~GET_MODE_MASK (narrow_mode))))
+		{
+		  rtx narrow_reg = gen_rtx_REG (narrow_mode,
+						REGNO (reg));
+		  rtx narrow_src = gen_int_mode (INTVAL (off),
+						 narrow_mode);
+		  rtx new_set
+		    = gen_rtx_SET (VOIDmode,
+				   gen_rtx_STRICT_LOW_PART (VOIDmode,
+							    narrow_reg),
+				   narrow_src);
+		  changed = validate_change (insn, &PATTERN (insn),
+					     new_set, 0);
+		  if (changed)
+		    break;
+		}
 	    }
 	}
     }
@@ -1705,11 +1725,16 @@ move2add_use_add3_insn (rtx reg, rtx sym
   rtx pat = PATTERN (insn);
   rtx src = SET_SRC (pat);
   int regno = REGNO (reg);
-  int min_cost = INT_MAX;
   int min_regno = 0;
   bool speed = optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn));
   int i;
   bool changed = false;
+  struct full_rtx_costs oldcst, newcst, mincst;
+  rtx plus_expr;
+
+  init_costs_to_max (&mincst);
+
+  plus_expr = gen_rtx_PLUS (GET_MODE (reg), reg, const0_rtx);
 
   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
     if (reg_set_luid[i] > move2add_last_label_luid
@@ -1728,22 +1753,26 @@ move2add_use_add3_insn (rtx reg, rtx sym
 	   no-op moves.  */
 	if (new_src == const0_rtx)
 	  {
-	    min_cost = 0;
+	    init_costs_to_zero (&mincst);
 	    min_regno = i;
 	    break;
 	  }
 	else
 	  {
-	    int cost = rtx_cost (new_src, PLUS, speed);
-	    if (cost < min_cost)
+	    XEXP (plus_expr, 1) = new_src;
+	    get_full_rtx_cost (plus_expr, SET, SET_DEST (pat), &newcst);
+
+	    if (costs_lt_p (&newcst, &mincst, speed))
 	      {
-		min_cost = cost;
+		mincst = newcst;
 		min_regno = i;
 	      }
 	  }
       }
 
-  if (min_cost < rtx_cost (src, SET, speed))
+  get_full_rtx_cost (src, SET, SET_DEST (pat), &oldcst);
+
+  if (costs_lt_p (&mincst, &oldcst, speed))
     {
       rtx tem;
 
@@ -1879,18 +1908,23 @@ reload_cse_move2add (rtx first)
 			/* See above why we create (set (reg) (reg)) here.  */
 			success
 			  = validate_change (next, &SET_SRC (set), reg, 0);
-		      else if ((rtx_cost (new_src, PLUS, speed)
-				< COSTS_N_INSNS (1) + rtx_cost (src3, SET, speed))
-			       && have_add2_insn (reg, new_src))
+		      else
 			{
-			  rtx newpat = gen_rtx_SET (VOIDmode,
-						    reg,
-						    gen_rtx_PLUS (GET_MODE (reg),
-						 		  reg,
-								  new_src));
-			  success
-			    = validate_change (next, &PATTERN (next),
-					       newpat, 0);
+			  struct full_rtx_costs oldcst, newcst;
+			  rtx tem = gen_rtx_PLUS (GET_MODE (reg), reg, new_src);
+
+			  get_full_rtx_cost (src3, SET, reg, &oldcst);
+			  get_full_rtx_cost (tem, SET, reg, &newcst);
+			  costs_add_n_insns (&oldcst, 1);
+
+			  if (costs_lt_p (&newcst, &oldcst, speed)
+			      && have_add2_insn (reg, new_src))
+			    {
+			      rtx newpat = gen_rtx_SET (VOIDmode, reg, tem);
+			      success
+				= validate_change (next, &PATTERN (next),
+						   newpat, 0);
+			    }
 			}
 		      if (success)
 			delete_insn (insn);
Index: rtlanal.c
===================================================================
--- rtlanal.c	(revision 162823)
+++ rtlanal.c	(working copy)
@@ -3534,7 +3534,7 @@ label_is_jump_target_p (const_rtx label,
    be returned.  */
 
 int
-rtx_cost (rtx x, enum rtx_code outer_code ATTRIBUTE_UNUSED, bool speed)
+rtx_cost2 (rtx x, enum rtx_code outer_code, rtx set_lhs, bool speed)
 {
   int i, j;
   enum rtx_code code;
@@ -3582,24 +3582,42 @@ rtx_cost (rtx x, enum rtx_code outer_cod
       break;
 
     default:
-      if (targetm.rtx_costs (x, code, outer_code, &total, speed))
+      if (targetm.rtx_costs2 (x, code, outer_code, set_lhs, &total, speed))
 	return total;
       break;
     }
 
+  /* X is itself a SET: cost its operands relative to X's own destination
+     rather than the SET_LHS the caller passed in.  */
+  if (code == SET)
+    set_lhs = SET_DEST (x);
+
   /* Sum the costs of the sub-rtx's, plus cost of this operation,
      which is already in total.  */
 
   fmt = GET_RTX_FORMAT (code);
   for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
     if (fmt[i] == 'e')
-      total += rtx_cost (XEXP (x, i), code, speed);
+      total += rtx_cost2 (XEXP (x, i), code, set_lhs, speed);
     else if (fmt[i] == 'E')
       for (j = 0; j < XVECLEN (x, i); j++)
-	total += rtx_cost (XVECEXP (x, i, j), code, speed);
+	total += rtx_cost2 (XVECEXP (x, i, j), code, set_lhs, speed);
 
   return total;
 }
+
+/* Store in *C the cost of X as computed by rtx_cost2, once with its SPEED
+   argument true (in C->speed) and once with it false (in C->size).  OUTER
+   and SET_LHS are passed on to rtx_cost2 unchanged.  */
+
+void
+get_full_rtx_cost (rtx x, enum rtx_code outer, rtx set_lhs,
+		   struct full_rtx_costs *c)
+{
+  c->speed = rtx_cost2 (x, outer, set_lhs, true);
+  c->size = rtx_cost2 (x, outer, set_lhs, false);
+}
+
 
 /* Return cost of address expression X.
    Expect that X is properly formed address reference.
@@ -3625,7 +3637,7 @@ address_cost (rtx x, enum machine_mode m
 int
 default_address_cost (rtx x, bool speed)
 {
-  return rtx_cost (x, MEM, speed);
+  return rtx_cost2 (x, MEM, NULL_RTX, speed);
 }
 
 
@@ -4653,7 +4665,7 @@ insn_rtx_cost (rtx pat, bool speed)
   else
     return 0;
 
-  cost = rtx_cost (SET_SRC (set), SET, speed);
+  cost = rtx_cost2 (SET_SRC (set), SET, SET_DEST (set), speed);
   return cost > 0 ? cost : COSTS_N_INSNS (1);
 }
 
Index: cfgloopanal.c
===================================================================
--- cfgloopanal.c	(revision 162823)
+++ cfgloopanal.c	(working copy)
@@ -314,7 +314,7 @@ seq_cost (const_rtx seq, bool speed)
     {
       set = single_set (seq);
       if (set)
-	cost += rtx_cost (set, SET, speed);
+	cost += rtx_cost2 (set, SET, SET_DEST (set), speed);
       else
 	cost++;
     }
Index: auto-inc-dec.c
===================================================================
--- auto-inc-dec.c	(revision 162823)
+++ auto-inc-dec.c	(working copy)
@@ -484,9 +484,9 @@ attempt_change (rtx new_addr, rtx inc_re
   PUT_MODE (mem_tmp, mode);
   XEXP (mem_tmp, 0) = new_addr;
 
-  old_cost = (rtx_cost (mem, SET, speed)
-	      + rtx_cost (PATTERN (inc_insn.insn), SET, speed));
-  new_cost = rtx_cost (mem_tmp, SET, speed);
+  old_cost = (rtx_cost2 (mem, SET, NULL_RTX, speed)
+	      + rtx_cost2 (PATTERN (inc_insn.insn), SET, NULL_RTX, speed));
+  new_cost = rtx_cost2 (mem_tmp, SET, NULL_RTX, speed);
 
   /* The first item of business is to see if this is profitable.  */
   if (old_cost < new_cost)
Index: testsuite/gcc.target/arm/headmerge-2.c
===================================================================
Index: testsuite/gcc.target/arm/headmerge-1.c
===================================================================
Index: testsuite/gcc.target/i386/headmerge-1.c
===================================================================
Index: testsuite/gcc.target/i386/headmerge-2.c
===================================================================
Index: dojump.c
===================================================================
--- dojump.c	(revision 162823)
+++ dojump.c	(working copy)
@@ -143,6 +143,7 @@ static GTY(()) rtx shift_test;
 static bool
 prefer_and_bit_test (enum machine_mode mode, int bitnum)
 {
+  bool speed = optimize_insn_for_speed_p ();
   if (and_test == 0)
     {
       /* Set up rtxes for the two variations.  Use NULL as a placeholder
@@ -167,8 +168,8 @@ prefer_and_bit_test (enum machine_mode m
 						 mode);
   XEXP (XEXP (shift_test, 0), 1) = GEN_INT (bitnum);
 
-  return (rtx_cost (and_test, IF_THEN_ELSE, optimize_insn_for_speed_p ())
-	  <= rtx_cost (shift_test, IF_THEN_ELSE, optimize_insn_for_speed_p ()));
+  return (rtx_cost2 (and_test, IF_THEN_ELSE, NULL_RTX, speed)
+	  <= rtx_cost2 (shift_test, IF_THEN_ELSE, NULL_RTX, speed));
 }
 
 /* Subroutine of do_jump, dealing with exploded comparisons of the type
Index: tree-ssa-loop-ivopts.c
===================================================================
--- tree-ssa-loop-ivopts.c	(revision 162823)
+++ tree-ssa-loop-ivopts.c	(working copy)
@@ -2726,7 +2726,7 @@ seq_cost (rtx seq, bool speed)
     {
       set = single_set (seq);
       if (set)
-	cost += rtx_cost (set, SET,speed);
+	cost += rtx_cost2 (set, SET, SET_DEST (set), speed);
       else
 	cost++;
     }
Index: cse.c
===================================================================
--- cse.c	(revision 162823)
+++ cse.c	(working copy)
@@ -468,7 +468,7 @@ struct table_elt
 /* Compute cost of X, as stored in the `cost' field of a table_elt.  Fixed
    hard registers and pointers into the frame are the cheapest with a cost
    of 0.  Next come pseudos with a cost of one and other hard registers with
-   a cost of 2.  Aside from these special cases, call `rtx_cost'.  */
+   a cost of 2.  Aside from these special cases, call `rtx_cost2'.  */
 
 #define CHEAP_REGNO(N)							\
   (REGNO_PTR_FRAME_P(N)							\
@@ -764,7 +764,7 @@ notreg_cost (rtx x, enum rtx_code outer)
 	   && TRULY_NOOP_TRUNCATION (GET_MODE_BITSIZE (GET_MODE (x)),
 				     GET_MODE_BITSIZE (GET_MODE (SUBREG_REG (x)))))
 	  ? 0
-	  : rtx_cost (x, outer, optimize_this_for_speed_p) * 2);
+	  : rtx_cost2 (x, outer, NULL_RTX, optimize_this_for_speed_p) * 2);
 }
 
 
Index: ifcvt.c
===================================================================
--- ifcvt.c	(revision 162823)
+++ ifcvt.c	(working copy)
@@ -2027,7 +2027,8 @@ noce_try_sign_mask (struct noce_if_info 
      && (if_info->insn_b == NULL_RTX
 	 || BLOCK_FOR_INSN (if_info->insn_b) == if_info->test_bb));
   if (!(t_unconditional
-	|| (rtx_cost (t, SET, optimize_bb_for_speed_p (if_info->test_bb))
+	|| (rtx_cost2 (t, SET, NULL_RTX,
+		       optimize_bb_for_speed_p (if_info->test_bb))
 	    < COSTS_N_INSNS (2))))
     return FALSE;
 
Index: expr.c
===================================================================
--- expr.c	(revision 162823)
+++ expr.c	(working copy)
@@ -3451,9 +3451,9 @@ compress_float_constant (rtx x, rtx y)
   REAL_VALUE_FROM_CONST_DOUBLE (r, y);
 
   if (LEGITIMATE_CONSTANT_P (y))
-    oldcost = rtx_cost (y, SET, speed);
+    oldcost = rtx_cost2 (y, SET, NULL_RTX, speed);
   else
-    oldcost = rtx_cost (force_const_mem (dstmode, y), SET, speed);
+    oldcost = rtx_cost2 (force_const_mem (dstmode, y), SET, NULL_RTX, speed);
 
   for (srcmode = GET_CLASS_NARROWEST_MODE (GET_MODE_CLASS (orig_srcmode));
        srcmode != orig_srcmode;
@@ -3480,7 +3480,8 @@ compress_float_constant (rtx x, rtx y)
 	  if (! (*insn_data[ic].operand[1].predicate) (trunc_y, srcmode))
 	    continue;
 	  /* This is valid, but may not be cheaper than the original. */
-	  newcost = rtx_cost (gen_rtx_FLOAT_EXTEND (dstmode, trunc_y), SET, speed);
+	  newcost = rtx_cost2 (gen_rtx_FLOAT_EXTEND (dstmode, trunc_y), SET,
+			       NULL_RTX, speed);
 	  if (oldcost < newcost)
 	    continue;
 	}
@@ -3488,7 +3489,8 @@ compress_float_constant (rtx x, rtx y)
 	{
 	  trunc_y = force_const_mem (srcmode, trunc_y);
 	  /* This is valid, but may not be cheaper than the original. */
-	  newcost = rtx_cost (gen_rtx_FLOAT_EXTEND (dstmode, trunc_y), SET, speed);
+	  newcost = rtx_cost2 (gen_rtx_FLOAT_EXTEND (dstmode, trunc_y), SET,
+			       NULL_RTX, speed);
 	  if (oldcost < newcost)
 	    continue;
 	  trunc_y = validize_mem (trunc_y);
Index: dse.c
===================================================================
--- dse.c	(revision 162823)
+++ dse.c	(working copy)
@@ -1705,7 +1705,8 @@ find_shift_sequence (int access_size,
 		  byte = subreg_lowpart_offset (read_mode, new_mode);
 		  ret = simplify_subreg (read_mode, ret, new_mode, byte);
 		  if (ret && CONSTANT_P (ret)
-		      && rtx_cost (ret, SET, speed) <= COSTS_N_INSNS (1))
+		      && (rtx_cost2 (ret, SET, NULL_RTX, speed)
+			  <= COSTS_N_INSNS (1)))
 		    return ret;
 		}
 	    }
Index: gcse.c
===================================================================
--- gcse.c	(revision 162823)
+++ gcse.c	(working copy)
@@ -826,7 +826,7 @@ want_to_gcse_p (rtx x, int *max_distance
 
 	  gcc_assert (!optimize_function_for_speed_p (cfun)
 		      && optimize_function_for_size_p (cfun));
-	  cost = rtx_cost (x, SET, 0);
+	  cost = rtx_cost2 (x, SET, NULL_RTX, 0);
 
 	  if (cost < COSTS_N_INSNS (GCSE_UNRESTRICTED_COST))
 	    {
Index: calls.c
===================================================================
--- calls.c	(revision 162823)
+++ calls.c	(working copy)
@@ -713,8 +713,9 @@ precompute_register_parameters (int num_
 		     || (GET_CODE (args[i].value) == SUBREG
 			 && REG_P (SUBREG_REG (args[i].value)))))
 		 && args[i].mode != BLKmode
-		 && rtx_cost (args[i].value, SET, optimize_insn_for_speed_p ())
-		    > COSTS_N_INSNS (1)
+		 && (rtx_cost2 (args[i].value, SET, NULL_RTX,
+				optimize_insn_for_speed_p ())
+		     > COSTS_N_INSNS (1))
 		 && ((*reg_parm_seen
 		      && targetm.small_register_classes_for_mode_p (args[i].mode))
 		     || optimize))
Index: expmed.c
===================================================================
--- expmed.c	(revision 162823)
+++ expmed.c	(working copy)
@@ -190,7 +190,7 @@ init_expmed (void)
   for (speed = 0; speed < 2; speed++)
     {
       crtl->maybe_hot_insn_p = speed;
-      zero_cost[speed] = rtx_cost (const0_rtx, SET, speed);
+      zero_cost[speed] = rtx_cost2 (const0_rtx, SET, NULL_RTX, speed);
 
       for (mode = GET_CLASS_NARROWEST_MODE (MODE_INT);
 	   mode != VOIDmode;
@@ -211,15 +211,15 @@ init_expmed (void)
 	  PUT_MODE (&all.shift_sub0, mode);
 	  PUT_MODE (&all.shift_sub1, mode);
 
-	  add_cost[speed][mode] = rtx_cost (&all.plus, SET, speed);
-	  neg_cost[speed][mode] = rtx_cost (&all.neg, SET, speed);
-	  mul_cost[speed][mode] = rtx_cost (&all.mult, SET, speed);
-	  sdiv_cost[speed][mode] = rtx_cost (&all.sdiv, SET, speed);
-	  udiv_cost[speed][mode] = rtx_cost (&all.udiv, SET, speed);
+	  add_cost[speed][mode] = rtx_cost2 (&all.plus, SET, NULL_RTX, speed);
+	  neg_cost[speed][mode] = rtx_cost2 (&all.neg, SET, NULL_RTX, speed);
+	  mul_cost[speed][mode] = rtx_cost2 (&all.mult, SET, NULL_RTX, speed);
+	  sdiv_cost[speed][mode] = rtx_cost2 (&all.sdiv, SET, NULL_RTX, speed);
+	  udiv_cost[speed][mode] = rtx_cost2 (&all.udiv, SET, NULL_RTX, speed);
 
-	  sdiv_pow2_cheap[speed][mode] = (rtx_cost (&all.sdiv_32, SET, speed)
+	  sdiv_pow2_cheap[speed][mode] = (rtx_cost2 (&all.sdiv_32, SET, NULL_RTX, speed)
 				          <= 2 * add_cost[speed][mode]);
-	  smod_pow2_cheap[speed][mode] = (rtx_cost (&all.smod_32, SET, speed)
+	  smod_pow2_cheap[speed][mode] = (rtx_cost2 (&all.smod_32, SET, NULL_RTX, speed)
 				          <= 4 * add_cost[speed][mode]);
 
 	  wider_mode = GET_MODE_WIDER_MODE (mode);
@@ -231,9 +231,9 @@ init_expmed (void)
 	      XEXP (&all.wide_lshr, 1) = GEN_INT (GET_MODE_BITSIZE (mode));
 
 	      mul_widen_cost[speed][wider_mode]
-	        = rtx_cost (&all.wide_mult, SET, speed);
+	        = rtx_cost2 (&all.wide_mult, SET, NULL_RTX, speed);
 	      mul_highpart_cost[speed][mode]
-	        = rtx_cost (&all.wide_trunc, SET, speed);
+	        = rtx_cost2 (&all.wide_trunc, SET, NULL_RTX, speed);
 	    }
 
 	  shift_cost[speed][mode][0] = 0;
@@ -246,10 +246,10 @@ init_expmed (void)
 	      XEXP (&all.shift, 1) = cint[m];
 	      XEXP (&all.shift_mult, 1) = pow2[m];
 
-	      shift_cost[speed][mode][m] = rtx_cost (&all.shift, SET, speed);
-	      shiftadd_cost[speed][mode][m] = rtx_cost (&all.shift_add, SET, speed);
-	      shiftsub0_cost[speed][mode][m] = rtx_cost (&all.shift_sub0, SET, speed);
-	      shiftsub1_cost[speed][mode][m] = rtx_cost (&all.shift_sub1, SET, speed);
+	      shift_cost[speed][mode][m] = rtx_cost2 (&all.shift, SET, NULL_RTX, speed);
+	      shiftadd_cost[speed][mode][m] = rtx_cost2 (&all.shift_add, SET, NULL_RTX, speed);
+	      shiftsub0_cost[speed][mode][m] = rtx_cost2 (&all.shift_sub0, SET, NULL_RTX, speed);
+	      shiftsub1_cost[speed][mode][m] = rtx_cost2 (&all.shift_sub1, SET, NULL_RTX, speed);
 	    }
 	}
     }
@@ -3065,7 +3065,7 @@ expand_mult (enum machine_mode mode, rtx
 		 result is interpreted as an unsigned coefficient.
 		 Exclude cost of op0 from max_cost to match the cost
 		 calculation of the synth_mult.  */
-	      max_cost = rtx_cost (gen_rtx_MULT (mode, fake_reg, op1), SET, speed)
+	      max_cost = rtx_cost2 (gen_rtx_MULT (mode, fake_reg, op1), SET, NULL_RTX, speed)
 			 - neg_cost[speed][mode];
 	      if (max_cost > 0
 		  && choose_mult_variant (mode, -INTVAL (op1), &algorithm,
@@ -3111,7 +3111,7 @@ expand_mult (enum machine_mode mode, rtx
 
 	  /* Exclude cost of op0 from max_cost to match the cost
 	     calculation of the synth_mult.  */
-	  max_cost = rtx_cost (gen_rtx_MULT (mode, fake_reg, op1), SET, speed);
+	  max_cost = rtx_cost2 (gen_rtx_MULT (mode, fake_reg, op1), SET, NULL_RTX, speed);
 	  if (choose_mult_variant (mode, coeff, &algorithm, &variant,
 				   max_cost))
 	    return expand_mult_const (mode, op0, coeff, target,
@@ -3598,7 +3598,8 @@ expand_smod_pow2 (enum machine_mode mode
 
 	  temp = gen_rtx_LSHIFTRT (mode, result, shift);
 	  if (optab_handler (lshr_optab, mode) == CODE_FOR_nothing
-	      || rtx_cost (temp, SET, optimize_insn_for_speed_p ()) > COSTS_N_INSNS (2))
+	      || (rtx_cost2 (temp, SET, NULL_RTX, optimize_insn_for_speed_p ())
+		  > COSTS_N_INSNS (2)))
 	    {
 	      temp = expand_binop (mode, xor_optab, op0, signmask,
 				   NULL_RTX, 1, OPTAB_LIB_WIDEN);
@@ -5447,8 +5448,8 @@ emit_store_flag (rtx target, enum rtx_co
 
 	  /* For the reverse comparison, use either an addition or a XOR.  */
           if (want_add
-	      && rtx_cost (GEN_INT (normalizep), PLUS,
-			   optimize_insn_for_speed_p ()) == 0)
+	      && rtx_cost2 (GEN_INT (normalizep), PLUS, NULL_RTX,
+			    optimize_insn_for_speed_p ()) == 0)
 	    {
 	      tem = emit_store_flag_1 (subtarget, rcode, op0, op1, mode, 0,
 				       STORE_FLAG_VALUE, target_mode);
@@ -5458,8 +5459,8 @@ emit_store_flag (rtx target, enum rtx_co
 				     target, 0, OPTAB_WIDEN);
 	    }
           else if (!want_add
-	           && rtx_cost (trueval, XOR,
-			        optimize_insn_for_speed_p ()) == 0)
+	           && rtx_cost2 (trueval, XOR, NULL_RTX,
+				 optimize_insn_for_speed_p ()) == 0)
 	    {
 	      tem = emit_store_flag_1 (subtarget, rcode, op0, op1, mode, 0,
 				       normalizep, target_mode);
@@ -5551,8 +5552,8 @@ emit_store_flag (rtx target, enum rtx_co
 
       /* Again, for the reverse comparison, use either an addition or a XOR.  */
       if (want_add
-	  && rtx_cost (GEN_INT (normalizep), PLUS,
-		       optimize_insn_for_speed_p ()) == 0)
+	  && rtx_cost2 (GEN_INT (normalizep), PLUS, NULL_RTX,
+			optimize_insn_for_speed_p ()) == 0)
 	{
 	  tem = emit_store_flag_1 (subtarget, rcode, op0, op1, mode, 0,
 				   STORE_FLAG_VALUE, target_mode);
@@ -5561,8 +5562,8 @@ emit_store_flag (rtx target, enum rtx_co
 				GEN_INT (normalizep), target, 0, OPTAB_WIDEN);
 	}
       else if (!want_add
-	       && rtx_cost (trueval, XOR,
-			    optimize_insn_for_speed_p ()) == 0)
+	       && rtx_cost2 (trueval, XOR, NULL_RTX,
+			     optimize_insn_for_speed_p ()) == 0)
 	{
 	  tem = emit_store_flag_1 (subtarget, rcode, op0, op1, mode, 0,
 				   normalizep, target_mode);
Index: simplify-rtx.c
===================================================================
--- simplify-rtx.c	(revision 162823)
+++ simplify-rtx.c	(working copy)
@@ -1829,8 +1829,9 @@ simplify_binary_operation_1 (enum rtx_co
 	      coeff = immed_double_int_const (val, mode);
 
 	      tem = simplify_gen_binary (MULT, mode, lhs, coeff);
-	      return rtx_cost (tem, SET, speed) <= rtx_cost (orig, SET, speed)
-		? tem : 0;
+	      return (rtx_cost2 (tem, SET, NULL_RTX, speed)
+		      <= rtx_cost2 (orig, SET, NULL_RTX, speed)
+		      ? tem : 0);
 	    }
 	}
 
@@ -2010,8 +2011,9 @@ simplify_binary_operation_1 (enum rtx_co
 	      coeff = immed_double_int_const (val, mode);
 
 	      tem = simplify_gen_binary (MULT, mode, lhs, coeff);
-	      return rtx_cost (tem, SET, speed) <= rtx_cost (orig, SET, speed)
-		? tem : 0;
+	      return (rtx_cost2 (tem, SET, NULL_RTX, speed)
+		      <= rtx_cost2 (orig, SET, NULL_RTX, speed)
+		      ? tem : 0);
 	    }
 	}
 
Index: loop-invariant.c
===================================================================
--- loop-invariant.c	(revision 162823)
+++ loop-invariant.c	(working copy)
@@ -704,7 +704,7 @@ create_new_invariant (struct def *def, r
      the loop.  Otherwise we save only cost of the computation.  */
   if (def)
     {
-      inv->cost = rtx_cost (set, SET, speed);
+      inv->cost = rtx_cost2 (set, SET, NULL_RTX, speed);
       /* ??? Try to determine cheapness of address computation.  Unfortunately
          the address cost is only a relative measure, we can't really compare
 	 it with any absolute number, but only with other address costs.
@@ -719,7 +719,7 @@ create_new_invariant (struct def *def, r
     }
   else
     {
-      inv->cost = rtx_cost (SET_SRC (set), SET, speed);
+      inv->cost = rtx_cost2 (SET_SRC (set), SET, SET_DEST (set), speed);
       inv->cheap_address = false;
     }
 
Index: rtl.h
===================================================================
--- rtl.h	(revision 162823)
+++ rtl.h	(working copy)
@@ -1119,9 +1119,76 @@ rhs_regno (const_rtx x)
    not to use an rtx with this cost under any circumstances.  */
 #define MAX_COST INT_MAX
 
+/* A structure to hold all available cost information about an rtl
+   expression.  */
+struct full_rtx_costs
+{
+  int speed;
+  int size;
+};
+
+/* Initialize both costs in C to MAX_COST.  */
+static inline void
+init_costs_to_max (struct full_rtx_costs *c)
+{
+  c->speed = MAX_COST;
+  c->size = MAX_COST;
+}
+
+/* Initialize both costs in C to zero.  */
+static inline void
+init_costs_to_zero (struct full_rtx_costs *c)
+{
+  c->speed = 0;
+  c->size = 0;
+}
+
+/* Return true if the costs in A are strictly lower than those in B.
+   The cost selected by SPEED is compared first; the other cost only
+   breaks ties.  */
+static inline bool
+costs_lt_p (const struct full_rtx_costs *a, const struct full_rtx_costs *b,
+	    bool speed)
+{
+  if (speed)
+    return (a->speed < b->speed
+	    || (a->speed == b->speed && a->size < b->size));
+  else
+    return (a->size < b->size
+	    || (a->size == b->size && a->speed < b->speed));
+}
+
+/* Like costs_lt_p, but also return true if both costs in A equal
+   those in B.  */
+static inline bool
+costs_le_p (const struct full_rtx_costs *a, const struct full_rtx_costs *b,
+	    bool speed)
+{
+  if (speed)
+    return (a->speed < b->speed
+	    || (a->speed == b->speed && a->size <= b->size));
+  else
+    return (a->size < b->size
+	    || (a->size == b->size && a->speed <= b->speed));
+}
+
+/* Add COSTS_N_INSNS (N) to both the speed and the size cost in C.
+   The addition is unchecked, so C should hold finite costs rather
+   than MAX_COST.  */
+static inline void
+costs_add_n_insns (struct full_rtx_costs *c, int n)
+{
+  c->speed += COSTS_N_INSNS (n);
+  c->size += COSTS_N_INSNS (n);
+}
+
 extern void init_rtlanal (void);
-extern int rtx_cost (rtx, enum rtx_code, bool);
+extern int rtx_cost2 (rtx, enum rtx_code, rtx, bool);
+/* Equivalent to rtx_cost2 with NULL_RTX as the third argument, for
+   callers that have no SET destination to pass.  */
+#define rtx_cost(a,b,c) rtx_cost2 (a, b, NULL_RTX, c)
 extern int address_cost (rtx, enum machine_mode, addr_space_t, bool);
+extern void get_full_rtx_cost (rtx, enum rtx_code, rtx, struct full_rtx_costs *);
 extern unsigned int subreg_lsb (const_rtx);
 extern unsigned int subreg_lsb_1 (enum machine_mode, enum machine_mode,
 				  unsigned int);
Index: combine.c
===================================================================
--- combine.c	(revision 162823)
+++ combine.c	(working copy)
@@ -6351,11 +6351,11 @@ expand_compound_operation (rtx x)
       rtx temp2 = expand_compound_operation (temp);
 
       /* Make sure this is a profitable operation.  */
-      if (rtx_cost (x, SET, optimize_this_for_speed_p)
-          > rtx_cost (temp2, SET, optimize_this_for_speed_p))
+      if (rtx_cost2 (x, SET, NULL_RTX, optimize_this_for_speed_p)
+          > rtx_cost2 (temp2, SET, NULL_RTX, optimize_this_for_speed_p))
        return temp2;
-      else if (rtx_cost (x, SET, optimize_this_for_speed_p)
-               > rtx_cost (temp, SET, optimize_this_for_speed_p))
+      else if (rtx_cost2 (x, SET, NULL_RTX, optimize_this_for_speed_p)
+               > rtx_cost2 (temp, SET, NULL_RTX, optimize_this_for_speed_p))
        return temp;
       else
        return x;
@@ -6782,8 +6782,8 @@ make_extraction (enum machine_mode mode,
 
 	  /* Prefer ZERO_EXTENSION, since it gives more information to
 	     backends.  */
-	  if (rtx_cost (temp, SET, optimize_this_for_speed_p)
-	      <= rtx_cost (temp1, SET, optimize_this_for_speed_p))
+	  if (rtx_cost2 (temp, SET, NULL_RTX, optimize_this_for_speed_p)
+	      <= rtx_cost2 (temp1, SET, NULL_RTX, optimize_this_for_speed_p))
 	    return temp;
 	  return temp1;
 	}
@@ -6984,8 +6984,8 @@ make_extraction (enum machine_mode mode,
 
 	  /* Prefer ZERO_EXTENSION, since it gives more information to
 	     backends.  */
-	  if (rtx_cost (temp1, SET, optimize_this_for_speed_p)
-	      < rtx_cost (temp, SET, optimize_this_for_speed_p))
+	  if (rtx_cost2 (temp1, SET, NULL_RTX, optimize_this_for_speed_p)
+	      < rtx_cost2 (temp, SET, NULL_RTX, optimize_this_for_speed_p))
 	    temp = temp1;
 	}
       pos_rtx = temp;
@@ -7666,8 +7666,8 @@ force_to_mode (rtx x, enum machine_mode 
 
 	      y = simplify_gen_binary (AND, GET_MODE (x),
 				       XEXP (x, 0), GEN_INT (cval));
-	      if (rtx_cost (y, SET, optimize_this_for_speed_p)
-	          < rtx_cost (x, SET, optimize_this_for_speed_p))
+	      if (rtx_cost2 (y, SET, NULL_RTX, optimize_this_for_speed_p)
+	          < rtx_cost2 (x, SET, NULL_RTX, optimize_this_for_speed_p))
 		x = y;
 	    }
 
@@ -8825,8 +8825,8 @@ distribute_and_simplify_rtx (rtx x, int 
   tmp = apply_distributive_law (simplify_gen_binary (inner_code, mode,
 						     new_op0, new_op1));
   if (GET_CODE (tmp) != outer_code
-      && rtx_cost (tmp, SET, optimize_this_for_speed_p)
-         < rtx_cost (x, SET, optimize_this_for_speed_p))
+      && (rtx_cost2 (tmp, SET, NULL_RTX, optimize_this_for_speed_p)
+	  < rtx_cost2 (x, SET, NULL_RTX, optimize_this_for_speed_p)))
     return tmp;
 
   return NULL_RTX;
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 162823)
+++ config/i386/i386.c	(working copy)
@@ -26470,7 +26470,8 @@ ix86_modes_tieable_p (enum machine_mode 
    scanned.  In either case, *TOTAL contains the cost result.  */
 
 static bool
-ix86_rtx_costs (rtx x, int code, int outer_code_i, int *total, bool speed)
+ix86_rtx_costs2 (rtx x, int code, int outer_code_i, rtx set_lhs, int *total,
+		 bool speed)
 {
   enum rtx_code outer_code = (enum rtx_code) outer_code_i;
   enum machine_mode mode = GET_MODE (x);
@@ -26642,7 +26643,8 @@ ix86_rtx_costs (rtx x, int code, int out
 
   	  *total = (cost->mult_init[MODE_INDEX (mode)]
 		    + nbits * cost->mult_bit
-	            + rtx_cost (op0, outer_code, speed) + rtx_cost (op1, outer_code, speed));
+	            + rtx_cost2 (op0, outer_code, set_lhs, speed)
+		    + rtx_cost2 (op1, outer_code, set_lhs, speed));
 
           return true;
 	}
@@ -26676,10 +26678,11 @@ ix86_rtx_costs (rtx x, int code, int out
 	      if (val == 2 || val == 4 || val == 8)
 		{
 		  *total = cost->lea;
-		  *total += rtx_cost (XEXP (XEXP (x, 0), 1), outer_code, speed);
-		  *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
-				      outer_code, speed);
-		  *total += rtx_cost (XEXP (x, 1), outer_code, speed);
+		  *total += rtx_cost2 (XEXP (XEXP (x, 0), 1), outer_code,
+				       set_lhs, speed);
+		  *total += rtx_cost2 (XEXP (XEXP (XEXP (x, 0), 0), 0),
+				       outer_code, set_lhs, speed);
+		  *total += rtx_cost2 (XEXP (x, 1), outer_code, set_lhs, speed);
 		  return true;
 		}
 	    }
@@ -26690,17 +26693,20 @@ ix86_rtx_costs (rtx x, int code, int out
 	      if (val == 2 || val == 4 || val == 8)
 		{
 		  *total = cost->lea;
-		  *total += rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, speed);
-		  *total += rtx_cost (XEXP (x, 1), outer_code, speed);
+		  *total += rtx_cost2 (XEXP (XEXP (x, 0), 0), outer_code,
+				       set_lhs, speed);
+		  *total += rtx_cost2 (XEXP (x, 1), outer_code, set_lhs, speed);
 		  return true;
 		}
 	    }
 	  else if (GET_CODE (XEXP (x, 0)) == PLUS)
 	    {
 	      *total = cost->lea;
-	      *total += rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, speed);
-	      *total += rtx_cost (XEXP (XEXP (x, 0), 1), outer_code, speed);
-	      *total += rtx_cost (XEXP (x, 1), outer_code, speed);
+	      *total += rtx_cost2 (XEXP (XEXP (x, 0), 0), outer_code,
+				   set_lhs, speed);
+	      *total += rtx_cost2 (XEXP (XEXP (x, 0), 1), outer_code,
+				   set_lhs, speed);
+	      *total += rtx_cost2 (XEXP (x, 1), outer_code, set_lhs, speed);
 	      return true;
 	    }
 	}
@@ -26732,9 +26738,9 @@ ix86_rtx_costs (rtx x, int code, int out
       if (!TARGET_64BIT && mode == DImode)
 	{
 	  *total = (cost->add * 2
-		    + (rtx_cost (XEXP (x, 0), outer_code, speed)
+		    + (rtx_cost2 (XEXP (x, 0), outer_code, set_lhs, speed)
 		       << (GET_MODE (XEXP (x, 0)) != DImode))
-		    + (rtx_cost (XEXP (x, 1), outer_code, speed)
+		    + (rtx_cost2 (XEXP (x, 1), outer_code, set_lhs, speed)
 	               << (GET_MODE (XEXP (x, 1)) != DImode)));
 	  return true;
 	}
@@ -26776,8 +26782,9 @@ ix86_rtx_costs (rtx x, int code, int out
 	  /* This kind of construct is implemented using test[bwl].
 	     Treat it as if we had an AND.  */
 	  *total = (cost->add
-		    + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, speed)
-		    + rtx_cost (const1_rtx, outer_code, speed));
+		    + rtx_cost2 (XEXP (XEXP (x, 0), 0), outer_code, set_lhs,
+				 speed)
+		    + rtx_cost2 (const1_rtx, outer_code, set_lhs, speed));
 	  return true;
 	}
       return false;
@@ -31468,8 +31475,8 @@ ix86_enum_va_list (int idx, const char *
 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
 #undef TARGET_MEMORY_MOVE_COST
 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
-#undef TARGET_RTX_COSTS
-#define TARGET_RTX_COSTS ix86_rtx_costs
+#undef TARGET_RTX_COSTS2
+#define TARGET_RTX_COSTS2 ix86_rtx_costs2
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST ix86_address_cost
 
Index: config/arm/arm.c
===================================================================
--- config/arm/arm.c	(revision 162823)
+++ config/arm/arm.c	(working copy)
@@ -143,13 +143,13 @@ static void arm_internal_label (FILE *, 
 static void arm_output_mi_thunk (FILE *, tree, HOST_WIDE_INT, HOST_WIDE_INT,
 				 tree);
 static bool arm_have_conditional_execution (void);
-static bool arm_rtx_costs_1 (rtx, enum rtx_code, int*, bool);
-static bool arm_size_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *);
-static bool arm_slowmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
-static bool arm_fastmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
-static bool arm_xscale_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
-static bool arm_9e_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
-static bool arm_rtx_costs (rtx, int, int, int *, bool);
+static bool arm_rtx_costs_1 (rtx, enum rtx_code, rtx, int *, bool);
+static bool arm_size_rtx_costs (rtx, enum rtx_code, enum rtx_code, rtx, int *);
+static bool arm_slowmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, rtx, int *, bool);
+static bool arm_fastmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, rtx, int *, bool);
+static bool arm_xscale_rtx_costs (rtx, enum rtx_code, enum rtx_code, rtx, int *, bool);
+static bool arm_9e_rtx_costs (rtx, enum rtx_code, enum rtx_code, rtx, int *, bool);
+static bool arm_rtx_costs2 (rtx, int, int, rtx, int *, bool);
 static int arm_address_cost (rtx, bool);
 static bool arm_memory_load_p (rtx);
 static bool arm_cirrus_insn_p (rtx);
@@ -346,8 +346,8 @@ static const struct attribute_spec arm_a
 #undef  TARGET_ASM_CAN_OUTPUT_MI_THUNK
 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK default_can_output_mi_thunk_no_vcall
 
-#undef  TARGET_RTX_COSTS
-#define TARGET_RTX_COSTS arm_rtx_costs
+#undef  TARGET_RTX_COSTS2
+#define TARGET_RTX_COSTS2 arm_rtx_costs2
 #undef  TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST arm_address_cost
 
@@ -6305,7 +6305,8 @@ arm_cannot_force_const_mem (rtx x)
    (GET_CODE (X) == REG ? (X) : SUBREG_REG (X))
 
 static inline int
-thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
+thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer,
+		  rtx set_lhs)
 {
   enum machine_mode mode = GET_MODE (x);
   int total;
@@ -6361,11 +6362,12 @@ thumb1_rtx_costs (rtx x, enum rtx_code c
       else if (outer == AND)
 	{
 	  int i;
-	  /* This duplicates the tests in the andsi3 expander.  */
+	  /* This duplicates the tests in the andsi3 expander.  Using shifts
+	     takes one extra insn beyond the cost of the AND.  */
 	  for (i = 9; i <= 31; i++)
 	    if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x)
 		|| (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x))
-	      return COSTS_N_INSNS (2);
+	      return COSTS_N_INSNS (1);
 	}
       else if (outer == ASHIFT || outer == ASHIFTRT
 	       || outer == LSHIFTRT)
@@ -6410,7 +6412,8 @@ thumb1_rtx_costs (rtx x, enum rtx_code c
     case SIGN_EXTEND:
     case ZERO_EXTEND:
       total = mode == DImode ? COSTS_N_INSNS (1) : 0;
-      total += thumb1_rtx_costs (XEXP (x, 0), GET_CODE (XEXP (x, 0)), code);
+      total += thumb1_rtx_costs (XEXP (x, 0), GET_CODE (XEXP (x, 0)), code,
+				 set_lhs);
 
       if (mode == SImode)
 	return total;
@@ -6428,7 +6431,7 @@ thumb1_rtx_costs (rtx x, enum rtx_code c
 }
 
 static inline bool
-arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
+arm_rtx_costs_1 (rtx x, enum rtx_code outer, rtx set_lhs, int *total, bool speed)
 {
   enum machine_mode mode = GET_MODE (x);
   enum rtx_code subcode;
@@ -6460,7 +6463,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
       if (GET_CODE (XEXP (x, 1)) == REG)
 	*total = COSTS_N_INSNS (1); /* Need to subtract from 32 */
       else if (GET_CODE (XEXP (x, 1)) != CONST_INT)
-	*total = rtx_cost (XEXP (x, 1), code, speed);
+	*total = rtx_cost2 (XEXP (x, 1), code, set_lhs, speed);
 
       /* Fall through */
     case ROTATERT:
@@ -6472,7 +6475,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
 
       /* Fall through */
     case ASHIFT: case LSHIFTRT: case ASHIFTRT:
-      *total += rtx_cost (XEXP (x, 0), code, speed);
+      *total += rtx_cost2 (XEXP (x, 0), code, set_lhs, speed);
       if (mode == DImode)
 	{
 	  *total += COSTS_N_INSNS (3);
@@ -6495,14 +6498,14 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
 	  if (GET_CODE (XEXP (x, 0)) == CONST_INT
 	      && const_ok_for_arm (INTVAL (XEXP (x, 0))))
 	    {
-	      *total += rtx_cost (XEXP (x, 1), code, speed);
+	      *total += rtx_cost2 (XEXP (x, 1), code, set_lhs, speed);
 	      return true;
 	    }
 
 	  if (GET_CODE (XEXP (x, 1)) == CONST_INT
 	      && const_ok_for_arm (INTVAL (XEXP (x, 1))))
 	    {
-	      *total += rtx_cost (XEXP (x, 0), code, speed);
+	      *total += rtx_cost2 (XEXP (x, 0), code, set_lhs, speed);
 	      return true;
 	    }
 
@@ -6519,14 +6522,14 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
 	      if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
 		  && arm_const_double_rtx (XEXP (x, 0)))
 		{
-		  *total += rtx_cost (XEXP (x, 1), code, speed);
+		  *total += rtx_cost2 (XEXP (x, 1), code, set_lhs, speed);
 		  return true;
 		}
 
 	      if (GET_CODE (XEXP (x, 1)) == CONST_DOUBLE
 		  && arm_const_double_rtx (XEXP (x, 1)))
 		{
-		  *total += rtx_cost (XEXP (x, 0), code, speed);
+		  *total += rtx_cost2 (XEXP (x, 0), code, set_lhs, speed);
 		  return true;
 		}
 
@@ -6540,7 +6543,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
       if (GET_CODE (XEXP (x, 0)) == CONST_INT
 	  && const_ok_for_arm (INTVAL (XEXP (x, 0))))
 	{
-	  *total += rtx_cost (XEXP (x, 1), code, speed);
+	  *total += rtx_cost2 (XEXP (x, 1), code, set_lhs, speed);
 	  return true;
 	}
 
@@ -6549,8 +6552,8 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
 	  || subcode == LSHIFTRT
 	  || subcode == ROTATE || subcode == ROTATERT)
 	{
-	  *total += rtx_cost (XEXP (x, 0), code, speed);
-	  *total += rtx_cost (XEXP (XEXP (x, 1), 0), subcode, speed);
+	  *total += rtx_cost2 (XEXP (x, 0), code, set_lhs, speed);
+	  *total += rtx_cost2 (XEXP (XEXP (x, 1), 0), subcode, set_lhs, speed);
 	  return true;
 	}
 
@@ -6558,23 +6561,24 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
       if (GET_CODE (XEXP (x, 0)) == MULT
 	  && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
 	{
-	  *total += rtx_cost (XEXP (XEXP (x, 0), 0), code, speed);
-	  *total += rtx_cost (XEXP (x, 1), code, speed);
+	  *total += rtx_cost2 (XEXP (XEXP (x, 0), 0), code, set_lhs, speed);
+	  *total += rtx_cost2 (XEXP (x, 1), code, set_lhs, speed);
 	  return true;
 	}
 
       if (subcode == MULT
 	  && power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode))
 	{
-	  *total += rtx_cost (XEXP (x, 0), code, speed);
-	  *total += rtx_cost (XEXP (XEXP (x, 1), 0), subcode, speed);
+	  *total += rtx_cost2 (XEXP (x, 0), code, set_lhs, speed);
+	  *total += rtx_cost2 (XEXP (XEXP (x, 1), 0), subcode, set_lhs, speed);
 	  return true;
 	}
 
       if (GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMPARE
 	  || GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMM_COMPARE)
 	{
-	  *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, speed);
+	  *total = COSTS_N_INSNS (1) + rtx_cost2 (XEXP (x, 0), code, set_lhs,
+						  speed);
 	  if (GET_CODE (XEXP (XEXP (x, 1), 0)) == REG
 	      && REGNO (XEXP (XEXP (x, 1), 0)) != CC_REGNUM)
 	    *total += COSTS_N_INSNS (1);
@@ -6590,9 +6594,9 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
 	      || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND))
 	{
 	  *total = COSTS_N_INSNS (1);
-	  *total += rtx_cost (XEXP (XEXP (x, 0), 0), GET_CODE (XEXP (x, 0)),
-			      speed);
-	  *total += rtx_cost (XEXP (x, 1), code, speed);
+	  *total += rtx_cost2 (XEXP (XEXP (x, 0), 0), GET_CODE (XEXP (x, 0)),
+			       NULL_RTX, speed);
+	  *total += rtx_cost2 (XEXP (x, 1), code, set_lhs, speed);
 	  return true;
 	}
 
@@ -6616,7 +6620,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
 	      if (GET_CODE (XEXP (x, 1)) == CONST_DOUBLE
 		  && arm_const_double_rtx (XEXP (x, 1)))
 		{
-		  *total += rtx_cost (XEXP (x, 0), code, speed);
+		  *total += rtx_cost2 (XEXP (x, 0), code, set_lhs, speed);
 		  return true;
 		}
 
@@ -6630,7 +6634,8 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
       if (GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMPARE
 	  || GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMM_COMPARE)
 	{
-	  *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 1), code, speed);
+	  *total = COSTS_N_INSNS (1) + rtx_cost2 (XEXP (x, 1), code, set_lhs,
+						  speed);
 	  if (GET_CODE (XEXP (XEXP (x, 0), 0)) == REG
 	      && REGNO (XEXP (XEXP (x, 0), 0)) != CC_REGNUM)
 	    *total += COSTS_N_INSNS (1);
@@ -6657,7 +6662,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
 	  if (GET_CODE (XEXP (x, 1)) == CONST_INT
 	      && const_ok_for_op (INTVAL (XEXP (x, 1)), code))
 	    {
-	      *total += rtx_cost (XEXP (x, 0), code, speed);
+	      *total += rtx_cost2 (XEXP (x, 0), code, set_lhs, speed);
 	      return true;
 	    }
 
@@ -6668,7 +6673,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
       if (GET_CODE (XEXP (x, 1)) == CONST_INT
 	  && const_ok_for_op (INTVAL (XEXP (x, 1)), code))
 	{
-	  *total += rtx_cost (XEXP (x, 0), code, speed);
+	  *total += rtx_cost2 (XEXP (x, 0), code, set_lhs, speed);
 	  return true;
 	}
       subcode = GET_CODE (XEXP (x, 0));
@@ -6676,16 +6681,16 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
 	  || subcode == LSHIFTRT
 	  || subcode == ROTATE || subcode == ROTATERT)
 	{
-	  *total += rtx_cost (XEXP (x, 1), code, speed);
-	  *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed);
+	  *total += rtx_cost2 (XEXP (x, 1), code, set_lhs, speed);
+	  *total += rtx_cost2 (XEXP (XEXP (x, 0), 0), subcode, set_lhs, speed);
 	  return true;
 	}
 
       if (subcode == MULT
 	  && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
 	{
-	  *total += rtx_cost (XEXP (x, 1), code, speed);
-	  *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed);
+	  *total += rtx_cost2 (XEXP (x, 1), code, set_lhs, speed);
+	  *total += rtx_cost2 (XEXP (XEXP (x, 0), 0), subcode, set_lhs, speed);
 	  return true;
 	}
 
@@ -6711,7 +6716,8 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
 	  && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
 	      || GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND))
 	{
-	  *total = rtx_cost (XEXP (XEXP (x, 0), 0), LSHIFTRT, speed);
+	  *total = rtx_cost2 (XEXP (XEXP (x, 0), 0), LSHIFTRT,
+			      set_lhs, speed);
 	  return true;
 	}
       *total = COSTS_N_INSNS (2); /* Plus the cost of the MULT */
@@ -6743,11 +6749,13 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
 	      || (subcode == MULT
 		  && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)))
 	    {
-	      *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed);
+	      *total += rtx_cost2 (XEXP (XEXP (x, 0), 0), subcode,
+				   set_lhs, speed);
 	      /* Register shifts cost an extra cycle.  */
 	      if (GET_CODE (XEXP (XEXP (x, 0), 1)) != CONST_INT)
-		*total += COSTS_N_INSNS (1) + rtx_cost (XEXP (XEXP (x, 0), 1),
-							subcode, speed);
+		*total += COSTS_N_INSNS (1) + rtx_cost2 (XEXP (XEXP (x, 0), 1),
+							 subcode, set_lhs,
+							 speed);
 	      return true;
 	    }
 	}
@@ -6768,14 +6776,15 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
 	    && GET_CODE (XEXP (operand, 0)) == REG
 	    && REGNO (XEXP (operand, 0)) == CC_REGNUM))
 	*total += COSTS_N_INSNS (1);
-      *total += (rtx_cost (XEXP (x, 1), code, speed)
-		 + rtx_cost (XEXP (x, 2), code, speed));
+      *total += (rtx_cost2 (XEXP (x, 1), code, set_lhs, speed)
+		 + rtx_cost2 (XEXP (x, 2), code, set_lhs, speed));
       return true;
 
     case NE:
       if (mode == SImode && XEXP (x, 1) == const0_rtx)
 	{
-	  *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, speed);
+	  *total = COSTS_N_INSNS (2) + rtx_cost2 (XEXP (x, 0), code,
+						  set_lhs, speed);
 	  return true;
 	}
       goto scc_insn;
@@ -6784,7 +6793,8 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
       if ((GET_CODE (XEXP (x, 0)) != REG || REGNO (XEXP (x, 0)) != CC_REGNUM)
 	  && mode == SImode && XEXP (x, 1) == const0_rtx)
 	{
-	  *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, speed);
+	  *total = COSTS_N_INSNS (2) + rtx_cost2 (XEXP (x, 0), code,
+						  set_lhs, speed);
 	  return true;
 	}
       goto scc_insn;
@@ -6793,7 +6803,8 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
       if ((GET_CODE (XEXP (x, 0)) != REG || REGNO (XEXP (x, 0)) != CC_REGNUM)
 	  && mode == SImode && XEXP (x, 1) == const0_rtx)
 	{
-	  *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, speed);
+	  *total = COSTS_N_INSNS (1) + rtx_cost2 (XEXP (x, 0), code,
+						  set_lhs, speed);
 	  return true;
 	}
       goto scc_insn;
@@ -6834,7 +6845,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
       if (GET_CODE (XEXP (x, 1)) == CONST_INT
 	  && const_ok_for_op (INTVAL (XEXP (x, 1)), code))
 	{
-	  *total += rtx_cost (XEXP (x, 0), code, speed);
+	  *total += rtx_cost2 (XEXP (x, 0), code, set_lhs, speed);
 	  return true;
 	}
 
@@ -6843,16 +6854,16 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
 	  || subcode == LSHIFTRT
 	  || subcode == ROTATE || subcode == ROTATERT)
 	{
-	  *total += rtx_cost (XEXP (x, 1), code, speed);
-	  *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed);
+	  *total += rtx_cost2 (XEXP (x, 1), code, set_lhs, speed);
+	  *total += rtx_cost2 (XEXP (XEXP (x, 0), 0), subcode, set_lhs, speed);
 	  return true;
 	}
 
       if (subcode == MULT
 	  && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
 	{
-	  *total += rtx_cost (XEXP (x, 1), code, speed);
-	  *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed);
+	  *total += rtx_cost2 (XEXP (x, 1), code, set_lhs, speed);
+	  *total += rtx_cost2 (XEXP (XEXP (x, 0), 0), subcode, set_lhs, speed);
 	  return true;
 	}
       
@@ -6862,10 +6873,11 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
     case UMAX:
     case SMIN:
     case SMAX:
-      *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, speed);
+      *total = COSTS_N_INSNS (2) + rtx_cost2 (XEXP (x, 0), code, set_lhs,
+					      speed);
       if (GET_CODE (XEXP (x, 1)) != CONST_INT
 	  || !const_ok_for_arm (INTVAL (XEXP (x, 1))))
-	*total += rtx_cost (XEXP (x, 1), code, speed);
+	*total += rtx_cost2 (XEXP (x, 1), code, set_lhs, speed);
       return true;
 
     case ABS:
@@ -6942,7 +6954,8 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
 
     case ZERO_EXTRACT:
     case SIGN_EXTRACT:
-      *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, speed);
+      *total = COSTS_N_INSNS (1) + rtx_cost2 (XEXP (x, 0), code, set_lhs,
+					      speed);
       return true;
 
     case CONST_INT:
@@ -6967,7 +6980,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
 
     case LO_SUM:
       *total = COSTS_N_INSNS (1);
-      *total += rtx_cost (XEXP (x, 0), code, speed);
+      *total += rtx_cost2 (XEXP (x, 0), code, set_lhs, speed);
       return true;
 
     case CONST_DOUBLE:
@@ -6988,7 +7001,8 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
    For now most of the code is copied from thumb1_rtx_costs. We need more
    fine grain tuning when we have more related test cases.  */
 static inline int
-thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
+thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer,
+		       rtx set_lhs)
 {
   enum machine_mode mode = GET_MODE (x);
 
@@ -7010,7 +7024,8 @@ thumb1_size_rtx_costs (rtx x, enum rtx_c
         {
           /* Thumb1 mul instruction can't operate on const. We must Load it
              into a register first.  */
-          int const_size = thumb1_size_rtx_costs (XEXP (x, 1), CONST_INT, SET);
+          int const_size = thumb1_size_rtx_costs (XEXP (x, 1), CONST_INT, SET,
+						  set_lhs);
           return COSTS_N_INSNS (1) + const_size;
         }
       return COSTS_N_INSNS (1);
@@ -7042,11 +7057,12 @@ thumb1_size_rtx_costs (rtx x, enum rtx_c
       else if (outer == AND)
         {
           int i;
-          /* This duplicates the tests in the andsi3 expander.  */
+          /* This duplicates the tests in the andsi3 expander.  Using shifts
+	     takes one extra insn beyond the cost of the AND.  */
           for (i = 9; i <= 31; i++)
             if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x)
                 || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x))
-              return COSTS_N_INSNS (2);
+              return COSTS_N_INSNS (1);
         }
       else if (outer == ASHIFT || outer == ASHIFTRT
                || outer == LSHIFTRT)
@@ -7115,16 +7131,16 @@ thumb1_size_rtx_costs (rtx x, enum rtx_c
 /* RTX costs when optimizing for size.  */
 static bool
 arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
-		    int *total)
+		    rtx set_lhs, int *total)
 {
   enum machine_mode mode = GET_MODE (x);
   if (TARGET_THUMB1)
     {
-      *total = thumb1_size_rtx_costs (x, code, outer_code);
+      *total = thumb1_size_rtx_costs (x, code, outer_code, set_lhs);
       return true;
     }
 
-  /* FIXME: This makes no attempt to prefer narrow Thumb-2 instructions.  */
+  /* FIXME: This makes few attempts to prefer narrow Thumb-2 instructions.  */
   switch (code)
     {
     case MEM:
@@ -7153,7 +7169,8 @@ arm_size_rtx_costs (rtx x, enum rtx_code
     case ROTATE:
       if (mode == SImode && GET_CODE (XEXP (x, 1)) == REG)
 	{
-	  *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, false);
+	  *total = COSTS_N_INSNS (2) + rtx_cost2 (XEXP (x, 0), code,
+						  set_lhs, false);
 	  return true;
 	}
       /* Fall through */
@@ -7163,15 +7180,17 @@ arm_size_rtx_costs (rtx x, enum rtx_code
     case ASHIFTRT:
       if (mode == DImode && GET_CODE (XEXP (x, 1)) == CONST_INT)
 	{
-	  *total = COSTS_N_INSNS (3) + rtx_cost (XEXP (x, 0), code, false);
+	  *total = COSTS_N_INSNS (3) + rtx_cost2 (XEXP (x, 0), code, set_lhs,
+						  false);
 	  return true;
 	}
       else if (mode == SImode)
 	{
-	  *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, false);
+	  *total = COSTS_N_INSNS (1) + rtx_cost2 (XEXP (x, 0), code, set_lhs,
+						  false);
 	  /* Slightly disparage register shifts, but not by much.  */
 	  if (GET_CODE (XEXP (x, 1)) != CONST_INT)
-	    *total += 1 + rtx_cost (XEXP (x, 1), code, false);
+	    *total += 1 + rtx_cost2 (XEXP (x, 1), code, set_lhs, false);
 	  return true;
 	}
 
@@ -7223,10 +7242,29 @@ arm_size_rtx_costs (rtx x, enum rtx_code
 	  && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
 	{
 	  *total = COSTS_N_INSNS (TARGET_THUMB2 ? 2 : 1);
-	  *total += rtx_cost (XEXP (XEXP (x, 0), 0), code, false);
-	  *total += rtx_cost (XEXP (x, 1), code, false);
+	  *total += rtx_cost2 (XEXP (XEXP (x, 0), 0), code, set_lhs, false);
+	  *total += rtx_cost2 (XEXP (x, 1), code, set_lhs, false);
 	  return true;
 	}
+      else if (TARGET_THUMB2 && outer_code == SET
+	       && mode == SImode
+	       && CONST_INT_P (XEXP (x, 1))
+	       && REG_P (XEXP (x, 0))
+	       && (REGNO (XEXP (x, 0)) < FIRST_HI_REGNUM
+		   || REGNO (XEXP (x, 0)) > LAST_HI_REGNUM))
+	{
+	  if (INTVAL (XEXP (x, 1)) >= 0
+	      && (INTVAL (XEXP (x, 1)) < 8
+		  || (XEXP (x, 0) == stack_pointer_rtx
+		      && (INTVAL (XEXP (x, 1)) & 3) == 0
+		      && INTVAL (XEXP (x, 1)) < 1024)
+		  || (rtx_equal_p (set_lhs, XEXP (x, 0))
+		      && INTVAL (XEXP (x, 1)) < 256)))
+	    {
+	      *total = COSTS_N_INSNS (1) / 2;
+	      return true;
+	    }
+	}
 
       /* Fall through */
     case AND: case XOR: case IOR:
@@ -7286,10 +7324,24 @@ arm_size_rtx_costs (rtx x, enum rtx_code
 
     case SIGN_EXTEND:
     case ZERO_EXTEND:
-      return arm_rtx_costs_1 (x, outer_code, total, 0);
+      return arm_rtx_costs_1 (x, outer_code, set_lhs, total, 0);
 
     case CONST_INT:
-      if (const_ok_for_arm (INTVAL (x)))
+      if (TARGET_THUMB2 && outer_code == SET
+	  && INTVAL (x) >= 0 && INTVAL (x) < 256
+	  && (!set_lhs
+	      || (REG_P (set_lhs)
+		  && (REGNO (set_lhs) < FIRST_HI_REGNUM
+		      || REGNO (set_lhs) > LAST_HI_REGNUM))))
+	*total = COSTS_N_INSNS (1) / 2;
+      else if (TARGET_THUMB2 && outer_code == SET
+	       && INTVAL (x) >= 0 && INTVAL (x) < 65536)
+	*total = COSTS_N_INSNS (1);
+      else if (arm_arch6 && outer_code == AND
+	       && mode == SImode
+	       && (INTVAL (x) == 255 || INTVAL (x) == 65535))
+	*total = 0;
+      else if (const_ok_for_arm (INTVAL (x)))
 	/* A multiplication by a constant requires another instruction
 	   to load the constant to a register.  */
 	*total = COSTS_N_INSNS ((outer_code == SET || outer_code == MULT)
@@ -7304,8 +7356,11 @@ arm_size_rtx_costs (rtx x, enum rtx_code
 	  else
 	    *total = COSTS_N_INSNS (1);
 	}
-      else
+      else if (outer_code == SET)
 	*total = COSTS_N_INSNS (2);
+      else
+	return arm_size_rtx_costs (x, code, SET, set_lhs, total);
+
       return true;
 
     case CONST:
@@ -7336,16 +7391,16 @@ arm_size_rtx_costs (rtx x, enum rtx_code
 
-/* RTX costs when optimizing for size.  */
+/* RTX costs; size costs if !SPEED, else those of the current tuning.  */
 static bool
-arm_rtx_costs (rtx x, int code, int outer_code, int *total,
-	       bool speed)
+arm_rtx_costs2 (rtx x, int code, int outer_code, rtx set_lhs, int *total,
+		bool speed)
 {
   if (!speed)
     return arm_size_rtx_costs (x, (enum rtx_code) code,
-			       (enum rtx_code) outer_code, total);
+			       (enum rtx_code) outer_code, set_lhs, total);
   else
     return current_tune->rtx_costs (x, (enum rtx_code) code,
 				    (enum rtx_code) outer_code,
-				    total, speed);
+				    set_lhs, total, speed);
 }
 
 /* RTX costs for cores with a slow MUL implementation.  Thumb-2 is not
@@ -7353,13 +7408,13 @@ arm_rtx_costs (rtx x, int code, int oute
 
 static bool
 arm_slowmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
-		       int *total, bool speed)
+		       rtx set_lhs, int *total, bool speed)
 {
   enum machine_mode mode = GET_MODE (x);
 
   if (TARGET_THUMB)
     {
-      *total = thumb1_rtx_costs (x, code, outer_code);
+      *total = thumb1_rtx_costs (x, code, outer_code, set_lhs);
       return true;
     }
 
@@ -7390,7 +7445,7 @@ arm_slowmul_rtx_costs (rtx x, enum rtx_c
 	    }
 
 	  *total = COSTS_N_INSNS (cost);
-	  *total += rtx_cost (XEXP (x, 0), code, speed);
+	  *total += rtx_cost2 (XEXP (x, 0), code, set_lhs, speed);
 	  return true;
 	}
 
@@ -7398,7 +7453,7 @@ arm_slowmul_rtx_costs (rtx x, enum rtx_c
       return false;
 
     default:
-      return arm_rtx_costs_1 (x, outer_code, total, speed);;
+      return arm_rtx_costs_1 (x, outer_code, set_lhs, total, speed);
     }
 }
 
@@ -7407,13 +7462,13 @@ arm_slowmul_rtx_costs (rtx x, enum rtx_c
 
 static bool
 arm_fastmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
-		       int *total, bool speed)
+		       rtx set_lhs, int *total, bool speed)
 {
   enum machine_mode mode = GET_MODE (x);
 
   if (TARGET_THUMB1)
     {
-      *total = thumb1_rtx_costs (x, code, outer_code);
+      *total = thumb1_rtx_costs (x, code, outer_code, set_lhs);
       return true;
     }
 
@@ -7481,7 +7536,7 @@ arm_fastmul_rtx_costs (rtx x, enum rtx_c
       return false;
 
     default:
-      return arm_rtx_costs_1 (x, outer_code, total, speed);
+      return arm_rtx_costs_1 (x, outer_code, set_lhs, total, speed);
     }
 }
 
@@ -7491,13 +7546,13 @@ arm_fastmul_rtx_costs (rtx x, enum rtx_c
 
 static bool
 arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
-		      int *total, bool speed)
+		      rtx set_lhs, int *total, bool speed)
 {
   enum machine_mode mode = GET_MODE (x);
 
   if (TARGET_THUMB)
     {
-      *total = thumb1_rtx_costs (x, code, outer_code);
+      *total = thumb1_rtx_costs (x, code, outer_code, set_lhs);
       return true;
     }
 
@@ -7505,7 +7560,7 @@ arm_xscale_rtx_costs (rtx x, enum rtx_co
     {
     case COMPARE:
       if (GET_CODE (XEXP (x, 0)) != MULT)
-	return arm_rtx_costs_1 (x, outer_code, total, speed);
+	return arm_rtx_costs_1 (x, outer_code, set_lhs, total, speed);
 
       /* A COMPARE of a MULT is slow on XScale; the muls instruction
 	 will stall until the multiplication is complete.  */
@@ -7571,7 +7626,7 @@ arm_xscale_rtx_costs (rtx x, enum rtx_co
       return false;
 
     default:
-      return arm_rtx_costs_1 (x, outer_code, total, speed);
+      return arm_rtx_costs_1 (x, outer_code, set_lhs, total, speed);
     }
 }
 
@@ -7580,7 +7635,7 @@ arm_xscale_rtx_costs (rtx x, enum rtx_co
 
 static bool
 arm_9e_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
-		  int *total, bool speed)
+		  rtx set_lhs, int *total, bool speed)
 {
   enum machine_mode mode = GET_MODE (x);
 
@@ -7593,7 +7648,7 @@ arm_9e_rtx_costs (rtx x, enum rtx_code c
 	  return true;
 
 	default:
-	  *total = thumb1_rtx_costs (x, code, outer_code);
+	  *total = thumb1_rtx_costs (x, code, outer_code, set_lhs);
 	  return true;
 	}
     }
@@ -7640,7 +7695,7 @@ arm_9e_rtx_costs (rtx x, enum rtx_code c
       return false;
 
     default:
-      return arm_rtx_costs_1 (x, outer_code, total, speed);
+      return arm_rtx_costs_1 (x, outer_code, set_lhs, total, speed);
     }
 }
 /* All address computations that can be done are free, but rtx cost returns
Index: config/arm/arm-protos.h
===================================================================
--- config/arm/arm-protos.h	(revision 162823)
+++ config/arm/arm-protos.h	(working copy)
@@ -215,7 +215,7 @@ extern void arm_order_regs_for_local_all
 
 struct tune_params
 {
-  bool (*rtx_costs) (rtx, RTX_CODE, RTX_CODE, int *, bool);
+  bool (*rtx_costs) (rtx, RTX_CODE, RTX_CODE, rtx, int *, bool);
   int constant_limit;
 };
 
Index: stmt.c
===================================================================
--- stmt.c	(revision 162823)
+++ stmt.c	(working copy)
@@ -2106,8 +2106,8 @@ bool lshift_cheap_p (void)
   if (!init)
     {
       rtx reg = gen_rtx_REG (word_mode, 10000);
-      int cost = rtx_cost (gen_rtx_ASHIFT (word_mode, const1_rtx, reg), SET,
-      			   optimize_insn_for_speed_p ());
+      int cost = rtx_cost2 (gen_rtx_ASHIFT (word_mode, const1_rtx, reg), SET,
+			    NULL_RTX, optimize_insn_for_speed_p ());
       cheap = cost < COSTS_N_INSNS (3);
       init = true;
     }
Index: reload1.c
===================================================================
--- reload1.c	(revision 162823)
+++ reload1.c	(working copy)
@@ -1629,8 +1629,8 @@ calculate_elim_costs_all_insns (void)
 		    {
 		      rtx t = eliminate_regs_1 (SET_SRC (set), VOIDmode, insn,
 						false, true);
-		      int cost = rtx_cost (t, SET,
-					   optimize_bb_for_speed_p (bb));
+		      int cost = rtx_cost2 (t, SET, SET_DEST (set),
+					    optimize_bb_for_speed_p (bb));
 		      int freq = REG_FREQ_FROM_BB (bb);
 
 		      reg_equiv_init_cost[regno] = cost * freq;
@@ -2483,7 +2483,8 @@ note_reg_elim_costly (rtx *px, void *dat
     {
       rtx t = reg_equiv_invariant[REGNO (x)];
       rtx new_rtx = eliminate_regs_1 (t, Pmode, insn, true, true);
-      int cost = rtx_cost (new_rtx, SET, optimize_bb_for_speed_p (elim_bb));
+      int cost = rtx_cost2 (new_rtx, SET, NULL_RTX,
+			    optimize_bb_for_speed_p (elim_bb));
       int freq = REG_FREQ_FROM_BB (elim_bb);
 
       if (cost != 0)
