Patchwork [RFC,PR18589] Enhance tree-ssa-reassoc to optimize repeated factors

login
register
mail settings
Submitter William J. Schmidt
Date Jan. 25, 2012, 8:14 p.m.
Message ID <1327522487.2734.15.camel@gnopaine>
Download mbox | patch
Permalink /patch/137854/
State New
Headers show

Comments

William J. Schmidt - Jan. 25, 2012, 8:14 p.m.
PR18589 identifies missed optimization opportunities for multiplies with
repeated factors, whether explicitly repeated or produced by a
__builtin_pow or __builtin_powi call.  This patch, proposed for 4.8,
expands such built-in calls to expose the base factors, pre-multiplies
repeated factors together, and builds new __builtin_powi calls where
appropriate.

This is done only in the first reassociation pass, so that the
__builtin_powi expansion done during pass_cse_sincos can produce the
optimal number of multiplies from the built-in calls.  I had to create a
separate pass variable to permit this (splitting pass_reassoc into
pass_early_reassoc and pass_late_reassoc).  Surprisingly, this only
affected one test in the regression suite.

Again, this is proposed for 4.8, so I am only looking for comments at
this time.

Thanks,
Bill


gcc:

2012-01-25  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* tree-pass.h: Replace pass_reassoc with pass_early_reassoc and
	pass_late_reassoc.
	* passes.c (init_optimization_passes): Change pass_reassoc calls to
	pass_early_reassoc and pass_late_reassoc.
	* tree-ssa-reassoc.c (reassociate_stats): Add two fields.
	(early_reassoc): New static var.
	(MAX_POW_EXPAND): New #define'd constant.
	(linear_expand_pow_common): New function.
	(linear_expand_powi): Likewise.
	(linear_expand_pow): Likewise.
	(break_up_subtract_bb): Attempt to expand __builtin_pow[i].
	(repeat_factor_d): New struct and associated typedefs.
	(repeat_factor_vec): New static vector variable.
	(compare_repeat_factors): New function.
	(get_reassoc_pow_ssa_name): Likewise.
	(attempt_builtin_powi): Likewise.
	(reassociate_bb): Attempt to reconstitute __builtin_powi calls, and
	multiply their results by any leftover reassociated factors.
	(fini_reassoc): Two new calls to statistics_counter_event.
	(execute_early_reassoc): New function.
	(execute_late_reassoc): Likewise.
	(pass_early_reassoc): Replace pass_reassoc, renamed to reassoc1,
	call execute_early_reassoc.
	(pass_late_reassoc): New gimple_opt_pass named reassoc2 that calls
	execute_late_reassoc.

gcc/testsuite:

2012-01-25  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* gcc.dg/pr46309.c: Change -fdump-tree-reassoc-details to
	-fdump-tree-reassoc[12]-details.
	* gcc.dg/tree-ssa/pr18589-1.c: New test.
	* gcc.dg/tree-ssa/pr18589-2.c: Likewise.
	* gcc.dg/tree-ssa/pr18589-3.c: Likewise.
	* gcc.dg/tree-ssa/pr18589-4.c: Likewise.
	* gcc.dg/tree-ssa/pr18589-5.c: Likewise.
	* gcc.dg/tree-ssa/pr18589-6.c: Likewise.
	* gcc.dg/tree-ssa/pr18589-7.c: Likewise.
	* gcc.dg/tree-ssa/pr18589-8.c: Likewise.

Patch

Index: gcc/tree-pass.h
===================================================================
--- gcc/tree-pass.h	(revision 183444)
+++ gcc/tree-pass.h	(working copy)
@@ -440,7 +440,8 @@  extern struct gimple_opt_pass pass_copy_prop;
 extern struct gimple_opt_pass pass_vrp;
 extern struct gimple_opt_pass pass_uncprop;
 extern struct gimple_opt_pass pass_return_slot;
-extern struct gimple_opt_pass pass_reassoc;
+extern struct gimple_opt_pass pass_early_reassoc;
+extern struct gimple_opt_pass pass_late_reassoc;
 extern struct gimple_opt_pass pass_rebuild_cgraph_edges;
 extern struct gimple_opt_pass pass_remove_cgraph_callee_edges;
 extern struct gimple_opt_pass pass_build_cgraph_edges;
Index: gcc/testsuite/gcc.dg/pr46309.c
===================================================================
--- gcc/testsuite/gcc.dg/pr46309.c	(revision 183444)
+++ gcc/testsuite/gcc.dg/pr46309.c	(working copy)
@@ -1,6 +1,6 @@ 
 /* PR tree-optimization/46309 */
 /* { dg-do compile } */
-/* { dg-options "-O2 -fdump-tree-reassoc-details" } */
+/* { dg-options "-O2 -fdump-tree-reassoc1-details -fdump-tree-reassoc2-details" } */
 /* The transformation depends on BRANCH_COST being greater than 1
    (see the notes in the PR), so try to force that.  */
 /* { dg-additional-options "-mtune=octeon2" { target mips*-*-* } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y, double z, double u)
+{
+  return x * x * y * y * y * z * z * z * z * u;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 7 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y, double z, double u)
+{
+  return x * x * x * y * y * y * z * z * z * z * u * u * u * u;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 6 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c	(revision 0)
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y)
+{
+  return __builtin_pow (x, -4.0) * __builtin_pow (y, -4.0);
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " / " 1 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+float baz (float x, float y)
+{
+  return x * x * x * x * y * y * y * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+long double baz (long double x, long double y)
+{
+  return x * x * x * x * y * y * y * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y)
+{
+  return x * x * x * x * y * y * y * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y)
+{
+  return x * y * y * x * y * x * x * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y, double z)
+{
+  return x * x * y * y * y * z * z * z * z;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 6 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/passes.c
===================================================================
--- gcc/passes.c	(revision 183444)
+++ gcc/passes.c	(working copy)
@@ -1325,7 +1325,7 @@  init_optimization_passes (void)
 	 opportunities.  */
       NEXT_PASS (pass_phi_only_cprop);
       NEXT_PASS (pass_dse);
-      NEXT_PASS (pass_reassoc);
+      NEXT_PASS (pass_early_reassoc);
       NEXT_PASS (pass_dce);
       NEXT_PASS (pass_forwprop);
       NEXT_PASS (pass_phiopt);
@@ -1377,7 +1377,7 @@  init_optimization_passes (void)
 	}
       NEXT_PASS (pass_lower_vector_ssa);
       NEXT_PASS (pass_cse_reciprocals);
-      NEXT_PASS (pass_reassoc);
+      NEXT_PASS (pass_late_reassoc);
       NEXT_PASS (pass_vrp);
       NEXT_PASS (pass_dominator);
       /* The only const/copy propagation opportunities left after
Index: gcc/tree-ssa-reassoc.c
===================================================================
--- gcc/tree-ssa-reassoc.c	(revision 183444)
+++ gcc/tree-ssa-reassoc.c	(working copy)
@@ -53,6 +53,9 @@  along with GCC; see the file COPYING3.  If not see
     1. Breaking up subtract operations into addition + negate, where
     it would promote the reassociation of adds.
 
+    1a. During the same pass, expanding __builtin_pow variants with
+    small integer exponents into linearized multiply trees.
+
     2. Left linearization of the expression trees, so that (A+B)+(C+D)
     becomes (((A+B)+C)+D), which is easier for us to rewrite later.
     During linearization, we place the operands of the binary
@@ -61,6 +64,10 @@  along with GCC; see the file COPYING3.  If not see
     3. Optimization of the operand lists, eliminating things like a +
     -a, a & a, etc.
 
+    3a. Combine repeated factors with the same occurrence counts
+    into a __builtin_powi call that will later be optimized into
+    an optimal number of multiplies.
+
     4. Rewrite the expression trees we linearized and optimized so
     they are in proper rank order.
 
@@ -169,6 +176,8 @@  static struct
   int constants_eliminated;
   int ops_eliminated;
   int rewritten;
+  int pows_expanded;
+  int pows_created;
 } reassociate_stats;
 
 /* Operator, rank pair.  */
@@ -190,6 +199,12 @@  static int next_operand_entry_id;
    depth.  */
 static long *bb_rank;
 
+/* Distinguish between early and late reassociation passes.  Early
+   reassociation expands and rebuilds __builtin_pow* calls.  This
+   is not done in late reassociation to preserve the builtin
+   optimizations performed in pass_cse_sincos.  */
+static bool early_reassoc;
+
 /* Operand->rank hashtable.  */
 static struct pointer_map_t *operand_rank;
 
@@ -2759,6 +2774,164 @@  can_reassociate_p (tree op)
   return false;
 }
 
+/* Define a limit on the exponent of a __builtin_pow or __builtin_powi
+   that will be converted into a linear chain of multiplies prior to
+   reassociation.  */
+
+#define MAX_POW_EXPAND 32
+
+/* Given STMT at *GSIP that is a __builtin_pow or __builtin_powi
+   operation with base ARG0 and integer power EXPONENT, transform
+   it into a linear chain of multiplies, provided that EXPONENT is
+   of sufficiently small magnitude.  */
+static void
+linear_expand_pow_common (gimple stmt, gimple_stmt_iterator *gsip,
+			  tree arg0, HOST_WIDE_INT exponent)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  HOST_WIDE_INT abs_exp = abs (exponent);
+  location_t loc = gimple_location (stmt);
+  gimple mul_stmt = NULL;
+  gimple div_stmt = NULL;
+  tree result, type, target;
+  gimple copy_stmt;
+
+  if (abs_exp > MAX_POW_EXPAND)
+    return;
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Expanding builtin pow/powi: ");
+      print_gimple_stmt (dump_file, stmt, 0, 0);
+    }
+
+  reassociate_stats.pows_expanded++;
+
+  type = TREE_TYPE (arg0);
+
+  if (exponent == 0)
+    result = build_real (type, dconst1);
+  else if (exponent == 1)
+    result = arg0;
+  else
+    {
+      tree target_ssa;
+
+      target = create_tmp_reg (type, "reassocpow");
+      add_referenced_var (target);
+
+      if (exponent == -1)
+	result = arg0;
+      else
+	{
+	  unsigned i;
+
+	  target_ssa = make_ssa_name (target, NULL);
+	  mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
+						   arg0, arg0);
+	  gimple_set_location (mul_stmt, loc);
+	  gsi_insert_before (gsip, mul_stmt, GSI_SAME_STMT);
+	  
+	  for (i = abs_exp - 2; i > 0; i--)
+	    {
+	      tree prev_target_ssa = target_ssa;
+	      target_ssa = make_ssa_name (target, NULL);
+	      mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
+						       prev_target_ssa, arg0);
+	      gimple_set_location (mul_stmt, loc);
+	      gsi_insert_before (gsip, mul_stmt, GSI_SAME_STMT);
+	    }
+
+	  result = target_ssa;
+	}
+
+      /* Possible TODO:  If we expand two __builtin_pow's with different
+	 negative exponents, the introduction of the RDIVs for inversion
+	 prevents combining their factors.  (If they have the same negative 
+	 exponent, expression folding combines them as expected.)  I'm
+	 not worrying about this as it should be quite rare in practice.  */
+      if (exponent < 0)
+	{
+	  target_ssa = make_ssa_name (target, NULL);
+	  div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target_ssa,
+						   build_real (type, dconst1),
+						   result);
+	  gimple_set_location (div_stmt, loc);
+	  gsi_insert_before (gsip, div_stmt, GSI_SAME_STMT);
+	  result = target_ssa;
+	}
+    }
+  
+  /* If we introduced multiplies but no inversion, avoid introducing a
+     copy so that the copy doesn't truncate a larger reassociation chain.  */
+  if (mul_stmt && !div_stmt)
+    {
+      unlink_stmt_vdef (stmt);
+      gimple_set_lhs (mul_stmt, lhs);
+      gsi_remove (gsip, true);
+      update_stmt (mul_stmt);
+      /* If we're at the end of the block, leave the iterator in a
+	 usable state.  */
+      if (gsi_end_p (*gsip))
+	*gsip = gsi_for_stmt (mul_stmt);
+    }
+  else
+    {
+      copy_stmt = gimple_build_assign (lhs, result);
+      gimple_set_location (copy_stmt, loc);
+      unlink_stmt_vdef (stmt);
+      gsi_replace (gsip, copy_stmt, true);
+    }
+}
+
+/* Transform __builtin_powi into a linear chain of multiplies,
+   if the exponent is of sufficiently small magnitude.  */
+
+static void
+linear_expand_powi (gimple stmt, gimple_stmt_iterator *gsip)
+{
+  tree arg0 = gimple_call_arg (stmt, 0);
+  tree arg1 = gimple_call_arg (stmt, 1);
+  HOST_WIDE_INT exponent;
+
+  if (TREE_CODE (arg1) != INTEGER_CST)
+    return;
+
+  if (host_integerp (arg1, 0))
+    {
+      exponent = TREE_INT_CST_LOW (arg1);
+      linear_expand_pow_common (stmt, gsip, arg0, exponent);
+    }
+}
+
+/* Transform __builtin_pow into a linear chain of multiplies,
+   if the exponent is constant and equivalent to an integer of
+   sufficiently small magnitude.  */
+
+static void
+linear_expand_pow (gimple stmt, gimple_stmt_iterator *gsip)
+{
+  tree arg0 = gimple_call_arg (stmt, 0);
+  tree arg1 = gimple_call_arg (stmt, 1);
+  REAL_VALUE_TYPE c, cint;
+  HOST_WIDE_INT n;
+
+  /* The exponent must be a constant equivalent to an integer that
+     fits in a HOST_WIDE_INT.  */
+  if (TREE_CODE (arg1) != REAL_CST)
+    return;
+  
+  c = TREE_REAL_CST (arg1);
+  if (REAL_EXP (&c) > HOST_BITS_PER_WIDE_INT)
+    return;
+
+  n = real_to_integer (&c);
+  real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
+
+  if (real_identical (&c, &cint))
+    linear_expand_pow_common (stmt, gsip, arg0, n);
+}
+
 /* Break up subtract operations in block BB.
 
    We do this top down because we don't know whether the subtract is
@@ -2784,9 +2957,32 @@  break_up_subtract_bb (basic_block bb)
 
   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
     {
+      tree fndecl;
       gimple stmt = gsi_stmt (gsi);
       gimple_set_visited (stmt, false);
 
+      /* Look for __builtin_pow and __builtin_powi calls,
+	 and expand them to multiplies if possible.  */
+      if (early_reassoc
+	  && flag_unsafe_math_optimizations
+	  && is_gimple_call (stmt)
+	  && (fndecl = gimple_call_fndecl (stmt))
+	  && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
+	{
+	  switch (DECL_FUNCTION_CODE (fndecl))
+	    {
+	    CASE_FLT_FN (BUILT_IN_POW):
+	      linear_expand_pow (stmt, &gsi);
+	      break;
+	    CASE_FLT_FN (BUILT_IN_POWI):
+	      linear_expand_powi (stmt, &gsi);
+	      break;
+	    default:
+	      ;
+	    }
+	  continue;
+	}
+
       if (!is_gimple_assign (stmt)
 	  || !can_reassociate_p (gimple_assign_lhs (stmt)))
 	continue;
@@ -2815,6 +3011,342 @@  break_up_subtract_bb (basic_block bb)
     break_up_subtract_bb (son);
 }
 
+/* Used for repeated factor analysis.  */
+struct repeat_factor_d
+{
+  /* An SSA name that occurs in a multiply chain.  */
+  tree factor;
+
+  /* Cached rank of the factor.  */
+  unsigned rank;
+
+  /* Number of occurrences of the factor in the chain.  */
+  HOST_WIDE_INT count;
+
+  /* An SSA name representing the product of this factor and
+     all factors appearing later in the repeated factor vector.  */
+  tree repr;
+};
+
+typedef struct repeat_factor_d repeat_factor, *repeat_factor_t;
+typedef const struct repeat_factor_d *const_repeat_factor_t;
+
+DEF_VEC_O (repeat_factor);
+DEF_VEC_ALLOC_O (repeat_factor, heap);
+
+static VEC (repeat_factor, heap) *repeat_factor_vec;
+
+/* Used for sorting the repeat factor vector.  Sort primarily by
+   ascending occurrence count, secondarily by descending rank.  */
+
+static int
+compare_repeat_factors (const void *x1, const void *x2)
+{
+  const_repeat_factor_t rf1 = (const_repeat_factor_t) x1;
+  const_repeat_factor_t rf2 = (const_repeat_factor_t) x2;
+
+  if (rf1->count != rf2->count)
+    return rf1->count - rf2->count;
+
+  return rf2->rank - rf1->rank;
+}
+
+/* Get a new SSA name for register variable *TARGET of type TYPE.
+   If *TARGET is null or incompatible with TYPE, create the variable
+   first.  */
+
+static tree
+get_reassoc_pow_ssa_name (tree *target, tree type)
+{
+  if (!*target || !types_compatible_p (type, TREE_TYPE (*target)))
+    {
+      *target = create_tmp_reg (type, "reassocpow");
+      add_referenced_var (*target);
+    }
+
+  return make_ssa_name (*target, NULL);
+}
+
+/* Look for repeated operands in OPS in the multiply tree rooted at
+   STMT.  Replace them with an optimal sequence of multiplies and powi
+   builtin calls, and remove the used operands from OPS.  Return an
+   SSA name representing the value of the replacement sequence.  */
+
+static tree
+attempt_builtin_powi (gimple stmt, VEC(operand_entry_t, heap) **ops,
+		      tree *target)
+{
+  unsigned i, j, cached, vec_len;
+  int ii;
+  operand_entry_t oe;
+  repeat_factor_t rf1, rf2;
+  repeat_factor rfnew;
+  tree target_ssa;
+  tree result = NULL_TREE;
+  tree type = TREE_TYPE (gimple_get_lhs (stmt));
+  tree powi_fndecl = mathfn_built_in (type, BUILT_IN_POWI);
+  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+  gimple mul_stmt, pow_stmt;
+
+  /* Nothing to do if BUILT_IN_POWI doesn't exist for this type and
+     target.  */
+  if (!powi_fndecl)
+    return NULL_TREE;
+
+  /* Allocate the repeated factor vector.  */
+  repeat_factor_vec = VEC_alloc (repeat_factor, heap, 10);
+
+  /* Scan the OPS vector for all SSA names in the product and build
+     up a vector of occurrence counts for each factor.  */
+  FOR_EACH_VEC_ELT (operand_entry_t, *ops, i, oe)
+    {
+      if (TREE_CODE (oe->op) == SSA_NAME)
+	{
+	  FOR_EACH_VEC_ELT (repeat_factor, repeat_factor_vec, j, rf1)
+	    {
+	      if (rf1->factor == oe->op)
+		{
+		  rf1->count++;
+		  break;
+		}
+	    }
+
+	  if (j >= VEC_length (repeat_factor, repeat_factor_vec))
+	    {
+	      rfnew.factor = oe->op;
+	      rfnew.rank = oe->rank;
+	      rfnew.count = 1;
+	      rfnew.repr = NULL_TREE;
+	      VEC_safe_push (repeat_factor, heap, repeat_factor_vec, &rfnew);
+	    }
+	}
+    }
+
+  vec_len = VEC_length (repeat_factor, repeat_factor_vec);
+  
+  /* Sort the repeated factor vector by (a) increasing occurrence count,
+     and (b) decreasing rank.  */
+  VEC_qsort (repeat_factor, repeat_factor_vec, compare_repeat_factors);
+
+  /* There are a variety of ways we could combine factors with different
+     occurrence counts.  It seems best in practice to try to combine as
+     many base factors as possible into a product before applying
+     builtin_powi to the result.  The sort order chosen for the repeated
+     factor vector allows us to cache partial results for the product
+     of the base factors for subsequent use.
+
+     As an example, consider x * x * y * y * y * y * z * z * z * z.
+     We want to first compose the product x * y * z, raise it to the
+     second power, and then multiply this by the product y * z raised
+     to the second power.  This can be done in 5 multiplies provided
+     we cache y * z for use in both expressions:
+
+        t1 = y * z
+	t2 = t1 * x
+	t3 = t2 * t2
+	t4 = t1 * t1
+	result = t3 * t4
+
+     Alternatively we could also do this in 5 multiplies by first 
+     calculating y * z, squaring it twice, and multiplying by x * x.
+     However, if the occurrence counts were not powers of 2 as in 
+     this example, combining higher occurrence counts first would 
+     require more multiplies than combining lower ones first.  */
+
+  /* Repeatedly look for opportunities to create a builtin_powi call.  */
+  while (true)
+    {
+      HOST_WIDE_INT power;
+
+      /* Find the first factor in the repeated factor vector whose 
+	 occurrence count is at least 2.  If no such factor exists,
+	 there are no builtin_powi opportunities remaining.  */
+      FOR_EACH_VEC_ELT (repeat_factor, repeat_factor_vec, j, rf1)
+	{
+	  if (rf1->count >= 2)
+	    break;
+	}
+
+      if (j >= vec_len)
+	break;
+
+      power = rf1->count;
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  unsigned elt;
+	  repeat_factor_t rf;
+	  fputs ("Building __builtin_pow call for (", dump_file);
+	  for (elt = j; elt < vec_len; elt++)
+	    {
+	      rf = VEC_index (repeat_factor, repeat_factor_vec, elt);
+	      print_generic_expr (dump_file, rf->factor, 0);
+	      if (elt < vec_len - 1)
+		fputs (" * ", dump_file);
+	    }
+	  fprintf (dump_file, ")^%ld\n", power);
+	}
+
+      reassociate_stats.pows_created++;
+
+      /* Visit each element of the vector in reverse order (so that
+	 high-occurrence elements are visited first, and within the
+	 same occurrence count, lower-ranked elements are visited
+	 first).  Form a linear product of all elements in this order
+	 whose occurrence count is at least that of element J.
+	 Record the SSA name representing the product of each element
+	 with all subsequent elements in the vector.  */
+      if (j == vec_len - 1)
+	rf1->repr = rf1->factor;
+      else
+	{
+	  for (ii = vec_len - 2; ii >= (int)j; ii--)
+	    {
+	      tree op1, op2;
+
+	      rf1 = VEC_index (repeat_factor, repeat_factor_vec, ii);
+	      rf2 = VEC_index (repeat_factor, repeat_factor_vec, ii + 1);
+
+	      /* Init the last factor's representative to be itself.  */
+	      if (!rf2->repr)
+		rf2->repr = rf2->factor;
+
+	      op1 = rf1->factor;
+	      op2 = rf2->repr;
+
+	      target_ssa = get_reassoc_pow_ssa_name (target, type);
+	      mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
+						       op1, op2);
+	      gimple_set_location (mul_stmt, gimple_location (stmt));
+	      gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
+	      rf1->repr = target_ssa;
+	    }
+	}
+
+      /* Form a call to __builtin_powi for the maximum product
+	 just formed, raised to the power obtained earlier.  */
+      rf1 = VEC_index (repeat_factor, repeat_factor_vec, j);
+      target_ssa = get_reassoc_pow_ssa_name (target, type);
+      pow_stmt = gimple_build_call (powi_fndecl, 2, rf1->repr, 
+				    build_int_cst (integer_type_node, power));
+      gimple_call_set_lhs (pow_stmt, target_ssa);
+      gimple_set_location (pow_stmt, gimple_location (stmt));
+      gsi_insert_before (&gsi, pow_stmt, GSI_SAME_STMT);
+
+      /* Decrement the occurrence count of each element in the product
+	 by the count found above, and remove this many copies of each
+	 factor from OPS.  */
+      for (i = j; i < vec_len; i++)
+	{
+	  unsigned k = power;
+	  unsigned n;
+
+	  rf1 = VEC_index (repeat_factor, repeat_factor_vec, i);
+	  rf1->count -= power;
+	  
+	  FOR_EACH_VEC_ELT_REVERSE (operand_entry_t, *ops, n, oe)
+	    {
+	      if (oe->op == rf1->factor)
+		{
+		  VEC_ordered_remove (operand_entry_t, *ops, n);
+		  if (--k == 0)
+		    break;
+		}
+	    }
+	}
+
+      /* If we previously formed at least one other builtin_powi call,
+	 form the product of this one and those others.  */
+      if (result)
+	{
+	  tree new_target_ssa = get_reassoc_pow_ssa_name (target, type);
+	  mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, new_target_ssa,
+						   target_ssa, result);
+	  gimple_set_location (mul_stmt, gimple_location (stmt));
+	  gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
+	  result = new_target_ssa;
+	}
+      else
+	result = target_ssa;
+    }
+
+  /* At this point all elements in the repeated factor vector have a
+     remaining occurrence count of 0 or 1.  Scanning the vector in
+     reverse order, find the last element with a 1 before a 0 is found.
+     If this element has an SSA representative and is not the last
+     element, then it represents a multiply we have already calculated.
+     Multiply the result so far by this SSA name.  Remove one copy of
+     each factor in the product from OPS.  */
+  cached = vec_len;
+
+  for (ii = vec_len - 1; ii >= 0; ii--)
+    {
+      rf1 = VEC_index (repeat_factor, repeat_factor_vec, ii);
+      if (rf1->count == 0)
+	{
+	  cached = ii + 1;
+	  break;
+	}
+    }
+
+  if (cached < vec_len)
+    {
+      rf1 = VEC_index (repeat_factor, repeat_factor_vec, cached);
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  unsigned elt;
+	  repeat_factor_t rf;
+	  fputs ("Multiplying by cached product ", dump_file);
+	  for (elt = cached; elt < vec_len; elt++)
+	    {
+	      rf = VEC_index (repeat_factor, repeat_factor_vec, elt);
+	      print_generic_expr (dump_file, rf->factor, 0);
+	      if (elt < vec_len - 1)
+		fputs (" * ", dump_file);
+	    }
+	  fputs ("\n", dump_file);
+	}
+
+      if (!result)
+	result = rf1->repr;
+      else
+	{
+	  tree new_target_ssa = get_reassoc_pow_ssa_name (target, type);
+	  mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, new_target_ssa,
+						   rf1->repr, result);
+	  gimple_set_location (mul_stmt, gimple_location (stmt));
+	  gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
+	  result = new_target_ssa;
+	}
+    }
+
+  for (i = cached; i < vec_len; i++)
+    {
+      unsigned n;
+
+      rf1 = VEC_index (repeat_factor, repeat_factor_vec, i);
+      rf1->count--;
+	  
+      FOR_EACH_VEC_ELT_REVERSE (operand_entry_t, *ops, n, oe)
+	{
+	  if (oe->op == rf1->factor)
+	    {
+	      VEC_ordered_remove (operand_entry_t, *ops, n);
+	      break;
+	    }
+	}
+    }
+
+  /* Clean up.  */
+  VEC_free (repeat_factor, heap, repeat_factor_vec);
+
+  /* Return the final product as computed herein.  Note that there
+     may still be some elements with single occurrence count left
+     in OPS; those will be handled by the normal reassociation logic.  */
+  return result;
+}
+
 /* Reassociate expressions in basic block BB and its post-dominator as
    children.  */
 
@@ -2823,6 +3355,7 @@  reassociate_bb (basic_block bb)
 {
   gimple_stmt_iterator gsi;
   basic_block son;
+  tree target = NULL_TREE;
 
   for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
     {
@@ -2883,6 +3416,8 @@  reassociate_bb (basic_block bb)
 
 	  if (associative_tree_code (rhs_code))
 	    {
+	      tree powi_result = NULL_TREE;
+
 	      VEC(operand_entry_t, heap) *ops = NULL;
 
 	      /* There may be no immediate uses left by the time we
@@ -2904,7 +3439,14 @@  reassociate_bb (basic_block bb)
 	      if (rhs_code == BIT_IOR_EXPR || rhs_code == BIT_AND_EXPR)
 		optimize_range_tests (rhs_code, &ops);
 
-	      if (VEC_length (operand_entry_t, ops) == 1)
+	      if (early_reassoc
+		  && rhs_code == MULT_EXPR
+		  && flag_unsafe_math_optimizations)
+		powi_result = attempt_builtin_powi (stmt, &ops, &target);
+
+	      /* If the operand vector is now empty, all operands were
+		 consumed by the __builtin_pow optimization.  */
+	      if (VEC_length (operand_entry_t, ops) == 0)
 		{
 		  if (dump_file && (dump_flags & TDF_DETAILS))
 		    {
@@ -2913,9 +3455,7 @@  reassociate_bb (basic_block bb)
 		    }
 
 		  rhs1 = gimple_assign_rhs1 (stmt);
-		  gimple_assign_set_rhs_from_tree (&gsi,
-						   VEC_last (operand_entry_t,
-							     ops)->op);
+		  gimple_assign_set_rhs_from_tree (&gsi, powi_result);
 		  update_stmt (stmt);
 		  remove_visited_stmt_chain (rhs1);
 
@@ -2925,6 +3465,39 @@  reassociate_bb (basic_block bb)
 		      print_gimple_stmt (dump_file, stmt, 0, 0);
 		    }
 		}
+
+	      else if (VEC_length (operand_entry_t, ops) == 1)
+		{
+		  tree last_op = VEC_last (operand_entry_t, ops)->op;
+		  if (dump_file && (dump_flags & TDF_DETAILS))
+		    {
+		      fprintf (dump_file, "Transforming ");
+		      print_gimple_stmt (dump_file, stmt, 0, 0);
+		    }
+
+		  if (powi_result)
+		    {
+		      rhs1 = gimple_assign_rhs1 (stmt);
+		      gimple_assign_set_rhs_with_ops_1 (&gsi, MULT_EXPR,
+							powi_result, last_op,
+							NULL_TREE);
+		      update_stmt (gsi_stmt (gsi));
+		      remove_visited_stmt_chain (rhs1);
+		    }
+		  else
+		    {
+		      rhs1 = gimple_assign_rhs1 (stmt);
+		      gimple_assign_set_rhs_from_tree (&gsi, last_op);
+		      update_stmt (stmt);
+		      remove_visited_stmt_chain (rhs1);
+		    }
+
+		  if (dump_file && (dump_flags & TDF_DETAILS))
+		    {
+		      fprintf (dump_file, " into ");
+		      print_gimple_stmt (dump_file, stmt, 0, 0);
+		    }
+		}
 	      else
 		{
 		  enum machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
@@ -2940,6 +3513,24 @@  reassociate_bb (basic_block bb)
 		    rewrite_expr_tree_parallel (stmt, width, ops);
 		  else
 		    rewrite_expr_tree (stmt, 0, ops, false);
+
+		  /* If we combined some repeated factors into a 
+		     __builtin_powi call, multiply that result by the
+		     reassociated operands.  */
+		  if (powi_result)
+		    {
+		      gimple mul_stmt;
+		      tree type = TREE_TYPE (gimple_get_lhs (stmt));
+		      tree target_ssa = get_reassoc_pow_ssa_name (&target,
+								  type);
+		      gimple_set_lhs (stmt, target_ssa);
+		      update_stmt (stmt);
+		      mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, lhs,
+							       powi_result,
+							       target_ssa);
+		      gimple_set_location (mul_stmt, gimple_location (stmt));
+		      gsi_insert_after (&gsi, mul_stmt, GSI_NEW_STMT);
+		    }
 		}
 
 	      VEC_free (operand_entry_t, heap, ops);
@@ -3054,6 +3645,10 @@  fini_reassoc (void)
 			    reassociate_stats.ops_eliminated);
   statistics_counter_event (cfun, "Statements rewritten",
 			    reassociate_stats.rewritten);
+  statistics_counter_event (cfun, "Built-in pow[i] calls expanded",
+			    reassociate_stats.pows_expanded);
+  statistics_counter_event (cfun, "Built-in powi calls created",
+			    reassociate_stats.pows_created);
 
   pointer_map_destroy (operand_rank);
   free_alloc_pool (operand_entry_pool);
@@ -3077,19 +3672,33 @@  execute_reassoc (void)
   return 0;
 }
 
+static unsigned int
+execute_early_reassoc (void)
+{
+  early_reassoc = true;
+  return execute_reassoc ();
+}
+
+static unsigned int
+execute_late_reassoc (void)
+{
+  early_reassoc = false;
+  return execute_reassoc ();
+}
+
 static bool
 gate_tree_ssa_reassoc (void)
 {
   return flag_tree_reassoc != 0;
 }
 
-struct gimple_opt_pass pass_reassoc =
+struct gimple_opt_pass pass_early_reassoc =
 {
  {
   GIMPLE_PASS,
-  "reassoc",				/* name */
+  "reassoc1",				/* name */
   gate_tree_ssa_reassoc,		/* gate */
-  execute_reassoc,			/* execute */
+  execute_early_reassoc,		/* execute */
   NULL,					/* sub */
   NULL,					/* next */
   0,					/* static_pass_number */
@@ -3103,3 +3712,24 @@  gate_tree_ssa_reassoc (void)
     | TODO_ggc_collect			/* todo_flags_finish */
  }
 };
+
+struct gimple_opt_pass pass_late_reassoc =
+{
+ {
+  GIMPLE_PASS,
+  "reassoc2",				/* name */
+  gate_tree_ssa_reassoc,		/* gate */
+  execute_late_reassoc,			/* execute */
+  NULL,					/* sub */
+  NULL,					/* next */
+  0,					/* static_pass_number */
+  TV_TREE_REASSOC,			/* tv_id */
+  PROP_cfg | PROP_ssa,			/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_verify_ssa
+    | TODO_verify_flow
+    | TODO_ggc_collect			/* todo_flags_finish */
+ }
+};