diff mbox

Fix PR18589

Message ID 1331066954.17488.7.camel@gnopaine
State New
Headers show

Commit Message

Bill Schmidt March 6, 2012, 8:49 p.m. UTC
Hi,

This is a re-post of the patch I posted for comments in January to
address http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18589.  The patch
modifies reassociation to expose repeated factors from __builtin_pow*
calls, optimally reassociate repeated factors, and possibly reconstitute
__builtin_powi calls from the results of reassociation.

Bootstrapped and passes regression tests for powerpc64-linux-gnu.  I
expect there may need to be some small changes, but I am targeting this
for trunk approval.

Thanks very much for the review,

Bill


gcc:

2012-03-06  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* tree-pass.h: Replace pass_reassoc with pass_early_reassoc and
	pass_late_reassoc.
	* passes.c (init_optimization_passes): Change pass_reassoc calls to
	pass_early_reassoc and pass_late_reassoc.
	* tree-ssa-reassoc.c (reassociate_stats): Add two fields.
	(early_reassoc): New static var.
	(MAX_POW_EXPAND): New #define'd constant.
	(linear_expand_pow_common): New function.
	(linear_expand_powi): Likewise.
	(linear_expand_pow): Likewise.
	(break_up_subtract_bb): Attempt to expand __builtin_pow[i].
	(repeat_factor_d): New struct and associated typedefs.
	(repeat_factor_vec): New static vector variable.
	(compare_repeat_factors): New function.
	(get_reassoc_pow_ssa_name): Likewise.
	(attempt_builtin_powi): Likewise.
	(reassociate_bb): Attempt to reconstitute __builtin_powi calls, and
	multiply their results by any leftover reassociated factors.
	(fini_reassoc): Two new calls to statistics_counter_event.
	(execute_early_reassoc): New function.
	(execute_late_reassoc): Likewise.
	(pass_early_reassoc): Replace pass_reassoc, renamed to reassoc1,
	call execute_early_reassoc.
	(pass_late_reassoc): New gimple_opt_pass named reassoc2 that calls
	execute_late_reassoc.

gcc/testsuite:

2012-03-06  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* gcc.dg/pr46309.c: Change -fdump-tree-reassoc-details to
	-fdump-tree-reassoc[12]-details.
	* gcc.dg/tree-ssa/pr18589-1.c: New test.
	* gcc.dg/tree-ssa/pr18589-2.c: Likewise.
	* gcc.dg/tree-ssa/pr18589-3.c: Likewise.
	* gcc.dg/tree-ssa/pr18589-4.c: Likewise.
	* gcc.dg/tree-ssa/pr18589-5.c: Likewise.
	* gcc.dg/tree-ssa/pr18589-6.c: Likewise.
	* gcc.dg/tree-ssa/pr18589-7.c: Likewise.
	* gcc.dg/tree-ssa/pr18589-8.c: Likewise.

Comments

Richard Biener March 28, 2012, 1:57 p.m. UTC | #1
On Tue, Mar 6, 2012 at 9:49 PM, William J. Schmidt
<wschmidt@linux.vnet.ibm.com> wrote:
> Hi,
>
> This is a re-post of the patch I posted for comments in January to
> address http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18589.  The patch
> modifies reassociation to expose repeated factors from __builtin_pow*
> calls, optimally reassociate repeated factors, and possibly reconstitute
> __builtin_powi calls from the results of reassociation.
>
> Bootstrapped and passes regression tests for powerpc64-linux-gnu.  I
> expect there may need to be some small changes, but I am targeting this
> for trunk approval.
>
> Thanks very much for the review,

Hmm.  How much work would it be to extend the reassoc 'IL' to allow
a repeat factor per op?  I realize what you do is all within what reassoc
already does though ideally we would not require any GIMPLE IL changes
for building up / optimizing the reassoc IL but only do so when we commit
changes.

Thanks,
Richard.

> Bill
>
>
> gcc:
>
> 2012-03-06  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
>
>        * tree-pass.h: Replace pass_reassoc with pass_early_reassoc and
>        pass_late_reassoc.
>        * passes.c (init_optimization_passes): Change pass_reassoc calls to
>        pass_early_reassoc and pass_late_reassoc.
>        * tree-ssa-reassoc.c (reassociate_stats): Add two fields.
>        (early_reassoc): New static var.
>        (MAX_POW_EXPAND): New #define'd constant.
>        (linear_expand_pow_common): New function.
>        (linear_expand_powi): Likewise.
>        (linear_expand_pow): Likewise.
>        (break_up_subtract_bb): Attempt to expand __builtin_pow[i].
>        (repeat_factor_d): New struct and associated typedefs.
>        (repeat_factor_vec): New static vector variable.
>        (compare_repeat_factors): New function.
>        (get_reassoc_pow_ssa_name): Likewise.
>        (attempt_builtin_powi): Likewise.
>        (reassociate_bb): Attempt to reconstitute __builtin_powi calls, and
>        multiply their results by any leftover reassociated factors.
>        (fini_reassoc): Two new calls to statistics_counter_event.
>        (execute_early_reassoc): New function.
>        (execute_late_reassoc): Likewise.
>        (pass_early_reassoc): Replace pass_reassoc, renamed to reassoc1,
>        call execute_early_reassoc.
>        (pass_late_reassoc): New gimple_opt_pass named reassoc2 that calls
>        execute_late_reassoc.
>
> gcc/testsuite:
>
> 2012-03-06  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
>
>        * gcc.dg/pr46309.c: Change -fdump-tree-reassoc-details to
>        -fdump-tree-reassoc[12]-details.
>        * gcc.dg/tree-ssa/pr18589-1.c: New test.
>        * gcc.dg/tree-ssa/pr18589-2.c: Likewise.
>        * gcc.dg/tree-ssa/pr18589-3.c: Likewise.
>        * gcc.dg/tree-ssa/pr18589-4.c: Likewise.
>        * gcc.dg/tree-ssa/pr18589-5.c: Likewise.
>        * gcc.dg/tree-ssa/pr18589-6.c: Likewise.
>        * gcc.dg/tree-ssa/pr18589-7.c: Likewise.
>        * gcc.dg/tree-ssa/pr18589-8.c: Likewise.
>
>
> Index: gcc/tree-pass.h
> ===================================================================
> --- gcc/tree-pass.h     (revision 184997)
> +++ gcc/tree-pass.h     (working copy)
> @@ -440,7 +440,8 @@ extern struct gimple_opt_pass pass_copy_prop;
>  extern struct gimple_opt_pass pass_vrp;
>  extern struct gimple_opt_pass pass_uncprop;
>  extern struct gimple_opt_pass pass_return_slot;
> -extern struct gimple_opt_pass pass_reassoc;
> +extern struct gimple_opt_pass pass_early_reassoc;
> +extern struct gimple_opt_pass pass_late_reassoc;
>  extern struct gimple_opt_pass pass_rebuild_cgraph_edges;
>  extern struct gimple_opt_pass pass_remove_cgraph_callee_edges;
>  extern struct gimple_opt_pass pass_build_cgraph_edges;
> Index: gcc/testsuite/gcc.dg/pr46309.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/pr46309.c      (revision 184997)
> +++ gcc/testsuite/gcc.dg/pr46309.c      (working copy)
> @@ -1,6 +1,6 @@
>  /* PR tree-optimization/46309 */
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -fdump-tree-reassoc-details" } */
> +/* { dg-options "-O2 -fdump-tree-reassoc1-details -fdump-tree-reassoc2-details" } */
>  /* The transformation depends on BRANCH_COST being greater than 1
>    (see the notes in the PR), so try to force that.  */
>  /* { dg-additional-options "-mtune=octeon2" { target mips*-*-* } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c   (revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +double baz (double x, double y, double z, double u)
> +{
> +  return x * x * y * y * y * z * z * z * z * u;
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 7 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c   (revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +double baz (double x, double y, double z, double u)
> +{
> +  return x * x * x * y * y * y * z * z * z * z * u * u * u * u;
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 6 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c   (revision 0)
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +double baz (double x, double y)
> +{
> +  return __builtin_pow (x, -4.0) * __builtin_pow (y, -4.0);
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times " / " 1 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c   (revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +float baz (float x, float y)
> +{
> +  return x * x * x * x * y * y * y * y;
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c   (revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +long double baz (long double x, long double y)
> +{
> +  return x * x * x * x * y * y * y * y;
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c   (revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +double baz (double x, double y)
> +{
> +  return x * x * x * x * y * y * y * y;
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c   (revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +double baz (double x, double y)
> +{
> +  return x * y * y * x * y * x * x * y;
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c   (revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +double baz (double x, double y, double z)
> +{
> +  return x * x * y * y * y * z * z * z * z;
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 6 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/passes.c
> ===================================================================
> --- gcc/passes.c        (revision 184997)
> +++ gcc/passes.c        (working copy)
> @@ -1325,7 +1325,7 @@ init_optimization_passes (void)
>         opportunities.  */
>       NEXT_PASS (pass_phi_only_cprop);
>       NEXT_PASS (pass_dse);
> -      NEXT_PASS (pass_reassoc);
> +      NEXT_PASS (pass_early_reassoc);
>       NEXT_PASS (pass_dce);
>       NEXT_PASS (pass_forwprop);
>       NEXT_PASS (pass_phiopt);
> @@ -1377,7 +1377,7 @@ init_optimization_passes (void)
>        }
>       NEXT_PASS (pass_lower_vector_ssa);
>       NEXT_PASS (pass_cse_reciprocals);
> -      NEXT_PASS (pass_reassoc);
> +      NEXT_PASS (pass_late_reassoc);
>       NEXT_PASS (pass_vrp);
>       NEXT_PASS (pass_dominator);
>       /* The only const/copy propagation opportunities left after
> Index: gcc/tree-ssa-reassoc.c
> ===================================================================
> --- gcc/tree-ssa-reassoc.c      (revision 184997)
> +++ gcc/tree-ssa-reassoc.c      (working copy)
> @@ -53,6 +53,9 @@ along with GCC; see the file COPYING3.  If not see
>     1. Breaking up subtract operations into addition + negate, where
>     it would promote the reassociation of adds.
>
> +    1a. During the same pass, expanding __builtin_pow variants with
> +    small integer exponents into linearized multiply trees.
> +
>     2. Left linearization of the expression trees, so that (A+B)+(C+D)
>     becomes (((A+B)+C)+D), which is easier for us to rewrite later.
>     During linearization, we place the operands of the binary
> @@ -61,6 +64,10 @@ along with GCC; see the file COPYING3.  If not see
>     3. Optimization of the operand lists, eliminating things like a +
>     -a, a & a, etc.
>
> +    3a. Combine repeated factors with the same occurrence counts
> +    into a __builtin_powi call that will later be optimized into
> +    an optimal number of multiplies.
> +
>     4. Rewrite the expression trees we linearized and optimized so
>     they are in proper rank order.
>
> @@ -169,6 +176,8 @@ static struct
>   int constants_eliminated;
>   int ops_eliminated;
>   int rewritten;
> +  int pows_expanded;
> +  int pows_created;
>  } reassociate_stats;
>
>  /* Operator, rank pair.  */
> @@ -190,6 +199,12 @@ static int next_operand_entry_id;
>    depth.  */
>  static long *bb_rank;
>
> +/* Distinguish between early and late reassociation passes.  Early
> +   reassociation expands and rebuilds __builtin_pow* calls.  This
> +   is not done in late reassociation to preserve the builtin
> +   optimizations performed in pass_cse_sincos.  */
> +static bool early_reassoc;
> +
>  /* Operand->rank hashtable.  */
>  static struct pointer_map_t *operand_rank;
>
> @@ -2759,6 +2774,164 @@ can_reassociate_p (tree op)
>   return false;
>  }
>
> +/* Define a limit on the exponent of a __builtin_pow or __builtin_powi
> +   that will be converted into a linear chain of multiplies prior to
> +   reassociation.  */
> +
> +#define MAX_POW_EXPAND 32
> +
> +/* Given STMT at *GSIP that is a __builtin_pow or __builtin_powi
> +   operation with base ARG0 and integer power EXPONENT, transform
> +   it into a linear chain of multiplies, provided that EXPONENT is
> +   of sufficiently small magnitude. */
> +static void
> +linear_expand_pow_common (gimple stmt, gimple_stmt_iterator *gsip,
> +                         tree arg0, HOST_WIDE_INT exponent)
> +{
> +  tree lhs = gimple_call_lhs (stmt);
> +  HOST_WIDE_INT abs_exp = abs (exponent);
> +  location_t loc = gimple_location (stmt);
> +  gimple mul_stmt = NULL;
> +  gimple div_stmt = NULL;
> +  tree result, type, target;
> +  gimple copy_stmt;
> +
> +  if (abs_exp > MAX_POW_EXPAND)
> +    return;
> +
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file, "Expanding builtin pow/powi: ");
> +      print_gimple_stmt (dump_file, stmt, 0, 0);
> +    }
> +
> +  reassociate_stats.pows_expanded++;
> +
> +  type = TREE_TYPE (arg0);
> +
> +  if (exponent == 0)
> +    result = build_real (type, dconst1);
> +  else if (exponent == 1)
> +    result = arg0;
> +  else
> +    {
> +      tree target_ssa;
> +
> +      target = create_tmp_reg (type, "reassocpow");
> +      add_referenced_var (target);
> +
> +      if (exponent == -1)
> +       result = arg0;
> +      else
> +       {
> +         unsigned i;
> +
> +         target_ssa = make_ssa_name (target, NULL);
> +         mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
> +                                                  arg0, arg0);
> +         gimple_set_location (mul_stmt, loc);
> +         gsi_insert_before (gsip, mul_stmt, GSI_SAME_STMT);
> +
> +         for (i = abs_exp - 2; i > 0; i--)
> +           {
> +             tree prev_target_ssa = target_ssa;
> +             target_ssa = make_ssa_name (target, NULL);
> +             mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
> +                                                      prev_target_ssa, arg0);
> +             gimple_set_location (mul_stmt, loc);
> +             gsi_insert_before (gsip, mul_stmt, GSI_SAME_STMT);
> +           }
> +
> +         result = target_ssa;
> +       }
> +
> +      /* Possible TODO:  If we expand two __builtin_pow's with different
> +        negative exponents, the introduction of the RDIVs for inversion
> +        prevents combining their factors.  (If they have the same negative
> +        exponent, expression folding combines them as expected.)  I'm
> +        not worrying about this as it should be quite rare in practice.  */
> +      if (exponent < 0)
> +       {
> +         target_ssa = make_ssa_name (target, NULL);
> +         div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target_ssa,
> +                                                  build_real (type, dconst1),
> +                                                  result);
> +         gimple_set_location (div_stmt, loc);
> +         gsi_insert_before (gsip, div_stmt, GSI_SAME_STMT);
> +         result = target_ssa;
> +       }
> +    }
> +
> +  /* If we introduced multiplies but no inversion, avoid introducing a
> +     copy so that the copy doesn't truncate a larger reassociation chain.  */
> +  if (mul_stmt && !div_stmt)
> +    {
> +      unlink_stmt_vdef (stmt);
> +      gimple_set_lhs (mul_stmt, lhs);
> +      gsi_remove (gsip, true);
> +      update_stmt (mul_stmt);
> +      /* If we're at the end of the block, leave the iterator in a
> +        usable state.  */
> +      if (gsi_end_p (*gsip))
> +       *gsip = gsi_for_stmt (mul_stmt);
> +    }
> +  else
> +    {
> +      copy_stmt = gimple_build_assign (lhs, result);
> +      gimple_set_location (copy_stmt, loc);
> +      unlink_stmt_vdef (stmt);
> +      gsi_replace (gsip, copy_stmt, true);
> +    }
> +}
> +
> +/* Transform __builtin_powi into a linear chain of multiplies,
> +   if the exponent is of sufficiently small magnitude.  */
> +
> +static void
> +linear_expand_powi (gimple stmt, gimple_stmt_iterator *gsip)
> +{
> +  tree arg0 = gimple_call_arg (stmt, 0);
> +  tree arg1 = gimple_call_arg (stmt, 1);
> +  HOST_WIDE_INT exponent;
> +
> +  if (TREE_CODE (arg1) != INTEGER_CST)
> +    return;
> +
> +  if (host_integerp (arg1, 0))
> +    {
> +      exponent = TREE_INT_CST_LOW (arg1);
> +      linear_expand_pow_common (stmt, gsip, arg0, exponent);
> +    }
> +}
> +
> +/* Transform __builtin_pow into a linear chain of multiplies,
> +   if the exponent is constant and equivalent to an integer of
> +   sufficiently small magnitude.  */
> +
> +static void
> +linear_expand_pow (gimple stmt, gimple_stmt_iterator *gsip)
> +{
> +  tree arg0 = gimple_call_arg (stmt, 0);
> +  tree arg1 = gimple_call_arg (stmt, 1);
> +  REAL_VALUE_TYPE c, cint;
> +  HOST_WIDE_INT n;
> +
> +  /* The exponent must be a constant equivalent to an integer that
> +     fits in a HOST_WIDE_INT.  */
> +  if (TREE_CODE (arg1) != REAL_CST)
> +    return;
> +
> +  c = TREE_REAL_CST (arg1);
> +  if (REAL_EXP (&c) > HOST_BITS_PER_WIDE_INT)
> +    return;
> +
> +  n = real_to_integer (&c);
> +  real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
> +
> +  if (real_identical (&c, &cint))
> +    linear_expand_pow_common (stmt, gsip, arg0, n);
> +}
> +
>  /* Break up subtract operations in block BB.
>
>    We do this top down because we don't know whether the subtract is
> @@ -2784,9 +2957,32 @@ break_up_subtract_bb (basic_block bb)
>
>   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>     {
> +      tree fndecl;
>       gimple stmt = gsi_stmt (gsi);
>       gimple_set_visited (stmt, false);
>
> +      /* Look for __builtin_pow and __builtin_powi calls,
> +        and expand them to multiplies if possible.  */
> +      if (early_reassoc
> +         && flag_unsafe_math_optimizations
> +         && is_gimple_call (stmt)
> +         && (fndecl = gimple_call_fndecl (stmt))
> +         && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
> +       {
> +         switch (DECL_FUNCTION_CODE (fndecl))
> +           {
> +           CASE_FLT_FN (BUILT_IN_POW):
> +             linear_expand_pow (stmt, &gsi);
> +             break;
> +           CASE_FLT_FN (BUILT_IN_POWI):
> +             linear_expand_powi (stmt, &gsi);
> +             break;
> +           default:
> +             ;
> +           }
> +         continue;
> +       }
> +
>       if (!is_gimple_assign (stmt)
>          || !can_reassociate_p (gimple_assign_lhs (stmt)))
>        continue;
> @@ -2815,6 +3011,342 @@ break_up_subtract_bb (basic_block bb)
>     break_up_subtract_bb (son);
>  }
>
> +/* Used for repeated factor analysis.  */
> +struct repeat_factor_d
> +{
> +  /* An SSA name that occurs in a multiply chain.  */
> +  tree factor;
> +
> +  /* Cached rank of the factor.  */
> +  unsigned rank;
> +
> +  /* Number of occurrences of the factor in the chain.  */
> +  HOST_WIDE_INT count;
> +
> +  /* An SSA name representing the product of this factor and
> +     all factors appearing later in the repeated factor vector.  */
> +  tree repr;
> +};
> +
> +typedef struct repeat_factor_d repeat_factor, *repeat_factor_t;
> +typedef const struct repeat_factor_d *const_repeat_factor_t;
> +
> +DEF_VEC_O (repeat_factor);
> +DEF_VEC_ALLOC_O (repeat_factor, heap);
> +
> +static VEC (repeat_factor, heap) *repeat_factor_vec;
> +
> +/* Used for sorting the repeat factor vector.  Sort primarily by
> +   ascending occurrence count, secondarily by descending rank.  */
> +
> +static int
> +compare_repeat_factors (const void *x1, const void *x2)
> +{
> +  const_repeat_factor_t rf1 = (const_repeat_factor_t) x1;
> +  const_repeat_factor_t rf2 = (const_repeat_factor_t) x2;
> +
> +  if (rf1->count != rf2->count)
> +    return rf1->count - rf2->count;
> +
> +  return rf2->rank - rf1->rank;
> +}
> +
> +/* Get a new SSA name for register variable *TARGET of type TYPE.
> +   If *TARGET is null or incompatible with TYPE, create the variable
> +   first.  */
> +
> +static tree
> +get_reassoc_pow_ssa_name (tree *target, tree type)
> +{
> +  if (!*target || !types_compatible_p (type, TREE_TYPE (*target)))
> +    {
> +      *target = create_tmp_reg (type, "reassocpow");
> +      add_referenced_var (*target);
> +    }
> +
> +  return make_ssa_name (*target, NULL);
> +}
> +
> +/* Look for repeated operands in OPS in the multiply tree rooted at
> +   STMT.  Replace them with an optimal sequence of multiplies and powi
> +   builtin calls, and remove the used operands from OPS.  Return an
> +   SSA name representing the value of the replacement sequence.  */
> +
> +static tree
> +attempt_builtin_powi (gimple stmt, VEC(operand_entry_t, heap) **ops,
> +                     tree *target)
> +{
> +  unsigned i, j, cached, vec_len;
> +  int ii;
> +  operand_entry_t oe;
> +  repeat_factor_t rf1, rf2;
> +  repeat_factor rfnew;
> +  tree target_ssa;
> +  tree result = NULL_TREE;
> +  tree type = TREE_TYPE (gimple_get_lhs (stmt));
> +  tree powi_fndecl = mathfn_built_in (type, BUILT_IN_POWI);
> +  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
> +  gimple mul_stmt, pow_stmt;
> +
> +  /* Nothing to do if BUILT_IN_POWI doesn't exist for this type and
> +     target.  */
> +  if (!powi_fndecl)
> +    return NULL_TREE;
> +
> +  /* Allocate the repeated factor vector.  */
> +  repeat_factor_vec = VEC_alloc (repeat_factor, heap, 10);
> +
> +  /* Scan the OPS vector for all SSA names in the product and build
> +     up a vector of occurrence counts for each factor.  */
> +  FOR_EACH_VEC_ELT (operand_entry_t, *ops, i, oe)
> +    {
> +      if (TREE_CODE (oe->op) == SSA_NAME)
> +       {
> +         FOR_EACH_VEC_ELT (repeat_factor, repeat_factor_vec, j, rf1)
> +           {
> +             if (rf1->factor == oe->op)
> +               {
> +                 rf1->count++;
> +                 break;
> +               }
> +           }
> +
> +         if (j >= VEC_length (repeat_factor, repeat_factor_vec))
> +           {
> +             rfnew.factor = oe->op;
> +             rfnew.rank = oe->rank;
> +             rfnew.count = 1;
> +             rfnew.repr = NULL_TREE;
> +             VEC_safe_push (repeat_factor, heap, repeat_factor_vec, &rfnew);
> +           }
> +       }
> +    }
> +
> +  vec_len = VEC_length (repeat_factor, repeat_factor_vec);
> +
> +  /* Sort the repeated factor vector by (a) increasing occurrence count,
> +     and (b) decreasing rank.  */
> +  VEC_qsort (repeat_factor, repeat_factor_vec, compare_repeat_factors);
> +
> +  /* There are a variety of ways we could combine factors with different
> +     occurrence counts.  It seems best in practice to try to combine as
> +     many base factors as possible into a product before applying
> +     builtin_powi to the result.  The sort order chosen for the repeated
> +     factor vector allows us to cache partial results for the product
> +     of the base factors for subsequent use.
> +
> +     As an example, consider x * x * y * y * y * y * z * z * z * z.
> +     We want to first compose the product x * y * z, raise it to the
> +     second power, and then multiply this by the product y * z raised
> +     to the second power.  This can be done in 5 multiplies provided
> +     we cache y * z for use in both expressions:
> +
> +        t1 = y * z
> +       t2 = t1 * x
> +       t3 = t2 * t2
> +       t4 = t1 * t1
> +       result = t3 * t4
> +
> +     Alternatively we could also do this in 5 multiplies by first
> +     calculating y * z, squaring it twice, and multiplying by x * x.
> +     However, if the occurrence counts were not powers of 2 as in
> +     this example, combining higher occurrence counts first would
> +     require more multiplies than combining lower ones first.  */
> +
> +  /* Repeatedly look for opportunities to create a builtin_powi call.  */
> +  while (true)
> +    {
> +      HOST_WIDE_INT power;
> +
> +      /* Find the first factor in the repeated factor vector whose
> +        occurrence count is at least 2.  If no such factor exists,
> +        there are no builtin_powi opportunities remaining.  */
> +      FOR_EACH_VEC_ELT (repeat_factor, repeat_factor_vec, j, rf1)
> +       {
> +         if (rf1->count >= 2)
> +           break;
> +       }
> +
> +      if (j >= vec_len)
> +       break;
> +
> +      power = rf1->count;
> +
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       {
> +         unsigned elt;
> +         repeat_factor_t rf;
> +         fputs ("Building __builtin_pow call for (", dump_file);
> +         for (elt = j; elt < vec_len; elt++)
> +           {
> +             rf = VEC_index (repeat_factor, repeat_factor_vec, elt);
> +             print_generic_expr (dump_file, rf->factor, 0);
> +             if (elt < vec_len - 1)
> +               fputs (" * ", dump_file);
> +           }
> +         fprintf (dump_file, ")^%ld\n", power);
> +       }
> +
> +      reassociate_stats.pows_created++;
> +
> +      /* Visit each element of the vector in reverse order (so that
> +        high-occurrence elements are visited first, and within the
> +        same occurrence count, lower-ranked elements are visited
> +        first).  Form a linear product of all elements in this order
> +        whose occurrence count is at least that of element J.
> +        Record the SSA name representing the product of each element
> +        with all subsequent elements in the vector.  */
> +      if (j == vec_len - 1)
> +       rf1->repr = rf1->factor;
> +      else
> +       {
> +         for (ii = vec_len - 2; ii >= (int)j; ii--)
> +           {
> +             tree op1, op2;
> +
> +             rf1 = VEC_index (repeat_factor, repeat_factor_vec, ii);
> +             rf2 = VEC_index (repeat_factor, repeat_factor_vec, ii + 1);
> +
> +             /* Init the last factor's representative to be itself.  */
> +             if (!rf2->repr)
> +               rf2->repr = rf2->factor;
> +
> +             op1 = rf1->factor;
> +             op2 = rf2->repr;
> +
> +             target_ssa = get_reassoc_pow_ssa_name (target, type);
> +             mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
> +                                                      op1, op2);
> +             gimple_set_location (mul_stmt, gimple_location (stmt));
> +             gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
> +             rf1->repr = target_ssa;
> +           }
> +       }
> +
> +      /* Form a call to __builtin_powi for the maximum product
> +        just formed, raised to the power obtained earlier.  */
> +      rf1 = VEC_index (repeat_factor, repeat_factor_vec, j);
> +      target_ssa = get_reassoc_pow_ssa_name (target, type);
> +      pow_stmt = gimple_build_call (powi_fndecl, 2, rf1->repr,
> +                                   build_int_cst (integer_type_node, power));
> +      gimple_call_set_lhs (pow_stmt, target_ssa);
> +      gimple_set_location (pow_stmt, gimple_location (stmt));
> +      gsi_insert_before (&gsi, pow_stmt, GSI_SAME_STMT);
> +
> +      /* Decrement the occurrence count of each element in the product
> +        by the count found above, and remove this many copies of each
> +        factor from OPS.  */
> +      for (i = j; i < vec_len; i++)
> +       {
> +         unsigned k = power;
> +         unsigned n;
> +
> +         rf1 = VEC_index (repeat_factor, repeat_factor_vec, i);
> +         rf1->count -= power;
> +
> +         FOR_EACH_VEC_ELT_REVERSE (operand_entry_t, *ops, n, oe)
> +           {
> +             if (oe->op == rf1->factor)
> +               {
> +                 VEC_ordered_remove (operand_entry_t, *ops, n);
> +                 if (--k == 0)
> +                   break;
> +               }
> +           }
> +       }
> +
> +      /* If we previously formed at least one other builtin_powi call,
> +        form the product of this one and those others.  */
> +      if (result)
> +       {
> +         tree new_target_ssa = get_reassoc_pow_ssa_name (target, type);
> +         mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, new_target_ssa,
> +                                                  target_ssa, result);
> +         gimple_set_location (mul_stmt, gimple_location (stmt));
> +         gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
> +         result = new_target_ssa;
> +       }
> +      else
> +       result = target_ssa;
> +    }
> +
> +  /* At this point all elements in the repeated factor vector have a
> +     remaining occurrence count of 0 or 1.  Scanning the vector in
> +     reverse order, find the last element with a 1 before a 0 is found.
> +     If this element has an SSA representative and is not the last
> +     element, then it represents a multiply we have already calculated.
> +     Multiply the result so far by this SSA name.  Remove one copy of
> +     each factor in the product from OPS.  */
> +  cached = vec_len;
> +
> +  for (ii = vec_len - 1; ii >= 0; ii--)
> +    {
> +      rf1 = VEC_index (repeat_factor, repeat_factor_vec, ii);
> +      if (rf1->count == 0)
> +       {
> +         cached = ii + 1;
> +         break;
> +       }
> +    }
> +
> +  if (cached < vec_len)
> +    {
> +      rf1 = VEC_index (repeat_factor, repeat_factor_vec, cached);
> +
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       {
> +         unsigned elt;
> +         repeat_factor_t rf;
> +         fputs ("Multiplying by cached product ", dump_file);
> +         for (elt = cached; elt < vec_len; elt++)
> +           {
> +             rf = VEC_index (repeat_factor, repeat_factor_vec, elt);
> +             print_generic_expr (dump_file, rf->factor, 0);
> +             if (elt < vec_len - 1)
> +               fputs (" * ", dump_file);
> +           }
> +         fputs ("\n", dump_file);
> +       }
> +
> +      if (!result)
> +       result = rf1->repr;
> +      else
> +       {
> +         tree new_target_ssa = get_reassoc_pow_ssa_name (target, type);
> +         mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, new_target_ssa,
> +                                                  rf1->repr, result);
> +         gimple_set_location (mul_stmt, gimple_location (stmt));
> +         gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
> +         result = new_target_ssa;
> +       }
> +    }
> +
> +  for (i = cached; i < vec_len; i++)
> +    {
> +      unsigned n;
> +
> +      rf1 = VEC_index (repeat_factor, repeat_factor_vec, i);
> +      rf1->count--;
> +
> +      FOR_EACH_VEC_ELT_REVERSE (operand_entry_t, *ops, n, oe)
> +       {
> +         if (oe->op == rf1->factor)
> +           {
> +             VEC_ordered_remove (operand_entry_t, *ops, n);
> +             break;
> +           }
> +       }
> +    }
> +
> +  /* Clean up.  */
> +  VEC_free (repeat_factor, heap, repeat_factor_vec);
> +
> +  /* Return the final product as computed herein.  Note that there
> +     may still be some elements with single occurrence count left
> +     in OPS; those will be handled by the normal reassociation logic.  */
> +  return result;
> +}
> +
>  /* Reassociate expressions in basic block BB and its post-dominator as
>    children.  */
>
> @@ -2823,6 +3355,7 @@ reassociate_bb (basic_block bb)
>  {
>   gimple_stmt_iterator gsi;
>   basic_block son;
> +  tree target = NULL_TREE;
>
>   for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
>     {
> @@ -2883,6 +3416,8 @@ reassociate_bb (basic_block bb)
>
>          if (associative_tree_code (rhs_code))
>            {
> +             tree powi_result = NULL_TREE;
> +
>              VEC(operand_entry_t, heap) *ops = NULL;
>
>              /* There may be no immediate uses left by the time we
> @@ -2904,7 +3439,14 @@ reassociate_bb (basic_block bb)
>              if (rhs_code == BIT_IOR_EXPR || rhs_code == BIT_AND_EXPR)
>                optimize_range_tests (rhs_code, &ops);
>
> -             if (VEC_length (operand_entry_t, ops) == 1)
> +             if (early_reassoc
> +                 && rhs_code == MULT_EXPR
> +                 && flag_unsafe_math_optimizations)
> +               powi_result = attempt_builtin_powi (stmt, &ops, &target);
> +
> +             /* If the operand vector is now empty, all operands were
> +                consumed by the __builtin_pow optimization.  */
> +             if (VEC_length (operand_entry_t, ops) == 0)
>                {
>                  if (dump_file && (dump_flags & TDF_DETAILS))
>                    {
> @@ -2913,9 +3455,7 @@ reassociate_bb (basic_block bb)
>                    }
>
>                  rhs1 = gimple_assign_rhs1 (stmt);
> -                 gimple_assign_set_rhs_from_tree (&gsi,
> -                                                  VEC_last (operand_entry_t,
> -                                                            ops)->op);
> +                 gimple_assign_set_rhs_from_tree (&gsi, powi_result);
>                  update_stmt (stmt);
>                  remove_visited_stmt_chain (rhs1);
>
> @@ -2925,6 +3465,39 @@ reassociate_bb (basic_block bb)
>                      print_gimple_stmt (dump_file, stmt, 0, 0);
>                    }
>                }
> +
> +             else if (VEC_length (operand_entry_t, ops) == 1)
> +               {
> +                 tree last_op = VEC_last (operand_entry_t, ops)->op;
> +                 if (dump_file && (dump_flags & TDF_DETAILS))
> +                   {
> +                     fprintf (dump_file, "Transforming ");
> +                     print_gimple_stmt (dump_file, stmt, 0, 0);
> +                   }
> +
> +                 if (powi_result)
> +                   {
> +                     rhs1 = gimple_assign_rhs1 (stmt);
> +                     gimple_assign_set_rhs_with_ops_1 (&gsi, MULT_EXPR,
> +                                                       powi_result, last_op,
> +                                                       NULL_TREE);
> +                     update_stmt (gsi_stmt (gsi));
> +                     remove_visited_stmt_chain (rhs1);
> +                   }
> +                 else
> +                   {
> +                     rhs1 = gimple_assign_rhs1 (stmt);
> +                     gimple_assign_set_rhs_from_tree (&gsi, last_op);
> +                     update_stmt (stmt);
> +                     remove_visited_stmt_chain (rhs1);
> +                   }
> +
> +                 if (dump_file && (dump_flags & TDF_DETAILS))
> +                   {
> +                     fprintf (dump_file, " into ");
> +                     print_gimple_stmt (dump_file, stmt, 0, 0);
> +                   }
> +               }
>              else
>                {
>                  enum machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
> @@ -2940,6 +3513,24 @@ reassociate_bb (basic_block bb)
>                    rewrite_expr_tree_parallel (stmt, width, ops);
>                  else
>                    rewrite_expr_tree (stmt, 0, ops, false);
> +
> +                 /* If we combined some repeated factors into a
> +                    __builtin_powi call, multiply that result by the
> +                    reassociated operands.  */
> +                 if (powi_result)
> +                   {
> +                     gimple mul_stmt;
> +                     tree type = TREE_TYPE (gimple_get_lhs (stmt));
> +                     tree target_ssa = get_reassoc_pow_ssa_name (&target,
> +                                                                 type);
> +                     gimple_set_lhs (stmt, target_ssa);
> +                     update_stmt (stmt);
> +                     mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, lhs,
> +                                                              powi_result,
> +                                                              target_ssa);
> +                     gimple_set_location (mul_stmt, gimple_location (stmt));
> +                     gsi_insert_after (&gsi, mul_stmt, GSI_NEW_STMT);
> +                   }
>                }
>
>              VEC_free (operand_entry_t, heap, ops);
> @@ -3054,6 +3645,10 @@ fini_reassoc (void)
>                            reassociate_stats.ops_eliminated);
>   statistics_counter_event (cfun, "Statements rewritten",
>                            reassociate_stats.rewritten);
> +  statistics_counter_event (cfun, "Built-in pow[i] calls expanded",
> +                           reassociate_stats.pows_expanded);
> +  statistics_counter_event (cfun, "Built-in powi calls created",
> +                           reassociate_stats.pows_created);
>
>   pointer_map_destroy (operand_rank);
>   free_alloc_pool (operand_entry_pool);
> @@ -3077,19 +3672,33 @@ execute_reassoc (void)
>   return 0;
>  }
>
> +static unsigned int
> +execute_early_reassoc (void)
> +{
> +  early_reassoc = true;
> +  return execute_reassoc ();
> +}
> +
> +static unsigned int
> +execute_late_reassoc (void)
> +{
> +  early_reassoc = false;
> +  return execute_reassoc ();
> +}
> +
>  static bool
>  gate_tree_ssa_reassoc (void)
>  {
>   return flag_tree_reassoc != 0;
>  }
>
> -struct gimple_opt_pass pass_reassoc =
> +struct gimple_opt_pass pass_early_reassoc =
>  {
>  {
>   GIMPLE_PASS,
> -  "reassoc",                           /* name */
> +  "reassoc1",                          /* name */
>   gate_tree_ssa_reassoc,               /* gate */
> -  execute_reassoc,                     /* execute */
> +  execute_early_reassoc,               /* execute */
>   NULL,                                        /* sub */
>   NULL,                                        /* next */
>   0,                                   /* static_pass_number */
> @@ -3103,3 +3712,24 @@ gate_tree_ssa_reassoc (void)
>     | TODO_ggc_collect                 /* todo_flags_finish */
>  }
>  };
> +
> +struct gimple_opt_pass pass_late_reassoc =
> +{
> + {
> +  GIMPLE_PASS,
> +  "reassoc2",                          /* name */
> +  gate_tree_ssa_reassoc,               /* gate */
> +  execute_late_reassoc,                        /* execute */
> +  NULL,                                        /* sub */
> +  NULL,                                        /* next */
> +  0,                                   /* static_pass_number */
> +  TV_TREE_REASSOC,                     /* tv_id */
> +  PROP_cfg | PROP_ssa,                 /* properties_required */
> +  0,                                   /* properties_provided */
> +  0,                                   /* properties_destroyed */
> +  0,                                   /* todo_flags_start */
> +  TODO_verify_ssa
> +    | TODO_verify_flow
> +    | TODO_ggc_collect                 /* todo_flags_finish */
> + }
> +};
>
>
Bill Schmidt March 28, 2012, 4:44 p.m. UTC | #2
On Wed, 2012-03-28 at 15:57 +0200, Richard Guenther wrote:
> On Tue, Mar 6, 2012 at 9:49 PM, William J. Schmidt
> <wschmidt@linux.vnet.ibm.com> wrote:
> > Hi,
> >
> > This is a re-post of the patch I posted for comments in January to
> > address http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18589.  The patch
> > modifies reassociation to expose repeated factors from __builtin_pow*
> > calls, optimally reassociate repeated factors, and possibly reconstitute
> > __builtin_powi calls from the results of reassociation.
> >
> > Bootstrapped and passes regression tests for powerpc64-linux-gnu.  I
> > expect there may need to be some small changes, but I am targeting this
> > for trunk approval.
> >
> > Thanks very much for the review,
> 
> Hmm.  How much work would it be to extend the reassoc 'IL' to allow
> a repeat factor per op?  I realize what you do is all within what reassoc
> already does though ideally we would not require any GIMPLE IL changes
> for building up / optimizing the reassoc IL but only do so when we commit
> changes.

Ah, I take your point.  I will look into it.  We still need the
additional data structures to allow sorting by factor repeat counts, but
perhaps expanding the builtins can be avoided until it's proven
necessary.  The patch as submitted may be slightly easier to implement
and understand, but I agree it would be better to avoid changing GIMPLE
unnecessarily if possible.  I'll get back to you shortly.

Thanks,
Bill

> 
> Thanks,
> Richard.
>
diff mbox

Patch

Index: gcc/tree-pass.h
===================================================================
--- gcc/tree-pass.h	(revision 184997)
+++ gcc/tree-pass.h	(working copy)
@@ -440,7 +440,8 @@  extern struct gimple_opt_pass pass_copy_prop;
 extern struct gimple_opt_pass pass_vrp;
 extern struct gimple_opt_pass pass_uncprop;
 extern struct gimple_opt_pass pass_return_slot;
-extern struct gimple_opt_pass pass_reassoc;
+extern struct gimple_opt_pass pass_early_reassoc;
+extern struct gimple_opt_pass pass_late_reassoc;
 extern struct gimple_opt_pass pass_rebuild_cgraph_edges;
 extern struct gimple_opt_pass pass_remove_cgraph_callee_edges;
 extern struct gimple_opt_pass pass_build_cgraph_edges;
Index: gcc/testsuite/gcc.dg/pr46309.c
===================================================================
--- gcc/testsuite/gcc.dg/pr46309.c	(revision 184997)
+++ gcc/testsuite/gcc.dg/pr46309.c	(working copy)
@@ -1,6 +1,6 @@ 
 /* PR tree-optimization/46309 */
 /* { dg-do compile } */
-/* { dg-options "-O2 -fdump-tree-reassoc-details" } */
+/* { dg-options "-O2 -fdump-tree-reassoc1-details -fdump-tree-reassoc2-details" } */
 /* The transformation depends on BRANCH_COST being greater than 1
    (see the notes in the PR), so try to force that.  */
 /* { dg-additional-options "-mtune=octeon2" { target mips*-*-* } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y, double z, double u)
+{
+  return x * x * y * y * y * z * z * z * z * u;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 7 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y, double z, double u)
+{
+  return x * x * x * y * y * y * z * z * z * z * u * u * u * u;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 6 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c	(revision 0)
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y)
+{
+  return __builtin_pow (x, -4.0) * __builtin_pow (y, -4.0);
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " / " 1 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+float baz (float x, float y)
+{
+  return x * x * x * x * y * y * y * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+long double baz (long double x, long double y)
+{
+  return x * x * x * x * y * y * y * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y)
+{
+  return x * x * x * x * y * y * y * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y)
+{
+  return x * y * y * x * y * x * x * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c	(revision 0)
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y, double z)
+{
+  return x * x * y * y * y * z * z * z * z;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 6 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/passes.c
===================================================================
--- gcc/passes.c	(revision 184997)
+++ gcc/passes.c	(working copy)
@@ -1325,7 +1325,7 @@  init_optimization_passes (void)
 	 opportunities.  */
       NEXT_PASS (pass_phi_only_cprop);
       NEXT_PASS (pass_dse);
-      NEXT_PASS (pass_reassoc);
+      NEXT_PASS (pass_early_reassoc);
       NEXT_PASS (pass_dce);
       NEXT_PASS (pass_forwprop);
       NEXT_PASS (pass_phiopt);
@@ -1377,7 +1377,7 @@  init_optimization_passes (void)
 	}
       NEXT_PASS (pass_lower_vector_ssa);
       NEXT_PASS (pass_cse_reciprocals);
-      NEXT_PASS (pass_reassoc);
+      NEXT_PASS (pass_late_reassoc);
       NEXT_PASS (pass_vrp);
       NEXT_PASS (pass_dominator);
       /* The only const/copy propagation opportunities left after
Index: gcc/tree-ssa-reassoc.c
===================================================================
--- gcc/tree-ssa-reassoc.c	(revision 184997)
+++ gcc/tree-ssa-reassoc.c	(working copy)
@@ -53,6 +53,9 @@  along with GCC; see the file COPYING3.  If not see
     1. Breaking up subtract operations into addition + negate, where
     it would promote the reassociation of adds.
 
+    1a. During the same pass, expanding __builtin_pow variants with
+    small integer exponents into linearized multiply trees.
+
     2. Left linearization of the expression trees, so that (A+B)+(C+D)
     becomes (((A+B)+C)+D), which is easier for us to rewrite later.
     During linearization, we place the operands of the binary
@@ -61,6 +64,10 @@  along with GCC; see the file COPYING3.  If not see
     3. Optimization of the operand lists, eliminating things like a +
     -a, a & a, etc.
 
+    3a. Combine repeated factors with the same occurrence counts
+    into a __builtin_powi call that will later be optimized into
+    an optimal number of multiplies.
+
     4. Rewrite the expression trees we linearized and optimized so
     they are in proper rank order.
 
@@ -169,6 +176,8 @@  static struct
   int constants_eliminated;
   int ops_eliminated;
   int rewritten;
+  int pows_expanded;
+  int pows_created;
 } reassociate_stats;
 
 /* Operator, rank pair.  */
@@ -190,6 +199,12 @@  static int next_operand_entry_id;
    depth.  */
 static long *bb_rank;
 
+/* Distinguish between early and late reassociation passes.  Early
+   reassociation expands and rebuilds __builtin_pow* calls.  This
+   is not done in late reassociation to preserve the builtin
+   optimizations performed in pass_cse_sincos.  */
+static bool early_reassoc;
+
 /* Operand->rank hashtable.  */
 static struct pointer_map_t *operand_rank;
 
@@ -2759,6 +2774,164 @@  can_reassociate_p (tree op)
   return false;
 }
 
+/* Define a limit on the exponent of a __builtin_pow or __builtin_powi
+   that will be converted into a linear chain of multiplies prior to
+   reassociation.  */
+
+#define MAX_POW_EXPAND 32
+
+/* Given STMT at *GSIP that is a __builtin_pow or __builtin_powi
+   operation with base ARG0 and integer power EXPONENT, transform
+   it into a linear chain of multiplies, provided that EXPONENT is
+   of sufficiently small magnitude. */
+static void
+linear_expand_pow_common (gimple stmt, gimple_stmt_iterator *gsip,
+			  tree arg0, HOST_WIDE_INT exponent)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  HOST_WIDE_INT abs_exp = abs (exponent);
+  location_t loc = gimple_location (stmt);
+  gimple mul_stmt = NULL;
+  gimple div_stmt = NULL;
+  tree result, type, target;
+  gimple copy_stmt;
+
+  if (abs_exp > MAX_POW_EXPAND)
+    return;
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Expanding builtin pow/powi: ");
+      print_gimple_stmt (dump_file, stmt, 0, 0);
+    }
+
+  reassociate_stats.pows_expanded++;
+
+  type = TREE_TYPE (arg0);
+
+  if (exponent == 0)
+    result = build_real (type, dconst1);
+  else if (exponent == 1)
+    result = arg0;
+  else
+    {
+      tree target_ssa;
+
+      target = create_tmp_reg (type, "reassocpow");
+      add_referenced_var (target);
+
+      if (exponent == -1)
+	result = arg0;
+      else
+	{
+	  unsigned i;
+
+	  target_ssa = make_ssa_name (target, NULL);
+	  mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
+						   arg0, arg0);
+	  gimple_set_location (mul_stmt, loc);
+	  gsi_insert_before (gsip, mul_stmt, GSI_SAME_STMT);
+	  
+	  for (i = abs_exp - 2; i > 0; i--)
+	    {
+	      tree prev_target_ssa = target_ssa;
+	      target_ssa = make_ssa_name (target, NULL);
+	      mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
+						       prev_target_ssa, arg0);
+	      gimple_set_location (mul_stmt, loc);
+	      gsi_insert_before (gsip, mul_stmt, GSI_SAME_STMT);
+	    }
+
+	  result = target_ssa;
+	}
+
+      /* Possible TODO:  If we expand two __builtin_pow's with different
+	 negative exponents, the introduction of the RDIVs for inversion
+	 prevents combining their factors.  (If they have the same negative 
+	 exponent, expression folding combines them as expected.)  I'm
+	 not worrying about this as it should be quite rare in practice.  */
+      if (exponent < 0)
+	{
+	  target_ssa = make_ssa_name (target, NULL);
+	  div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target_ssa,
+						   build_real (type, dconst1),
+						   result);
+	  gimple_set_location (div_stmt, loc);
+	  gsi_insert_before (gsip, div_stmt, GSI_SAME_STMT);
+	  result = target_ssa;
+	}
+    }
+  
+  /* If we introduced multiplies but no inversion, avoid introducing a
+     copy so that the copy doesn't truncate a larger reassociation chain.  */
+  if (mul_stmt && !div_stmt)
+    {
+      unlink_stmt_vdef (stmt);
+      gimple_set_lhs (mul_stmt, lhs);
+      gsi_remove (gsip, true);
+      update_stmt (mul_stmt);
+      /* If we're at the end of the block, leave the iterator in a
+	 usable state.  */
+      if (gsi_end_p (*gsip))
+	*gsip = gsi_for_stmt (mul_stmt);
+    }
+  else
+    {
+      copy_stmt = gimple_build_assign (lhs, result);
+      gimple_set_location (copy_stmt, loc);
+      unlink_stmt_vdef (stmt);
+      gsi_replace (gsip, copy_stmt, true);
+    }
+}
+
+/* Transform __builtin_powi into a linear chain of multiplies,
+   if the exponent is of sufficiently small magnitude.  */
+
+static void
+linear_expand_powi (gimple stmt, gimple_stmt_iterator *gsip)
+{
+  tree arg0 = gimple_call_arg (stmt, 0);
+  tree arg1 = gimple_call_arg (stmt, 1);
+  HOST_WIDE_INT exponent;
+
+  if (TREE_CODE (arg1) != INTEGER_CST)
+    return;
+
+  if (host_integerp (arg1, 0))
+    {
+      exponent = TREE_INT_CST_LOW (arg1);
+      linear_expand_pow_common (stmt, gsip, arg0, exponent);
+    }
+}
+
+/* Transform __builtin_pow into a linear chain of multiplies,
+   if the exponent is constant and equivalent to an integer of
+   sufficiently small magnitude.  */
+
+static void
+linear_expand_pow (gimple stmt, gimple_stmt_iterator *gsip)
+{
+  tree arg0 = gimple_call_arg (stmt, 0);
+  tree arg1 = gimple_call_arg (stmt, 1);
+  REAL_VALUE_TYPE c, cint;
+  HOST_WIDE_INT n;
+
+  /* The exponent must be a constant equivalent to an integer that
+     fits in a HOST_WIDE_INT.  */
+  if (TREE_CODE (arg1) != REAL_CST)
+    return;
+  
+  c = TREE_REAL_CST (arg1);
+  if (REAL_EXP (&c) > HOST_BITS_PER_WIDE_INT)
+    return;
+
+  n = real_to_integer (&c);
+  real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
+
+  if (real_identical (&c, &cint))
+    linear_expand_pow_common (stmt, gsip, arg0, n);
+}
+
 /* Break up subtract operations in block BB.
 
    We do this top down because we don't know whether the subtract is
@@ -2784,9 +2957,32 @@  break_up_subtract_bb (basic_block bb)
 
   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
     {
+      tree fndecl;
       gimple stmt = gsi_stmt (gsi);
       gimple_set_visited (stmt, false);
 
+      /* Look for __builtin_pow and __builtin_powi calls,
+	 and expand them to multiplies if possible.  */
+      if (early_reassoc
+	  && flag_unsafe_math_optimizations
+	  && is_gimple_call (stmt)
+	  && (fndecl = gimple_call_fndecl (stmt))
+	  && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
+	{
+	  switch (DECL_FUNCTION_CODE (fndecl))
+	    {
+	    CASE_FLT_FN (BUILT_IN_POW):
+	      linear_expand_pow (stmt, &gsi);
+	      break;
+	    CASE_FLT_FN (BUILT_IN_POWI):
+	      linear_expand_powi (stmt, &gsi);
+	      break;
+	    default:
+	      ;
+	    }
+	  continue;
+	}
+
       if (!is_gimple_assign (stmt)
 	  || !can_reassociate_p (gimple_assign_lhs (stmt)))
 	continue;
@@ -2815,6 +3011,342 @@  break_up_subtract_bb (basic_block bb)
     break_up_subtract_bb (son);
 }
 
+/* Used for repeated factor analysis.  */
+struct repeat_factor_d
+{
+  /* An SSA name that occurs in a multiply chain.  */
+  tree factor;
+
+  /* Cached rank of the factor.  */
+  unsigned rank;
+
+  /* Number of occurrences of the factor in the chain.  */
+  HOST_WIDE_INT count;
+
+  /* An SSA name representing the product of this factor and
+     all factors appearing later in the repeated factor vector.  */
+  tree repr;
+};
+
+typedef struct repeat_factor_d repeat_factor, *repeat_factor_t;
+typedef const struct repeat_factor_d *const_repeat_factor_t;
+
+DEF_VEC_O (repeat_factor);
+DEF_VEC_ALLOC_O (repeat_factor, heap);
+
+static VEC (repeat_factor, heap) *repeat_factor_vec;
+
+/* Used for sorting the repeat factor vector.  Sort primarily by
+   ascending occurrence count, secondarily by descending rank.  */
+
+static int
+compare_repeat_factors (const void *x1, const void *x2)
+{
+  const_repeat_factor_t rf1 = (const_repeat_factor_t) x1;
+  const_repeat_factor_t rf2 = (const_repeat_factor_t) x2;
+
+  if (rf1->count != rf2->count)
+    return rf1->count - rf2->count;
+
+  return rf2->rank - rf1->rank;
+}
+
+/* Get a new SSA name for register variable *TARGET of type TYPE.
+   If *TARGET is null or incompatible with TYPE, create the variable
+   first.  */
+
+static tree
+get_reassoc_pow_ssa_name (tree *target, tree type)
+{
+  if (!*target || !types_compatible_p (type, TREE_TYPE (*target)))
+    {
+      *target = create_tmp_reg (type, "reassocpow");
+      add_referenced_var (*target);
+    }
+
+  return make_ssa_name (*target, NULL);
+}
+
+/* Look for repeated operands in OPS in the multiply tree rooted at
+   STMT.  Replace them with an optimal sequence of multiplies and powi
+   builtin calls, and remove the used operands from OPS.  Return an
+   SSA name representing the value of the replacement sequence.  */
+
+static tree
+attempt_builtin_powi (gimple stmt, VEC(operand_entry_t, heap) **ops,
+		      tree *target)
+{
+  unsigned i, j, cached, vec_len;
+  int ii;
+  operand_entry_t oe;
+  repeat_factor_t rf1, rf2;
+  repeat_factor rfnew;
+  tree target_ssa;
+  tree result = NULL_TREE;
+  tree type = TREE_TYPE (gimple_get_lhs (stmt));
+  tree powi_fndecl = mathfn_built_in (type, BUILT_IN_POWI);
+  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+  gimple mul_stmt, pow_stmt;
+
+  /* Nothing to do if BUILT_IN_POWI doesn't exist for this type and
+     target.  */
+  if (!powi_fndecl)
+    return NULL_TREE;
+
+  /* Allocate the repeated factor vector.  */
+  repeat_factor_vec = VEC_alloc (repeat_factor, heap, 10);
+
+  /* Scan the OPS vector for all SSA names in the product and build
+     up a vector of occurrence counts for each factor.  */
+  FOR_EACH_VEC_ELT (operand_entry_t, *ops, i, oe)
+    {
+      if (TREE_CODE (oe->op) == SSA_NAME)
+	{
+	  FOR_EACH_VEC_ELT (repeat_factor, repeat_factor_vec, j, rf1)
+	    {
+	      if (rf1->factor == oe->op)
+		{
+		  rf1->count++;
+		  break;
+		}
+	    }
+
+	  if (j >= VEC_length (repeat_factor, repeat_factor_vec))
+	    {
+	      rfnew.factor = oe->op;
+	      rfnew.rank = oe->rank;
+	      rfnew.count = 1;
+	      rfnew.repr = NULL_TREE;
+	      VEC_safe_push (repeat_factor, heap, repeat_factor_vec, &rfnew);
+	    }
+	}
+    }
+
+  vec_len = VEC_length (repeat_factor, repeat_factor_vec);
+  
+  /* Sort the repeated factor vector by (a) increasing occurrence count,
+     and (b) decreasing rank.  */
+  VEC_qsort (repeat_factor, repeat_factor_vec, compare_repeat_factors);
+
+  /* There are a variety of ways we could combine factors with different
+     occurrence counts.  It seems best in practice to try to combine as
+     many base factors as possible into a product before applying
+     builtin_powi to the result.  The sort order chosen for the repeated
+     factor vector allows us to cache partial results for the product
+     of the base factors for subsequent use.
+
+     As an example, consider x * x * y * y * y * y * z * z * z * z.
+     We want to first compose the product x * y * z, raise it to the
+     second power, and then multiply this by the product y * z raised
+     to the second power.  This can be done in 5 multiplies provided
+     we cache y * z for use in both expressions:
+
+        t1 = y * z
+	t2 = t1 * x
+	t3 = t2 * t2
+	t4 = t1 * t1
+	result = t3 * t4
+
+     Alternatively we could also do this in 5 multiplies by first 
+     calculating y * z, squaring it twice, and multiplying by x * x.
+     However, if the occurrence counts were not powers of 2 as in 
+     this example, combining higher occurrence counts first would 
+     require more multiplies than combining lower ones first.  */
+
+  /* Repeatedly look for opportunities to create a builtin_powi call.  */
+  while (true)
+    {
+      HOST_WIDE_INT power;
+
+      /* Find the first factor in the repeated factor vector whose 
+	 occurrence count is at least 2.  If no such factor exists,
+	 there are no builtin_powi opportunities remaining.  */
+      FOR_EACH_VEC_ELT (repeat_factor, repeat_factor_vec, j, rf1)
+	{
+	  if (rf1->count >= 2)
+	    break;
+	}
+
+      if (j >= vec_len)
+	break;
+
+      power = rf1->count;
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  unsigned elt;
+	  repeat_factor_t rf;
+	  fputs ("Building __builtin_pow call for (", dump_file);
+	  for (elt = j; elt < vec_len; elt++)
+	    {
+	      rf = VEC_index (repeat_factor, repeat_factor_vec, elt);
+	      print_generic_expr (dump_file, rf->factor, 0);
+	      if (elt < vec_len - 1)
+		fputs (" * ", dump_file);
+	    }
+	  fprintf (dump_file, ")^%ld\n", power);
+	}
+
+      reassociate_stats.pows_created++;
+
+      /* Visit each element of the vector in reverse order (so that
+	 high-occurrence elements are visited first, and within the
+	 same occurrence count, lower-ranked elements are visited
+	 first).  Form a linear product of all elements in this order
+	 whose occurrence count is at least that of element J.
+	 Record the SSA name representing the product of each element
+	 with all subsequent elements in the vector.  */
+      if (j == vec_len - 1)
+	rf1->repr = rf1->factor;
+      else
+	{
+	  for (ii = vec_len - 2; ii >= (int)j; ii--)
+	    {
+	      tree op1, op2;
+
+	      rf1 = VEC_index (repeat_factor, repeat_factor_vec, ii);
+	      rf2 = VEC_index (repeat_factor, repeat_factor_vec, ii + 1);
+
+	      /* Init the last factor's representative to be itself.  */
+	      if (!rf2->repr)
+		rf2->repr = rf2->factor;
+
+	      op1 = rf1->factor;
+	      op2 = rf2->repr;
+
+	      target_ssa = get_reassoc_pow_ssa_name (target, type);
+	      mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
+						       op1, op2);
+	      gimple_set_location (mul_stmt, gimple_location (stmt));
+	      gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
+	      rf1->repr = target_ssa;
+	    }
+	}
+
+      /* Form a call to __builtin_powi for the maximum product
+	 just formed, raised to the power obtained earlier.  */
+      rf1 = VEC_index (repeat_factor, repeat_factor_vec, j);
+      target_ssa = get_reassoc_pow_ssa_name (target, type);
+      pow_stmt = gimple_build_call (powi_fndecl, 2, rf1->repr, 
+				    build_int_cst (integer_type_node, power));
+      gimple_call_set_lhs (pow_stmt, target_ssa);
+      gimple_set_location (pow_stmt, gimple_location (stmt));
+      gsi_insert_before (&gsi, pow_stmt, GSI_SAME_STMT);
+
+      /* Decrement the occurrence count of each element in the product
+	 by the count found above, and remove this many copies of each
+	 factor from OPS.  */
+      for (i = j; i < vec_len; i++)
+	{
+	  unsigned k = power;
+	  unsigned n;
+
+	  rf1 = VEC_index (repeat_factor, repeat_factor_vec, i);
+	  rf1->count -= power;
+	  
+	  FOR_EACH_VEC_ELT_REVERSE (operand_entry_t, *ops, n, oe)
+	    {
+	      if (oe->op == rf1->factor)
+		{
+		  VEC_ordered_remove (operand_entry_t, *ops, n);
+		  if (--k == 0)
+		    break;
+		}
+	    }
+	}
+
+      /* If we previously formed at least one other builtin_powi call,
+	 form the product of this one and those others.  */
+      if (result)
+	{
+	  tree new_target_ssa = get_reassoc_pow_ssa_name (target, type);
+	  mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, new_target_ssa,
+						   target_ssa, result);
+	  gimple_set_location (mul_stmt, gimple_location (stmt));
+	  gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
+	  result = new_target_ssa;
+	}
+      else
+	result = target_ssa;
+    }
+
+  /* At this point all elements in the repeated factor vector have a
+     remaining occurrence count of 0 or 1.  Scanning the vector in
+     reverse order, find the first element whose count is 1, stopping
+     If this element has an SSA representative and is not the last
+     element, then it represents a multiply we have already calculated.
+     Multiply the result so far by this SSA name.  Remove one copy of
+     each factor in the product from OPS.  */
+  cached = vec_len;
+
+  for (ii = vec_len - 1; ii >= 0; ii--)
+    {
+      rf1 = VEC_index (repeat_factor, repeat_factor_vec, ii);
+      if (rf1->count == 0)
+	{
+	  cached = ii + 1;
+	  break;
+	}
+    }
+
+  if (cached < vec_len)
+    {
+      rf1 = VEC_index (repeat_factor, repeat_factor_vec, cached);
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  unsigned elt;
+	  repeat_factor_t rf;
+	  fputs ("Multiplying by cached product ", dump_file);
+	  for (elt = cached; elt < vec_len; elt++)
+	    {
+	      rf = VEC_index (repeat_factor, repeat_factor_vec, elt);
+	      print_generic_expr (dump_file, rf->factor, 0);
+	      if (elt < vec_len - 1)
+		fputs (" * ", dump_file);
+	    }
+	  fputs ("\n", dump_file);
+	}
+
+      if (!result)
+	result = rf1->repr;
+      else
+	{
+	  tree new_target_ssa = get_reassoc_pow_ssa_name (target, type);
+	  mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, new_target_ssa,
+						   rf1->repr, result);
+	  gimple_set_location (mul_stmt, gimple_location (stmt));
+	  gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
+	  result = new_target_ssa;
+	}
+    }
+
+  for (i = cached; i < vec_len; i++)
+    {
+      unsigned n;
+
+      rf1 = VEC_index (repeat_factor, repeat_factor_vec, i);
+      rf1->count--;
+	  
+      FOR_EACH_VEC_ELT_REVERSE (operand_entry_t, *ops, n, oe)
+	{
+	  if (oe->op == rf1->factor)
+	    {
+	      VEC_ordered_remove (operand_entry_t, *ops, n);
+	      break;
+	    }
+	}
+    }
+
+  /* Clean up.  */
+  VEC_free (repeat_factor, heap, repeat_factor_vec);
+
+  /* Return the final product as computed herein.  Note that there
+     may still be some elements with single occurrence count left
+     in OPS; those will be handled by the normal reassociation logic.  */
+  return result;
+}
+
 /* Reassociate expressions in basic block BB and its post-dominator as
    children.  */
 
@@ -2823,6 +3355,7 @@  reassociate_bb (basic_block bb)
 {
   gimple_stmt_iterator gsi;
   basic_block son;
+  tree target = NULL_TREE;
 
   for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
     {
@@ -2883,6 +3416,8 @@  reassociate_bb (basic_block bb)
 
 	  if (associative_tree_code (rhs_code))
 	    {
+	      tree powi_result = NULL_TREE;
+
 	      VEC(operand_entry_t, heap) *ops = NULL;
 
 	      /* There may be no immediate uses left by the time we
@@ -2904,7 +3439,14 @@  reassociate_bb (basic_block bb)
 	      if (rhs_code == BIT_IOR_EXPR || rhs_code == BIT_AND_EXPR)
 		optimize_range_tests (rhs_code, &ops);
 
-	      if (VEC_length (operand_entry_t, ops) == 1)
+	      if (early_reassoc
+		  && rhs_code == MULT_EXPR
+		  && flag_unsafe_math_optimizations)
+		powi_result = attempt_builtin_powi (stmt, &ops, &target);
+
+	      /* If the operand vector is now empty, all operands were
+		 consumed by the __builtin_pow optimization.  */
+	      if (VEC_length (operand_entry_t, ops) == 0)
 		{
 		  if (dump_file && (dump_flags & TDF_DETAILS))
 		    {
@@ -2913,9 +3455,7 @@  reassociate_bb (basic_block bb)
 		    }
 
 		  rhs1 = gimple_assign_rhs1 (stmt);
-		  gimple_assign_set_rhs_from_tree (&gsi,
-						   VEC_last (operand_entry_t,
-							     ops)->op);
+		  gimple_assign_set_rhs_from_tree (&gsi, powi_result);
 		  update_stmt (stmt);
 		  remove_visited_stmt_chain (rhs1);
 
@@ -2925,6 +3465,39 @@  reassociate_bb (basic_block bb)
 		      print_gimple_stmt (dump_file, stmt, 0, 0);
 		    }
 		}
+
+	      else if (VEC_length (operand_entry_t, ops) == 1)
+		{
+		  tree last_op = VEC_last (operand_entry_t, ops)->op;
+		  if (dump_file && (dump_flags & TDF_DETAILS))
+		    {
+		      fprintf (dump_file, "Transforming ");
+		      print_gimple_stmt (dump_file, stmt, 0, 0);
+		    }
+
+		  if (powi_result)
+		    {
+		      rhs1 = gimple_assign_rhs1 (stmt);
+		      gimple_assign_set_rhs_with_ops_1 (&gsi, MULT_EXPR,
+							powi_result, last_op,
+							NULL_TREE);
+		      update_stmt (gsi_stmt (gsi));
+		      remove_visited_stmt_chain (rhs1);
+		    }
+		  else
+		    {
+		      rhs1 = gimple_assign_rhs1 (stmt);
+		      gimple_assign_set_rhs_from_tree (&gsi, last_op);
+		      update_stmt (stmt);
+		      remove_visited_stmt_chain (rhs1);
+		    }
+
+		  if (dump_file && (dump_flags & TDF_DETAILS))
+		    {
+		      fprintf (dump_file, " into ");
+		      print_gimple_stmt (dump_file, stmt, 0, 0);
+		    }
+		}
 	      else
 		{
 		  enum machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
@@ -2940,6 +3513,24 @@  reassociate_bb (basic_block bb)
 		    rewrite_expr_tree_parallel (stmt, width, ops);
 		  else
 		    rewrite_expr_tree (stmt, 0, ops, false);
+
+		  /* If we combined some repeated factors into a
+		     __builtin_powi call, multiply that result by the
+		     reassociated operands.  */
+		  if (powi_result)
+		    {
+		      gimple mul_stmt;
+		      tree type = TREE_TYPE (gimple_get_lhs (stmt));
+		      tree target_ssa = get_reassoc_pow_ssa_name (&target,
+								  type);
+		      gimple_set_lhs (stmt, target_ssa);
+		      update_stmt (stmt);
+		      mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, lhs,
+							       powi_result,
+							       target_ssa);
+		      gimple_set_location (mul_stmt, gimple_location (stmt));
+		      gsi_insert_after (&gsi, mul_stmt, GSI_NEW_STMT);
+		    }
 		}
 
 	      VEC_free (operand_entry_t, heap, ops);
@@ -3054,6 +3645,10 @@  fini_reassoc (void)
 			    reassociate_stats.ops_eliminated);
   statistics_counter_event (cfun, "Statements rewritten",
 			    reassociate_stats.rewritten);
+  statistics_counter_event (cfun, "Built-in pow[i] calls expanded",
+			    reassociate_stats.pows_expanded);
+  statistics_counter_event (cfun, "Built-in powi calls created",
+			    reassociate_stats.pows_created);
 
   pointer_map_destroy (operand_rank);
   free_alloc_pool (operand_entry_pool);
@@ -3077,19 +3672,33 @@  execute_reassoc (void)
   return 0;
 }
 
+static unsigned int
+execute_early_reassoc (void)
+{
+  early_reassoc = true;
+  return execute_reassoc ();
+}
+
+static unsigned int
+execute_late_reassoc (void)
+{
+  early_reassoc = false;
+  return execute_reassoc ();
+}
+
 static bool
 gate_tree_ssa_reassoc (void)
 {
   return flag_tree_reassoc != 0;
 }
 
-struct gimple_opt_pass pass_reassoc =
+struct gimple_opt_pass pass_early_reassoc =
 {
  {
   GIMPLE_PASS,
-  "reassoc",				/* name */
+  "reassoc1",				/* name */
   gate_tree_ssa_reassoc,		/* gate */
-  execute_reassoc,			/* execute */
+  execute_early_reassoc,		/* execute */
   NULL,					/* sub */
   NULL,					/* next */
   0,					/* static_pass_number */
@@ -3103,3 +3712,24 @@  gate_tree_ssa_reassoc (void)
     | TODO_ggc_collect			/* todo_flags_finish */
  }
 };
+
+struct gimple_opt_pass pass_late_reassoc =
+{
+ {
+  GIMPLE_PASS,
+  "reassoc2",				/* name */
+  gate_tree_ssa_reassoc,		/* gate */
+  execute_late_reassoc,			/* execute */
+  NULL,					/* sub */
+  NULL,					/* next */
+  0,					/* static_pass_number */
+  TV_TREE_REASSOC,			/* tv_id */
+  PROP_cfg | PROP_ssa,			/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_verify_ssa
+    | TODO_verify_flow
+    | TODO_ggc_collect			/* todo_flags_finish */
+ }
+};