diff mbox

Add powi-to-multiply expansion to cse_sincos pass

Message ID 1306173991.4821.7.camel@L3G5336.ibm.com
State New
Headers show

Commit Message

Bill Schmidt May 23, 2011, 6:06 p.m. UTC
Richard, thanks for the excellent comments.  I really appreciate your
help.  I've implemented them all, and bootstrap/regtest succeeds on
powerpc target.  Here's the revised patch for your consideration.

Thanks,
Bill


2011-05-23  Bill Schmidt  <wschmidt@vnet.linux.ibm.com>

	* tree-ssa-math-opts.c (powi_table): New.
	(powi_lookup_cost): New.
	(powi_cost): New.
	(powi_as_mults_1): New.
	(powi_as_mults): New.
	(gimple_expand_builtin_powi): New.
	(execute_cse_sincos): Add switch case for BUILT_IN_POWI.
	(gate_cse_sincos): Remove sincos/cexp restriction.

Comments

Richard Biener May 24, 2011, 12:26 p.m. UTC | #1
On Mon, May 23, 2011 at 8:06 PM, William J. Schmidt
<wschmidt@linux.vnet.ibm.com> wrote:
> Richard, thanks for the excellent comments.  I really appreciate your
> help.  I've implemented them all, and bootstrap/regtest succeeds on
> powerpc target.  Here's the revised patch for your consideration.
>
> Thanks,
> Bill
>
>
> 2011-05-23  Bill Schmidt  <wschmidt@vnet.linux.ibm.com>
>
>        * tree-ssa-math-opts.c (powi_table): New.
>        (powi_lookup_cost): New.
>        (powi_cost): New.
>        (powi_as_mults_1): New.
>        (powi_as_mults): New.
>        (gimple_expand_builtin_powi): New.
>        (execute_cse_sincos): Add switch case for BUILT_IN_POWI.
>        (gate_cse_sincos): Remove sincos/cexp restriction.
>
> Index: gcc/ChangeLog
> ===================================================================
> --- gcc/ChangeLog       (revision 174075)
> +++ gcc/ChangeLog       (working copy)
> @@ -1,3 +1,14 @@
> +2011-05-23  Bill Schmidt  <wschmidt@vnet.linux.ibm.com>
> +
> +       * tree-ssa-math-opts.c (powi_table): New.
> +       (powi_lookup_cost): New.
> +       (powi_cost): New.
> +       (powi_as_mults_1): New.
> +       (powi_as_mults): New.
> +       (gimple_expand_builtin_powi): New.
> +       (execute_cse_sincos): Add switch case for BUILT_IN_POWI.
> +       (gate_cse_sincos): Remove sincos/cexp restriction.
> +
>  2011-05-23  Richard Guenther  <rguenther@suse.de>
>
>        * gimple.c (gimple_types_compatible_p_1): Always compare type names.
> Index: gcc/tree-ssa-math-opts.c
> ===================================================================
> --- gcc/tree-ssa-math-opts.c    (revision 174075)
> +++ gcc/tree-ssa-math-opts.c    (working copy)
> @@ -795,8 +795,243 @@ execute_cse_sincos_1 (tree name)
>   return cfg_changed;
>  }
>
> +/* To evaluate powi(x,n), the floating point value x raised to the
> +   constant integer exponent n, we use a hybrid algorithm that
> +   combines the "window method" with look-up tables.  For an
> +   introduction to exponentiation algorithms and "addition chains",
> +   see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
> +   "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
> +   3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
> +   Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
> +
> +/* Provide a default value for POWI_MAX_MULTS, the maximum number of
> +   multiplications to inline before calling the system library's pow
> +   function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
> +   so this default never requires calling pow, powf or powl.  */
> +
> +#ifndef POWI_MAX_MULTS
> +#define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
> +#endif
> +
> +/* The size of the "optimal power tree" lookup table.  All
> +   exponents less than this value are simply looked up in the
> +   powi_table below.  This threshold is also used to size the
> +   cache of pseudo registers that hold intermediate results.  */
> +#define POWI_TABLE_SIZE 256
> +
> +/* The size, in bits of the window, used in the "window method"
> +   exponentiation algorithm.  This is equivalent to a radix of
> +   (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
> +#define POWI_WINDOW_SIZE 3
> +
> +/* The following table is an efficient representation of an
> +   "optimal power tree".  For each value, i, the corresponding
> +   value, j, in the table states than an optimal evaluation
> +   sequence for calculating pow(x,i) can be found by evaluating
> +   pow(x,j)*pow(x,i-j).  An optimal power tree for the first
> +   100 integers is given in Knuth's "Seminumerical algorithms".  */
> +
> +static const unsigned char powi_table[POWI_TABLE_SIZE] =
> +  {
> +      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
> +      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
> +      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
> +     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
> +     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
> +     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
> +     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
> +     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
> +     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
> +     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
> +     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
> +     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
> +     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
> +     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
> +     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
> +     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
> +     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
> +     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
> +     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
> +     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
> +     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
> +     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
> +     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
> +     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
> +     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
> +    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
> +    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
> +    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
> +    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
> +    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
> +    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
> +    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
> +  };
> +
> +
> +/* Return the number of multiplications required to calculate
> +   powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
> +   subroutine of powi_cost.  CACHE is an array indicating
> +   which exponents have already been calculated.  */
> +
> +static int
> +powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
> +{
> +  /* If we've already calculated this exponent, then this evaluation
> +     doesn't require any additional multiplications.  */
> +  if (cache[n])
> +    return 0;
> +
> +  cache[n] = true;
> +  return powi_lookup_cost (n - powi_table[n], cache)
> +        + powi_lookup_cost (powi_table[n], cache) + 1;
> +}
> +
> +/* Return the number of multiplications required to calculate
> +   powi(x,n) for an arbitrary x, given the exponent N.  This
> +   function needs to be kept in sync with powi_as_mults below.  */
> +
> +static int
> +powi_cost (HOST_WIDE_INT n)
> +{
> +  bool cache[POWI_TABLE_SIZE];
> +  unsigned HOST_WIDE_INT digit;
> +  unsigned HOST_WIDE_INT val;
> +  int result;
> +
> +  if (n == 0)
> +    return 0;
> +
> +  /* Ignore the reciprocal when calculating the cost.  */
> +  val = (n < 0) ? -n : n;
> +
> +  /* Initialize the exponent cache.  */
> +  memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
> +  cache[1] = true;
> +
> +  result = 0;
> +
> +  while (val >= POWI_TABLE_SIZE)
> +    {
> +      if (val & 1)
> +       {
> +         digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
> +         result += powi_lookup_cost (digit, cache)
> +                   + POWI_WINDOW_SIZE + 1;
> +         val >>= POWI_WINDOW_SIZE;
> +       }
> +      else
> +       {
> +         val >>= 1;
> +         result++;
> +       }
> +    }
> +
> +  return result + powi_lookup_cost (val, cache);
> +}
> +
> +/* Recursive subroutine of powi_as_mults.  This function takes the
> +   array, CACHE, of already calculated exponents and an exponent N and
> +   returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
> +
> +static tree
> +powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
> +                HOST_WIDE_INT n, tree *cache, tree target)
> +{
> +  tree op0, op1, ssa_target;
> +  unsigned HOST_WIDE_INT digit;
> +  gimple mult_stmt;
> +
> +  if (n < POWI_TABLE_SIZE)
> +    {
> +      if (cache[n])
> +       return cache[n];
> +
> +      ssa_target = make_ssa_name (target, NULL);

You can hoist the ssa_target creation before the if instead of
replicating it ...

> +      cache[n] = ssa_target;
> +
> +      op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache, target);
> +      op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache, target);
> +    }
> +  else if (n & 1)
> +    {
> +      digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
> +      op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache, target);
> +      op1 = powi_as_mults_1 (gsi, loc, type, digit, cache, target);
> +      ssa_target = make_ssa_name (target, NULL);

here and

> +    }
> +  else
> +    {
> +      op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache, target);
> +      op1 = op0;
> +      ssa_target = make_ssa_name (target, NULL);

here.

> +    }
> +
> +  mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
> +  gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
> +
> +  return ssa_target;
> +}
> +
> +/* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
> +   This function needs to be kept in sync with powi_cost above.  */
> +
> +static tree
> +powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
> +              tree arg0, HOST_WIDE_INT n)
> +{
> +  tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0), target;
> +  gimple div_stmt;
> +
> +  if (n == 0)
> +    return build_real (type, dconst1);
> +
> +  memset (cache, 0,  sizeof (cache));
> +  cache[1] = arg0;
> +
> +  target = create_tmp_var (type, "powmult");
> +  add_referenced_var (target);
> +
> +  result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache, target);
> +
> +  if (n >= 0)
> +    return result;
> +
> +  /* If the original exponent was negative, reciprocate the result.  */
> +  target = create_tmp_var (type, "powmult");
> +  add_referenced_var (target);

re-use target from above.

> +  target = make_ssa_name (target, NULL);
> +
> +  div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
> +                                          build_real (type, dconst1),
> +                                          result);
> +  gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
> +
> +  return target;
> +}
> +
> +/* ARG0 and N are the two arguments to a powi builtin in GSI with
> +   location info LOC.  If the arguments are appropriate, create an
> +   equivalent sequence of statements prior to GSI using an optimal
> +   number of multiplications, and return an expession holding the
> +   result.  */
> +
> +static tree
> +gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
> +                           tree arg0, HOST_WIDE_INT n)
> +{
> +  /* Avoid largest negative number.  */
> +  if (n != -n
> +      && ((n >= -1 && n <= 2)
> +         || (optimize_function_for_speed_p (cfun)
> +             && powi_cost (n) <= POWI_MAX_MULTS)))
> +    return powi_as_mults (gsi, loc, arg0, n);
> +
> +  return NULL_TREE;
> +}
> +
>  /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
> -   on the SSA_NAME argument of each of them.  */
> +   on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
> +   an optimal number of multiplies, when n is a constant.  */
>
>  static unsigned int
>  execute_cse_sincos (void)
> @@ -821,7 +1056,9 @@ execute_cse_sincos (void)
>              && (fndecl = gimple_call_fndecl (stmt))
>              && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
>            {
> -             tree arg;
> +             tree arg, arg0, arg1, result;
> +             HOST_WIDE_INT n, n_hi;
> +             location_t loc;
>
>              switch (DECL_FUNCTION_CODE (fndecl))
>                {
> @@ -833,6 +1070,29 @@ execute_cse_sincos (void)
>                    cfg_changed |= execute_cse_sincos_1 (arg);
>                  break;
>
> +               CASE_FLT_FN (BUILT_IN_POWI):
> +                 arg0 = gimple_call_arg (stmt, 0);
> +                 arg1 = gimple_call_arg (stmt, 1);
> +                 if (!host_integerp (arg1, 0))
> +                   break;
> +
> +                 n = TREE_INT_CST_LOW (arg1);
> +                 n_hi = TREE_INT_CST_HIGH (arg1);
> +                 if (n_hi != 0 && n_hi != -1)
> +                   break;

The n_hi query and check isn't necessary as host_integerp already
ensures this.

> +
> +                 loc = gimple_location (stmt);
> +                 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
> +
> +                 if (result)
> +                   {
> +                     tree lhs = gimple_get_lhs (stmt);
> +                     gimple new_stmt = gimple_build_assign (lhs, result);
> +                     gimple_set_location (new_stmt, loc);
> +                     gsi_replace (&gsi, new_stmt, true);
> +                   }
> +                 break;
> +
>                default:;
>                }
>            }
> @@ -849,10 +1109,9 @@ execute_cse_sincos (void)
>  static bool
>  gate_cse_sincos (void)
>  {
> -  /* Make sure we have either sincos or cexp.  */
> -  return (TARGET_HAS_SINCOS
> -         || TARGET_C99_FUNCTIONS)
> -        && optimize;
> +  /* We no longer require either sincos or cexp, since powi expansion
> +     piggybacks on this pass.  */
> +  return optimize;
>  }
>
>  struct gimple_opt_pass pass_cse_sincos =

With the above changes the patch is ok for trunk, possibly after we
resolved the gsi_replace virtual operand case (you probably
can reproduce it with -funsafe-math-optimizations -ferrno-math).

Thanks,
Richard.
Bill Schmidt May 24, 2011, 1:19 p.m. UTC | #2
On Tue, 2011-05-24 at 14:26 +0200, Richard Guenther wrote:

<snip>

> > +/* Recursive subroutine of powi_as_mults.  This function takes the
> > +   array, CACHE, of already calculated exponents and an exponent N and
> > +   returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
> > +
> > +static tree
> > +powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
> > +                HOST_WIDE_INT n, tree *cache, tree target)
> > +{
> > +  tree op0, op1, ssa_target;
> > +  unsigned HOST_WIDE_INT digit;
> > +  gimple mult_stmt;
> > +
> > +  if (n < POWI_TABLE_SIZE)
> > +    {
> > +      if (cache[n])
> > +       return cache[n];
> > +
> > +      ssa_target = make_ssa_name (target, NULL);
> 
> You can hoist the ssa_target creation before the if instead of
> replicating it ...

OK.  The time spent in make_ssa_name will be wasted in the common case
where the value is found in the cache.  But I can restructure the logic
to avoid that.

> 
> > +      cache[n] = ssa_target;
> > +
> > +      op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache, target);
> > +      op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache, target);
> > +    }
> > +  else if (n & 1)
> > +    {
> > +      digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
> > +      op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache, target);
> > +      op1 = powi_as_mults_1 (gsi, loc, type, digit, cache, target);
> > +      ssa_target = make_ssa_name (target, NULL);
> 
> here and
> 
> > +    }
> > +  else
> > +    {
> > +      op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache, target);
> > +      op1 = op0;
> > +      ssa_target = make_ssa_name (target, NULL);
> 
> here.
> 
> > +    }
> > +
> > +  mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
> > +  gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
> > +
> > +  return ssa_target;
> > +}
> > +
> > +/* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
> > +   This function needs to be kept in sync with powi_cost above.  */
> > +
> > +static tree
> > +powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
> > +              tree arg0, HOST_WIDE_INT n)
> > +{
> > +  tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0), target;
> > +  gimple div_stmt;
> > +
> > +  if (n == 0)
> > +    return build_real (type, dconst1);
> > +
> > +  memset (cache, 0,  sizeof (cache));
> > +  cache[1] = arg0;
> > +
> > +  target = create_tmp_var (type, "powmult");
> > +  add_referenced_var (target);
> > +
> > +  result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache, target);
> > +
> > +  if (n >= 0)
> > +    return result;
> > +
> > +  /* If the original exponent was negative, reciprocate the result.  */
> > +  target = create_tmp_var (type, "powmult");
> > +  add_referenced_var (target);
> 
> re-use target from above.
> 

Oops, sorry, that was sloppy.

> > +  target = make_ssa_name (target, NULL);
> > +
> > +  div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
> > +                                          build_real (type, dconst1),
> > +                                          result);
> > +  gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
> > +
> > +  return target;
> > +}
> > +
> > +/* ARG0 and N are the two arguments to a powi builtin in GSI with
> > +   location info LOC.  If the arguments are appropriate, create an
> > +   equivalent sequence of statements prior to GSI using an optimal
> > +   number of multiplications, and return an expession holding the
> > +   result.  */
> > +
> > +static tree
> > +gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
> > +                           tree arg0, HOST_WIDE_INT n)
> > +{
> > +  /* Avoid largest negative number.  */
> > +  if (n != -n
> > +      && ((n >= -1 && n <= 2)
> > +         || (optimize_function_for_speed_p (cfun)
> > +             && powi_cost (n) <= POWI_MAX_MULTS)))
> > +    return powi_as_mults (gsi, loc, arg0, n);
> > +
> > +  return NULL_TREE;
> > +}
> > +
> >  /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
> > -   on the SSA_NAME argument of each of them.  */
> > +   on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
> > +   an optimal number of multiplies, when n is a constant.  */
> >
> >  static unsigned int
> >  execute_cse_sincos (void)
> > @@ -821,7 +1056,9 @@ execute_cse_sincos (void)
> >              && (fndecl = gimple_call_fndecl (stmt))
> >              && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
> >            {
> > -             tree arg;
> > +             tree arg, arg0, arg1, result;
> > +             HOST_WIDE_INT n, n_hi;
> > +             location_t loc;
> >
> >              switch (DECL_FUNCTION_CODE (fndecl))
> >                {
> > @@ -833,6 +1070,29 @@ execute_cse_sincos (void)
> >                    cfg_changed |= execute_cse_sincos_1 (arg);
> >                  break;
> >
> > +               CASE_FLT_FN (BUILT_IN_POWI):
> > +                 arg0 = gimple_call_arg (stmt, 0);
> > +                 arg1 = gimple_call_arg (stmt, 1);
> > +                 if (!host_integerp (arg1, 0))
> > +                   break;
> > +
> > +                 n = TREE_INT_CST_LOW (arg1);
> > +                 n_hi = TREE_INT_CST_HIGH (arg1);
> > +                 if (n_hi != 0 && n_hi != -1)
> > +                   break;
> 
> The n_hi query and check isn't necessary as host_integerp already
> ensures this.
> 

Ah, OK.  Thanks.

> > +
> > +                 loc = gimple_location (stmt);
> > +                 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
> > +
> > +                 if (result)
> > +                   {
> > +                     tree lhs = gimple_get_lhs (stmt);
> > +                     gimple new_stmt = gimple_build_assign (lhs, result);
> > +                     gimple_set_location (new_stmt, loc);
> > +                     gsi_replace (&gsi, new_stmt, true);
> > +                   }
> > +                 break;
> > +
> >                default:;
> >                }
> >            }
> > @@ -849,10 +1109,9 @@ execute_cse_sincos (void)
> >  static bool
> >  gate_cse_sincos (void)
> >  {
> > -  /* Make sure we have either sincos or cexp.  */
> > -  return (TARGET_HAS_SINCOS
> > -         || TARGET_C99_FUNCTIONS)
> > -        && optimize;
> > +  /* We no longer require either sincos or cexp, since powi expansion
> > +     piggybacks on this pass.  */
> > +  return optimize;
> >  }
> >
> >  struct gimple_opt_pass pass_cse_sincos =
> 
> With the above changes the patch is ok for trunk, possibly after we
> resolved the gsi_replace virtual operand case (you probably
> can reproduce it with -funsafe-math-optimizations -ferrno-math).

OK, just saw your other note.  I'll copy the logic from
update_call_from-tree ().

Thanks,
Bill

> 
> Thanks,
> Richard.
diff mbox

Patch

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 174075)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,14 @@ 
+2011-05-23  Bill Schmidt  <wschmidt@vnet.linux.ibm.com>
+
+	* tree-ssa-math-opts.c (powi_table): New.
+	(powi_lookup_cost): New.
+	(powi_cost): New.
+	(powi_as_mults_1): New.
+	(powi_as_mults): New.
+	(gimple_expand_builtin_powi): New.
+	(execute_cse_sincos): Add switch case for BUILT_IN_POWI.
+	(gate_cse_sincos): Remove sincos/cexp restriction.
+	
 2011-05-23  Richard Guenther  <rguenther@suse.de>
 
 	* gimple.c (gimple_types_compatible_p_1): Always compare type names.
Index: gcc/tree-ssa-math-opts.c
===================================================================
--- gcc/tree-ssa-math-opts.c	(revision 174075)
+++ gcc/tree-ssa-math-opts.c	(working copy)
@@ -795,8 +795,243 @@  execute_cse_sincos_1 (tree name)
   return cfg_changed;
 }
 
+/* To evaluate powi(x,n), the floating point value x raised to the
+   constant integer exponent n, we use a hybrid algorithm that
+   combines the "window method" with look-up tables.  For an
+   introduction to exponentiation algorithms and "addition chains",
+   see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
+   "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
+   3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
+   Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
+
+/* Provide a default value for POWI_MAX_MULTS, the maximum number of
+   multiplications to inline before calling the system library's pow
+   function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
+   so this default never requires calling pow, powf or powl.  */
+
+#ifndef POWI_MAX_MULTS
+#define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
+#endif
+
+/* The size of the "optimal power tree" lookup table.  All
+   exponents less than this value are simply looked up in the
+   powi_table below.  This threshold is also used to size the
+   cache of pseudo registers that hold intermediate results.  */
+#define POWI_TABLE_SIZE 256
+
+/* The size, in bits of the window, used in the "window method"
+   exponentiation algorithm.  This is equivalent to a radix of
+   (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
+#define POWI_WINDOW_SIZE 3
+
+/* The following table is an efficient representation of an
+   "optimal power tree".  For each value, i, the corresponding
+   value, j, in the table states than an optimal evaluation
+   sequence for calculating pow(x,i) can be found by evaluating
+   pow(x,j)*pow(x,i-j).  An optimal power tree for the first
+   100 integers is given in Knuth's "Seminumerical algorithms".  */
+
+static const unsigned char powi_table[POWI_TABLE_SIZE] =
+  {
+      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
+      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
+      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
+     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
+     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
+     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
+     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
+     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
+     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
+     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
+     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
+     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
+     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
+     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
+     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
+     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
+     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
+     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
+     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
+     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
+     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
+     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
+     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
+     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
+     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
+    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
+    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
+    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
+    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
+    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
+    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
+    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
+  };
+
+
+/* Return the number of multiplications required to calculate
+   powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
+   subroutine of powi_cost.  CACHE is an array indicating
+   which exponents have already been calculated.  */
+
+static int
+powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
+{
+  /* If we've already calculated this exponent, then this evaluation
+     doesn't require any additional multiplications.  */
+  if (cache[n])
+    return 0;
+
+  cache[n] = true;
+  return powi_lookup_cost (n - powi_table[n], cache)
+	 + powi_lookup_cost (powi_table[n], cache) + 1;
+}
+
+/* Return the number of multiplications required to calculate
+   powi(x,n) for an arbitrary x, given the exponent N.  This
+   function needs to be kept in sync with powi_as_mults below.  */
+
+static int
+powi_cost (HOST_WIDE_INT n)
+{
+  bool cache[POWI_TABLE_SIZE];
+  unsigned HOST_WIDE_INT digit;
+  unsigned HOST_WIDE_INT val;
+  int result;
+
+  if (n == 0)
+    return 0;
+
+  /* Ignore the reciprocal when calculating the cost.  */
+  val = (n < 0) ? -n : n;
+
+  /* Initialize the exponent cache.  */
+  memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
+  cache[1] = true;
+
+  result = 0;
+
+  while (val >= POWI_TABLE_SIZE)
+    {
+      if (val & 1)
+	{
+	  digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
+	  result += powi_lookup_cost (digit, cache)
+		    + POWI_WINDOW_SIZE + 1;
+	  val >>= POWI_WINDOW_SIZE;
+	}
+      else
+	{
+	  val >>= 1;
+	  result++;
+	}
+    }
+
+  return result + powi_lookup_cost (val, cache);
+}
+
+/* Recursive subroutine of powi_as_mults.  This function takes the
+   array, CACHE, of already calculated exponents and an exponent N and
+   returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
+
+static tree
+powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
+		 HOST_WIDE_INT n, tree *cache, tree target)
+{
+  tree op0, op1, ssa_target;
+  unsigned HOST_WIDE_INT digit;
+  gimple mult_stmt;
+
+  if (n < POWI_TABLE_SIZE)
+    {
+      if (cache[n])
+	return cache[n];
+
+      ssa_target = make_ssa_name (target, NULL);
+      cache[n] = ssa_target;
+
+      op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache, target);
+      op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache, target);
+    }
+  else if (n & 1)
+    {
+      digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
+      op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache, target);
+      op1 = powi_as_mults_1 (gsi, loc, type, digit, cache, target);
+      ssa_target = make_ssa_name (target, NULL);
+    }
+  else
+    {
+      op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache, target);
+      op1 = op0;
+      ssa_target = make_ssa_name (target, NULL);
+    }
+
+  mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
+  gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
+
+  return ssa_target;
+}
+
+/* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
+   This function needs to be kept in sync with powi_cost above.  */
+
+static tree
+powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
+	       tree arg0, HOST_WIDE_INT n)
+{
+  tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0), target;
+  gimple div_stmt;
+
+  if (n == 0)
+    return build_real (type, dconst1);
+
+  memset (cache, 0,  sizeof (cache));
+  cache[1] = arg0;
+
+  target = create_tmp_var (type, "powmult");
+  add_referenced_var (target);
+
+  result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache, target);
+
+  if (n >= 0)
+    return result;
+
+  /* If the original exponent was negative, reciprocate the result.  */
+  target = create_tmp_var (type, "powmult");
+  add_referenced_var (target);
+  target = make_ssa_name (target, NULL);
+      
+  div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target, 
+					   build_real (type, dconst1),
+					   result);
+  gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
+
+  return target;
+}
+
+/* ARG0 and N are the two arguments to a powi builtin in GSI with
+   location info LOC.  If the arguments are appropriate, create an
+   equivalent sequence of statements prior to GSI using an optimal
+   number of multiplications, and return an expession holding the
+   result.  */
+
+static tree
+gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc, 
+			    tree arg0, HOST_WIDE_INT n)
+{
+  /* Avoid largest negative number.  */
+  if (n != -n
+      && ((n >= -1 && n <= 2)
+	  || (optimize_function_for_speed_p (cfun)
+	      && powi_cost (n) <= POWI_MAX_MULTS)))
+    return powi_as_mults (gsi, loc, arg0, n);
+
+  return NULL_TREE;
+}
+
 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
-   on the SSA_NAME argument of each of them.  */
+   on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
+   an optimal number of multiplies, when n is a constant.  */
 
 static unsigned int
 execute_cse_sincos (void)
@@ -821,7 +1056,9 @@  execute_cse_sincos (void)
 	      && (fndecl = gimple_call_fndecl (stmt))
 	      && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
 	    {
-	      tree arg;
+	      tree arg, arg0, arg1, result;
+	      HOST_WIDE_INT n, n_hi;
+	      location_t loc;
 
 	      switch (DECL_FUNCTION_CODE (fndecl))
 		{
@@ -833,6 +1070,29 @@  execute_cse_sincos (void)
 		    cfg_changed |= execute_cse_sincos_1 (arg);
 		  break;
 
+		CASE_FLT_FN (BUILT_IN_POWI):
+		  arg0 = gimple_call_arg (stmt, 0);
+		  arg1 = gimple_call_arg (stmt, 1);
+		  if (!host_integerp (arg1, 0))
+		    break;
+
+		  n = TREE_INT_CST_LOW (arg1);
+		  n_hi = TREE_INT_CST_HIGH (arg1);
+		  if (n_hi != 0 && n_hi != -1)
+		    break;
+
+		  loc = gimple_location (stmt);
+		  result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
+
+		  if (result)
+		    {
+		      tree lhs = gimple_get_lhs (stmt);
+		      gimple new_stmt = gimple_build_assign (lhs, result);
+		      gimple_set_location (new_stmt, loc);
+		      gsi_replace (&gsi, new_stmt, true);
+		    }
+		  break;
+
 		default:;
 		}
 	    }
@@ -849,10 +1109,9 @@  execute_cse_sincos (void)
 static bool
 gate_cse_sincos (void)
 {
-  /* Make sure we have either sincos or cexp.  */
-  return (TARGET_HAS_SINCOS
-	  || TARGET_C99_FUNCTIONS)
-	 && optimize;
+  /* We no longer require either sincos or cexp, since powi expansion
+     piggybacks on this pass.  */
+  return optimize;
 }
 
 struct gimple_opt_pass pass_cse_sincos =