Optimise sqrt reciprocal multiplications

Message ID 5B7C371C.8040207@foss.arm.com
State New
Series Optimise sqrt reciprocal multiplications

Commit Message

Kyrill Tkachov Aug. 21, 2018, 4 p.m. UTC
Hi all,

This patch aims to optimise sequences involving uses of 1.0 / sqrt (a) under -freciprocal-math and -funsafe-math-optimizations.
In particular consider:

x = 1.0 / sqrt (a);
r1 = x * x;  // same as 1.0 / a
r2 = a * x; // same as sqrt (a)

If x, r1 and r2 are all used further on in the code, this can be transformed into:
tmp1 = 1.0 / a
tmp2 = sqrt (a)
tmp3 = tmp1 * tmp2
x = tmp3
r1 = tmp1
r2 = tmp2

A bit convoluted, but this saves us one multiplication and, more importantly, the sqrt and division are now independent.
This also allows optimisation of a subset of these expressions.
For example:
x = 1.0 / sqrt (a)
r1 = x * x

can be transformed to r1 = 1.0 / a, eliminating the sqrt if x is not used anywhere else.
And similarly:
x = 1.0 / sqrt (a)
r1 = a * x

can be transformed to r1 = sqrt (a), eliminating the division.

For the testcase:
double res, res2, tmp;
void
foo (double a, double b)
{
   tmp = 1.0 / __builtin_sqrt (a);
   res = tmp * tmp;
   res2 = a * tmp;
}

We now generate for aarch64 with -Ofast:
foo:
         fmov    d2, 1.0e+0
         adrp    x2, res2
         fsqrt   d1, d0
         adrp    x1, res
         fdiv    d0, d2, d0
         adrp    x0, tmp
         str     d1, [x2, #:lo12:res2]
         fmul    d1, d1, d0
         str     d0, [x1, #:lo12:res]
         str     d1, [x0, #:lo12:tmp]
         ret

where before it generated:
foo:
         fsqrt   d2, d0
         fmov    d1, 1.0e+0
         adrp    x1, res2
         adrp    x2, tmp
         adrp    x0, res
         fdiv    d1, d1, d2
         fmul    d0, d1, d0
         fmul    d2, d1, d1
         str     d1, [x2, #:lo12:tmp]
         str     d0, [x1, #:lo12:res2]
         str     d2, [x0, #:lo12:res]
         ret

As you can see, the new sequence has one fewer multiply and the fsqrt and fdiv are independent.

With this patch I see a 14% improvement on 544.nab_r from SPEC2017 on Cortex-A72 at -Ofast.
Bootstrapped and tested on aarch64-none-linux-gnu and x86_64-unknown-linux-gnu.

Ok for trunk?
Thanks,
Kyrill

2018-08-21  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * tree-ssa-math-opts.c (is_mult_by): New function.
     (optimize_recip_sqrt): Likewise.
     (pass_cse_reciprocals::execute): Use the above.

2018-08-21  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * gcc.dg/recip_sqrt_mult_1.c: New test.
     * gcc.dg/recip_sqrt_mult_2.c: Likewise.
     * gcc.dg/recip_sqrt_mult_3.c: Likewise.

Comments

Richard Sandiford Aug. 23, 2018, 10:13 a.m. UTC | #1
Kyrill  Tkachov <kyrylo.tkachov@foss.arm.com> writes:
> Hi all,
>
> This patch aims to optimise sequences involving uses of 1.0 / sqrt (a) under -freciprocal-math and -funsafe-math-optimizations.
> In particular consider:
>
> x = 1.0 / sqrt (a);
> r1 = x * x;  // same as 1.0 / a
> r2 = a * x; // same as sqrt (a)
>
> If x, r1 and r2 are all used further on in the code, this can be transformed into:
> tmp1 = 1.0 / a
> tmp2 = sqrt (a)
> tmp3 = tmp1 * tmp2
> x = tmp3
> r1 = tmp1
> r2 = tmp2

Nice optimisation :-)  Someone who knows the pass better should review,
but:

There seems to be an implicit assumption that this is a win even
when the r1 and r2 assignments are only conditionally executed.
That's probably true, but it might be worth saying explicitly.
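
For instance, in a sketch like (c and d being arbitrary conditions):

  x = 1.0 / sqrt (a);
  if (c)
    r1 = x * x;
  if (d)
    r2 = a * x;

the rewritten 1.0 / a would still be computed unconditionally even when
c and d are both false.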

> +/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
> +
> +static inline bool
> +is_mult_by (gimple *use_stmt, tree def, tree a)
> +{
> +  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
> +      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
> +    {
> +      tree op0 = gimple_assign_rhs1 (use_stmt);
> +      tree op1 = gimple_assign_rhs2 (use_stmt);
> +
> +      return (op0 == def && op1 == a)
> +	      || (op0 == a && op1 == def);
> +    }
> +  return 0;
> +}

Seems like is_square_of could now be a light-weight wrapper around this.
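
i.e. something like:

  static inline bool
  is_square_of (gimple *use_stmt, tree def)
  {
    return is_mult_by (use_stmt, def, def);
  }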

> @@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
>    occ_head = NULL;
>  }
>  
> +/* Transform sequences like
> +   x = 1.0 / sqrt (a);
> +   r1 = x * x;
> +   r2 = a * x;
> +   into:
> +   tmp1 = 1.0 / a;
> +   tmp2 = sqrt (a);
> +   tmp3 = tmp1 * tmp2;
> +   x = tmp3;
> +   r1 = tmp1;
> +   r2 = tmp2;
> +   depending on the uses of x, r1, r2.  This removes one multiplication and
> +   allows the sqrt and division operations to execute in parallel.
> +   DEF_GSI is the gsi of the initial division by sqrt that defines
> +   DEF (x in the example above).  */
> +
> +static void
> +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
> +{
> +  use_operand_p use_p;
> +  imm_use_iterator use_iter;
> +  gimple *stmt = gsi_stmt (*def_gsi);
> +  tree x = def;
> +  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
> +  tree div_rhs1 = gimple_assign_rhs1 (stmt);
> +
> +  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
> +      || TREE_CODE (div_rhs1) != REAL_CST
> +      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
> +    return;
> +
> +  gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name);
> +  if (!is_gimple_call (sqrt_stmt)
> +      || !gimple_call_lhs (sqrt_stmt))
> +    return;
> +
> +  gcall *call = as_a <gcall *> (sqrt_stmt);

Very minor, but:

  gcall *sqrt_stmt
    = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
  if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
    return;

would avoid the need for the separate as_a<>, and would mean that
we only call gimple_call_* on gcalls.

> +  if (has_other_use)
> +    {
> +      /* Using the two temporaries tmp1, tmp2 from above
> +	 the original x is now:
> +	 x = tmp1 * tmp2.  */
> +      gcc_assert (mult_ssa_name);
> +      gcc_assert (sqr_ssa_name);
> +      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
> +
> +      tree new_ssa_name
> +	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_transformed");
> +      gimple *new_stmt
> +	= gimple_build_assign (new_ssa_name, MULT_EXPR,
> +			       mult_ssa_name, sqr_ssa_name);
> +      gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT);
> +      gcc_assert (gsi_stmt (gsi2) == stmt);
> +      gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name);
> +      fold_stmt (&gsi2);
> +      update_stmt (stmt);

In this case we're replacing the statement in its original position,
so there's no real need to use a temporary.  It seems better to 
change the rhs_code, rhs1 and rhs2 of stmt in-place, with the same
lhs as before.
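
i.e. keep the lhs and do something along the lines of:

  gimple_assign_set_rhs1 (stmt, mult_ssa_name);
  gimple_assign_set_rhs2 (stmt, sqr_ssa_name);
  gimple_assign_set_rhs_code (stmt, MULT_EXPR);
  fold_stmt_inplace (def_gsi);
  update_stmt (stmt);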

> @@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun)
>        if (optimize_bb_for_size_p (bb))
>          continue;

Seems unnecessary to skip the new optimisation when optimising for size.
Like you say, it saves a multiplication overall.  Also:

> +      if (flag_unsafe_math_optimizations)
> +	{
> +	  for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
> +	       !gsi_end_p (gsi);
> +	       gsi_next (&gsi))
> +	    {
> +	      gimple *stmt = gsi_stmt (gsi);
> +
> +	      if (gimple_has_lhs (stmt)
> +		  && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
> +		  && FLOAT_TYPE_P (TREE_TYPE (def))
> +		  && TREE_CODE (def) == SSA_NAME
> +		  && is_gimple_assign (stmt)
> +		  && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
> +		optimize_recip_sqrt (&gsi, def);
> +	    }
> +	}

It looks like this could safely be done in one of the existing walks
(e.g. the execute_cse_reciprocals_1 one, if we do this when optimising
for size).
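
e.g. something along the lines of:

  if (gimple_has_lhs (stmt)
      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
      && FLOAT_TYPE_P (TREE_TYPE (def))
      && TREE_CODE (def) == SSA_NAME)
    {
      if (flag_unsafe_math_optimizations
          && is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
        optimize_recip_sqrt (&gsi, def);
      else
        execute_cse_reciprocals_1 (&gsi, def);
    }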

Thanks,
Richard
Kyrill Tkachov Aug. 23, 2018, 5:09 p.m. UTC | #2
Hi Richard,

On 23/08/18 11:13, Richard Sandiford wrote:
> Kyrill  Tkachov <kyrylo.tkachov@foss.arm.com> writes:
>> Hi all,
>>
>> This patch aims to optimise sequences involving uses of 1.0 / sqrt (a) under -freciprocal-math and -funsafe-math-optimizations.
>> In particular consider:
>>
>> x = 1.0 / sqrt (a);
>> r1 = x * x;  // same as 1.0 / a
>> r2 = a * x; // same as sqrt (a)
>>
>> If x, r1 and r2 are all used further on in the code, this can be transformed into:
>> tmp1 = 1.0 / a
>> tmp2 = sqrt (a)
>> tmp3 = tmp1 * tmp2
>> x = tmp3
>> r1 = tmp1
>> r2 = tmp2
> Nice optimisation :-)  Someone who knows the pass better should review,
> but:

Thanks for the review.

> There seems to be an implicit assumption that this is a win even
> when the r1 and r2 assignments are only conditionally executed.
> That's probably true, but it might be worth saying explicitly.

I'll admit I had not considered that case.
I think it won't make a difference in practice, as the really expensive operations here
are the sqrt and the division; they are on the executed path in either case, and making
them independent should be a benefit in its own right.

>> +/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
>> +
>> +static inline bool
>> +is_mult_by (gimple *use_stmt, tree def, tree a)
>> +{
>> +  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
>> +      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
>> +    {
>> +      tree op0 = gimple_assign_rhs1 (use_stmt);
>> +      tree op1 = gimple_assign_rhs2 (use_stmt);
>> +
>> +      return (op0 == def && op1 == a)
>> +	      || (op0 == a && op1 == def);
>> +    }
>> +  return 0;
>> +}
> Seems like is_square_of could now be a light-weight wrapper around this.

Indeed, I've done the wrapping now.

>> @@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
>>     occ_head = NULL;
>>   }
>>   
>> +/* Transform sequences like
>> +   x = 1.0 / sqrt (a);
>> +   r1 = x * x;
>> +   r2 = a * x;
>> +   into:
>> +   tmp1 = 1.0 / a;
>> +   tmp2 = sqrt (a);
>> +   tmp3 = tmp1 * tmp2;
>> +   x = tmp3;
>> +   r1 = tmp1;
>> +   r2 = tmp2;
>> +   depending on the uses of x, r1, r2.  This removes one multiplication and
>> +   allows the sqrt and division operations to execute in parallel.
>> +   DEF_GSI is the gsi of the initial division by sqrt that defines
>> +   DEF (x in the example above).  */
>> +
>> +static void
>> +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
>> +{
>> +  use_operand_p use_p;
>> +  imm_use_iterator use_iter;
>> +  gimple *stmt = gsi_stmt (*def_gsi);
>> +  tree x = def;
>> +  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
>> +  tree div_rhs1 = gimple_assign_rhs1 (stmt);
>> +
>> +  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
>> +      || TREE_CODE (div_rhs1) != REAL_CST
>> +      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
>> +    return;
>> +
>> +  gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name);
>> +  if (!is_gimple_call (sqrt_stmt)
>> +      || !gimple_call_lhs (sqrt_stmt))
>> +    return;
>> +
>> +  gcall *call = as_a <gcall *> (sqrt_stmt);
> Very minor, but:
>
>    gcall *sqrt_stmt
>      = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
>    if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
>      return;
>
> would avoid the need for the separate as_a<>, and would mean that
> we only call gimple_call_* on gcalls.

Ok.

>> +  if (has_other_use)
>> +    {
>> +      /* Using the two temporaries tmp1, tmp2 from above
>> +	 the original x is now:
>> +	 x = tmp1 * tmp2.  */
>> +      gcc_assert (mult_ssa_name);
>> +      gcc_assert (sqr_ssa_name);
>> +      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
>> +
>> +      tree new_ssa_name
>> +	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_transformed");
>> +      gimple *new_stmt
>> +	= gimple_build_assign (new_ssa_name, MULT_EXPR,
>> +			       mult_ssa_name, sqr_ssa_name);
>> +      gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT);
>> +      gcc_assert (gsi_stmt (gsi2) == stmt);
>> +      gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name);
>> +      fold_stmt (&gsi2);
>> +      update_stmt (stmt);
> In this case we're replacing the statement in its original position,
> so there's no real need to use a temporary.  It seems better to
> change the rhs_code, rhs1 and rhs2 of stmt in-place, with the same
> lhs as before.

Yes, that's cleaner.

>> @@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun)
>>         if (optimize_bb_for_size_p (bb))
>>           continue;
> Seems unnecessary to skip the new optimisation when optimising for size.
> Like you say, it saves a multiplication overall.  Also:

Indeed.

>> +      if (flag_unsafe_math_optimizations)
>> +	{
>> +	  for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
>> +	       !gsi_end_p (gsi);
>> +	       gsi_next (&gsi))
>> +	    {
>> +	      gimple *stmt = gsi_stmt (gsi);
>> +
>> +	      if (gimple_has_lhs (stmt)
>> +		  && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
>> +		  && FLOAT_TYPE_P (TREE_TYPE (def))
>> +		  && TREE_CODE (def) == SSA_NAME
>> +		  && is_gimple_assign (stmt)
>> +		  && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
>> +		optimize_recip_sqrt (&gsi, def);
>> +	    }
>> +	}
> It looks like this could safely be done in one of the existing walks
> (e.g. the execute_cse_reciprocals_1 one, if we do this when optimising
> for size).

You're right. I've moved this into one of the walks above this.

Bootstrapped and tested on aarch64-none-linux-gnu and x86_64-unknown-linux-gnu.
CC'ing richi as he's reviewed these kinds of patches in the past.

Is this ok for trunk?

Thanks,
Kyrill

2018-08-23  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * tree-ssa-math-opts.c (is_mult_by): New function.
     (is_square_of): Use the above.
     (optimize_recip_sqrt): New function.
     (pass_cse_reciprocals::execute): Use the above.

2018-08-23  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * gcc.dg/recip_sqrt_mult_1.c: New test.
     * gcc.dg/recip_sqrt_mult_2.c: Likewise.
     * gcc.dg/recip_sqrt_mult_3.c: Likewise.

> Thanks,
> Richard
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..86429e4183ba913638c2b931f7e65a9f6e159c6d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+double res, res2, tmp;
+void
+foo (double a, double b)
+{
+  tmp = 1.0 / __builtin_sqrt (a);
+  res = tmp * tmp;
+  res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing multiplication" "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..c5fc3de7b657b1769e76254b4bc874e0595e43ef
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+float
+foo (float a)
+{
+  float tmp = 1.0f / __builtin_sqrtf (a);
+  return a * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not " / " "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..e7d185ba7e22cfc7cca72296d5ccc544f24fdb14
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+double
+foo (double a)
+{
+  double tmp = 1.0f / __builtin_sqrt (a);
+  return tmp * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not "__builtin_sqrt" "optimized" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index a90d9d28e4ef6ce66fa380f314a9749fa952be0a..56571e77dae8d504bb984c2088c774bee5b9dce0 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -337,9 +337,9 @@ is_division_by (gimple *use_stmt, tree def)
 	 && gimple_assign_rhs1 (use_stmt) != def;
 }
 
-/* Return whether USE_STMT is DEF * DEF.  */
+/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
 static inline bool
-is_square_of (gimple *use_stmt, tree def)
+is_mult_by (gimple *use_stmt, tree def, tree a)
 {
   if (gimple_code (use_stmt) == GIMPLE_ASSIGN
       && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
@@ -347,11 +347,19 @@ is_square_of (gimple *use_stmt, tree def)
       tree op0 = gimple_assign_rhs1 (use_stmt);
       tree op1 = gimple_assign_rhs2 (use_stmt);
 
-      return op0 == op1 && op0 == def;
+      return (op0 == def && op1 == a)
+	      || (op0 == a && op1 == def);
     }
   return 0;
 }
 
+/* Return whether USE_STMT is DEF * DEF.  */
+static inline bool
+is_square_of (gimple *use_stmt, tree def)
+{
+  return is_mult_by (use_stmt, def, def);
+}
+
 /* Return whether USE_STMT is a floating-point division by
    DEF * DEF.  */
 static inline bool
@@ -526,6 +534,173 @@ free_bb (struct occurrence *occ)
     }
 }
 
+/* Transform sequences like
+   x = 1.0 / sqrt (a);
+   r1 = x * x;
+   r2 = a * x;
+   into:
+   tmp1 = 1.0 / a;
+   tmp2 = sqrt (a);
+   tmp3 = tmp1 * tmp2;
+   x = tmp3;
+   r1 = tmp1;
+   r2 = tmp2;
+   depending on the uses of x, r1, r2.  This removes one multiplication and
+   allows the sqrt and division operations to execute in parallel.
+   DEF_GSI is the gsi of the initial division by sqrt that defines
+   DEF (x in the example above).  */
+
+static void
+optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
+{
+  use_operand_p use_p;
+  imm_use_iterator use_iter;
+  gimple *stmt = gsi_stmt (*def_gsi);
+  tree x = def;
+  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
+  tree div_rhs1 = gimple_assign_rhs1 (stmt);
+
+  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
+      || TREE_CODE (div_rhs1) != REAL_CST
+      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
+    return;
+
+  gcall *sqrt_stmt
+    = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
+
+  if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
+    return;
+
+  switch (gimple_call_combined_fn (sqrt_stmt))
+    {
+    CASE_CFN_SQRT:
+    CASE_CFN_SQRT_FN:
+      break;
+
+    default:
+      return;
+    }
+  tree a = gimple_call_arg (sqrt_stmt, 0);
+
+  /* We have 'a' and 'x'.  Now analyze the uses of 'x'.  */
+
+  /* Statements that use x in x * x.  */
+  auto_vec<gimple *> sqr_stmts;
+  /* Statements that use x in a * x.  */
+  auto_vec<gimple *> mult_stmts;
+  bool has_other_use = false;
+
+  FOR_EACH_IMM_USE_FAST (use_p, use_iter, x)
+    {
+      gimple *stmt2 = USE_STMT (use_p);
+      if (is_gimple_debug (stmt2))
+	continue;
+      if (is_square_of (stmt2, x))
+	{
+	  if (!sqr_stmts.contains (stmt2))
+	    sqr_stmts.safe_push (stmt2);
+	}
+      else if (is_mult_by (stmt2, x, a))
+	mult_stmts.safe_push (stmt2);
+      else
+	has_other_use = true;
+    }
+  if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
+    return;
+
+  /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
+     to be able to compose it from the sqr and mult cases.  */
+  if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
+    return;
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
+      print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
+      print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+      fprintf (dump_file, "\n");
+    }
+
+  tree mult_ssa_name = NULL_TREE;
+  tree sqr_ssa_name = NULL_TREE;
+  if (!sqr_stmts.is_empty ())
+    {
+      /* r1 = x * x.  Transform this into:
+	 tmp1 = 1.0 / a
+	 r1 = tmp1.  */
+
+      sqr_ssa_name
+	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");
+      gimple *new_stmt
+	= gimple_build_assign (sqr_ssa_name, RDIV_EXPR,
+				build_real (TREE_TYPE (a), dconst1), a);
+      gsi_insert_before (def_gsi, new_stmt, GSI_NEW_STMT);
+
+      unsigned int i;
+      gimple *sqr_stmt;
+      FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
+	{
+	  gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
+
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Replacing squaring multiplication\n");
+	      print_gimple_stmt (dump_file, sqr_stmt, 0, TDF_NONE);
+	      fprintf (dump_file, "with\n");
+	      print_gimple_stmt (dump_file, new_stmt, 0, TDF_NONE);
+	      fprintf (dump_file, "\n");
+	    }
+	  gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
+	  fold_stmt_inplace (&gsi2);
+	  update_stmt (sqr_stmt);
+      }
+    }
+  if (!mult_stmts.is_empty ())
+    {
+      /* r2 = a * x.  Transform this into:
+	 tmp2 = sqrt (a)
+	 r2 = tmp2.  */
+      mult_ssa_name
+	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_mult");
+      gimple *new_stmt
+	= gimple_build_assign (mult_ssa_name, orig_sqrt_ssa_name);
+      gsi_insert_before (def_gsi, new_stmt, GSI_NEW_STMT);
+
+      unsigned int i;
+      gimple *mult_stmt = NULL;
+      FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
+	{
+	  gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);
+
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Replacing multiplication\n");
+	      print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
+	      fprintf (dump_file, "with\n");
+	      print_gimple_stmt (dump_file, new_stmt, 0, TDF_NONE);
+	      fprintf (dump_file, "\n");
+	    }
+	  gimple_assign_set_rhs_from_tree (&gsi2, mult_ssa_name);
+	  fold_stmt_inplace (&gsi2);
+	  update_stmt (mult_stmt);
+      }
+    }
+
+  if (has_other_use)
+    {
+      /* Using the two temporaries tmp1, tmp2 from above
+	 the original x is now:
+	 x = tmp1 * tmp2.  */
+      gcc_assert (mult_ssa_name);
+      gcc_assert (sqr_ssa_name);
+
+      gimple_assign_set_rhs1 (stmt, mult_ssa_name);
+      gimple_assign_set_rhs2 (stmt, sqr_ssa_name);
+      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
+      fold_stmt_inplace (def_gsi);
+      update_stmt (stmt);
+    }
+}
 
 /* Look for floating-point divisions among DEF's uses, and try to
    replace them by multiplications with the reciprocal.  Add
@@ -546,6 +721,7 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
   int sqrt_recip_count = 0;
 
   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME);
+
   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 
   /* If DEF is a square (x * x), count the number of divisions by x.
@@ -756,7 +932,14 @@ pass_cse_reciprocals::execute (function *fun)
 	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 	      && FLOAT_TYPE_P (TREE_TYPE (def))
 	      && TREE_CODE (def) == SSA_NAME)
-	    execute_cse_reciprocals_1 (&gsi, def);
+	    {
+	      if (flag_unsafe_math_optimizations
+		  && is_gimple_assign (stmt)
+		  && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
+		optimize_recip_sqrt (&gsi, def);
+	      else
+		execute_cse_reciprocals_1 (&gsi, def);
+	    }
 	}
 
       if (optimize_bb_for_size_p (bb))
Kyrill Tkachov Aug. 30, 2018, 9:14 a.m. UTC | #3
Ping.

https://gcc.gnu.org/ml/gcc-patches/2018-08/msg01496.html

Thanks,
Kyrill

On 23/08/18 18:09, Kyrill Tkachov wrote:
> Hi Richard,
>
> On 23/08/18 11:13, Richard Sandiford wrote:
>> Kyrill  Tkachov <kyrylo.tkachov@foss.arm.com> writes:
>>> Hi all,
>>>
>>> This patch aims to optimise sequences involving uses of 1.0 / sqrt (a) under -freciprocal-math and -funsafe-math-optimizations.
>>> In particular consider:
>>>
>>> x = 1.0 / sqrt (a);
>>> r1 = x * x;  // same as 1.0 / a
>>> r2 = a * x; // same as sqrt (a)
>>>
>>> If x, r1 and r2 are all used further on in the code, this can be transformed into:
>>> tmp1 = 1.0 / a
>>> tmp2 = sqrt (a)
>>> tmp3 = tmp1 * tmp2
>>> x = tmp3
>>> r1 = tmp1
>>> r2 = tmp2
>> Nice optimisation :-)  Someone who knows the pass better should review,
>> but:
>
> Thanks for the review.
>
>> There seems to be an implicit assumption that this is a win even
>> when the r1 and r2 assignments are only conditionally executed.
>> That's probably true, but it might be worth saying explicitly.
>
> I'll admit I had not considered that case.
> I think it won't make a difference in practice, as the really expensive operations here
> are the sqrt and the division and they are on the executed path in either case and them
> becoming independent should be a benefit of its own.
>
>>> +/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
>>> +
>>> +static inline bool
>>> +is_mult_by (gimple *use_stmt, tree def, tree a)
>>> +{
>>> +  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
>>> +      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
>>> +    {
>>> +      tree op0 = gimple_assign_rhs1 (use_stmt);
>>> +      tree op1 = gimple_assign_rhs2 (use_stmt);
>>> +
>>> +      return (op0 == def && op1 == a)
>>> +          || (op0 == a && op1 == def);
>>> +    }
>>> +  return 0;
>>> +}
>> Seems like is_square_of could now be a light-weight wrapper around this.
>
> Indeed, I've done the wrapping now.
>
>>> @@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
>>>     occ_head = NULL;
>>>   }
>>>   +/* Transform sequences like
>>> +   x = 1.0 / sqrt (a);
>>> +   r1 = x * x;
>>> +   r2 = a * x;
>>> +   into:
>>> +   tmp1 = 1.0 / a;
>>> +   tmp2 = sqrt (a);
>>> +   tmp3 = tmp1 * tmp2;
>>> +   x = tmp3;
>>> +   r1 = tmp1;
>>> +   r2 = tmp2;
>>> +   depending on the uses of x, r1, r2.  This removes one multiplication and
>>> +   allows the sqrt and division operations to execute in parallel.
>>> +   DEF_GSI is the gsi of the initial division by sqrt that defines
>>> +   DEF (x in the example above).  */
>>> +
>>> +static void
>>> +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
>>> +{
>>> +  use_operand_p use_p;
>>> +  imm_use_iterator use_iter;
>>> +  gimple *stmt = gsi_stmt (*def_gsi);
>>> +  tree x = def;
>>> +  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
>>> +  tree div_rhs1 = gimple_assign_rhs1 (stmt);
>>> +
>>> +  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
>>> +      || TREE_CODE (div_rhs1) != REAL_CST
>>> +      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
>>> +    return;
>>> +
>>> +  gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name);
>>> +  if (!is_gimple_call (sqrt_stmt)
>>> +      || !gimple_call_lhs (sqrt_stmt))
>>> +    return;
>>> +
>>> +  gcall *call = as_a <gcall *> (sqrt_stmt);
>> Very minor, but:
>>
>>    gcall *sqrt_stmt
>>      = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
>>    if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
>>      return;
>>
>> would avoid the need for the separate as_a<>, and would mean that
>> we only call gimple_call_* on gcalls.
>
> Ok.
>
>>> +  if (has_other_use)
>>> +    {
>>> +      /* Using the two temporaries tmp1, tmp2 from above
>>> +     the original x is now:
>>> +     x = tmp1 * tmp2.  */
>>> +      gcc_assert (mult_ssa_name);
>>> +      gcc_assert (sqr_ssa_name);
>>> +      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
>>> +
>>> +      tree new_ssa_name
>>> +    = make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_transformed");
>>> +      gimple *new_stmt
>>> +    = gimple_build_assign (new_ssa_name, MULT_EXPR,
>>> +                   mult_ssa_name, sqr_ssa_name);
>>> +      gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT);
>>> +      gcc_assert (gsi_stmt (gsi2) == stmt);
>>> +      gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name);
>>> +      fold_stmt (&gsi2);
>>> +      update_stmt (stmt);
>> In this case we're replacing the statement in its original position,
>> so there's no real need to use a temporary.  It seems better to
>> change the rhs_code, rhs1 and rhs2 of stmt in-place, with the same
>> lhs as before.
>
> Yes, that's cleaner.
>
>>> @@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun)
>>>         if (optimize_bb_for_size_p (bb))
>>>           continue;
>> Seems unnecessary to skip the new optimisation when optimising for size.
>> Like you say, it saves a multiplication overall.  Also:
>
> Indeed.
>
>>> +      if (flag_unsafe_math_optimizations)
>>> +    {
>>> +      for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
>>> +           !gsi_end_p (gsi);
>>> +           gsi_next (&gsi))
>>> +        {
>>> +          gimple *stmt = gsi_stmt (gsi);
>>> +
>>> +          if (gimple_has_lhs (stmt)
>>> +          && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
>>> +          && FLOAT_TYPE_P (TREE_TYPE (def))
>>> +          && TREE_CODE (def) == SSA_NAME
>>> +          && is_gimple_assign (stmt)
>>> +          && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
>>> +        optimize_recip_sqrt (&gsi, def);
>>> +        }
>>> +    }
>> It looks like this could safely be done in one of the existing walks
>> (e.g. the execute_cse_reciprocals_1 one, if we do this when optimising
>> for size).
>
> You're right. I've moved this into one of the walks above this.
>
> Bootstrapped and tested on aarch64-none-linux-gnu and x86_64-unknown-linux-gnu.
> CC'ing richi as he's reviewed these kinds of patches in the past.
>
> Is this ok for trunk?
>
> Thanks,
> Kyrill
>
> 2018-08-23  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
>     * tree-ssa-math-opts.c (is_mult_by): New function.
>     (is_square_of): Use the above.
>     (optimize_recip_sqrt): New function.
>     (pass_cse_reciprocals::execute): Use the above.
>
> 2018-08-23  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
>     * gcc.dg/recip_sqrt_mult_1.c: New test.
>     * gcc.dg/recip_sqrt_mult_2.c: Likewise.
>     * gcc.dg/recip_sqrt_mult_3.c: Likewise.
>
>> Thanks,
>> Richard
>
Richard Biener Aug. 31, 2018, 11:07 a.m. UTC | #4
On Thu, 30 Aug 2018, Kyrill Tkachov wrote:

> Ping.
> 
> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg01496.html
> 
> Thanks,
> Kyrill
> 
> On 23/08/18 18:09, Kyrill Tkachov wrote:
> > Hi Richard,
> > 
> > On 23/08/18 11:13, Richard Sandiford wrote:
> > > Kyrill  Tkachov <kyrylo.tkachov@foss.arm.com> writes:
> > > > Hi all,
> > > > 
> > > > This patch aims to optimise sequences involving uses of 1.0 / sqrt (a)
> > > > under -freciprocal-math and -funsafe-math-optimizations.
> > > > In particular consider:
> > > > 
> > > > x = 1.0 / sqrt (a);
> > > > r1 = x * x;  // same as 1.0 / a
> > > > r2 = a * x; // same as sqrt (a)
> > > > 
> > > > If x, r1 and r2 are all used further on in the code, this can be
> > > > transformed into:
> > > > tmp1 = 1.0 / a
> > > > tmp2 = sqrt (a)
> > > > tmp3 = tmp1 * tmp2
> > > > x = tmp3
> > > > r1 = tmp1
> > > > r2 = tmp2
> > > Nice optimisation :-)  Someone who knows the pass better should review,
> > > but:
> > 
> > Thanks for the review.
> > 
> > > There seems to be an implicit assumption that this is a win even
> > > when the r1 and r2 assignments are only conditionally executed.
> > > That's probably true, but it might be worth saying explicitly.
> > 
> > I'll admit I had not considered that case.
> > I think it won't make a difference in practice, as the really expensive
> > operations here
> > are the sqrt and the division and they are on the executed path in either
> > case and them
> > becoming independent should be a benefit of its own.
> > 
> > > > +/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
> > > > +
> > > > +static inline bool
> > > > +is_mult_by (gimple *use_stmt, tree def, tree a)
> > > > +{
> > > > +  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
> > > > +      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
> > > > +    {
> > > > +      tree op0 = gimple_assign_rhs1 (use_stmt);
> > > > +      tree op1 = gimple_assign_rhs2 (use_stmt);
> > > > +
> > > > +      return (op0 == def && op1 == a)
> > > > +          || (op0 == a && op1 == def);
> > > > +    }
> > > > +  return 0;
> > > > +}
> > > Seems like is_square_of could now be a light-weight wrapper around this.
> > 
> > Indeed, I've done the wrapping now.
> > 
> > > > @@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator
> > > > *def_gsi, tree def)
> > > >     occ_head = NULL;
> > > >   }
> > > >   +/* Transform sequences like
> > > > +   x = 1.0 / sqrt (a);
> > > > +   r1 = x * x;
> > > > +   r2 = a * x;
> > > > +   into:
> > > > +   tmp1 = 1.0 / a;
> > > > +   tmp2 = sqrt (a);
> > > > +   tmp3 = tmp1 * tmp2;
> > > > +   x = tmp3;
> > > > +   r1 = tmp1;
> > > > +   r2 = tmp2;
> > > > +   depending on the uses of x, r1, r2.  This removes one multiplication
> > > > and
> > > > +   allows the sqrt and division operations to execute in parallel.
> > > > +   DEF_GSI is the gsi of the initial division by sqrt that defines
> > > > +   DEF (x in the example above).  */
> > > > +
> > > > +static void
> > > > +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
> > > > +{
> > > > +  use_operand_p use_p;
> > > > +  imm_use_iterator use_iter;
> > > > +  gimple *stmt = gsi_stmt (*def_gsi);
> > > > +  tree x = def;
> > > > +  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
> > > > +  tree div_rhs1 = gimple_assign_rhs1 (stmt);
> > > > +
> > > > +  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
> > > > +      || TREE_CODE (div_rhs1) != REAL_CST
> > > > +      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
> > > > +    return;
> > > > +
> > > > +  gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name);
> > > > +  if (!is_gimple_call (sqrt_stmt)
> > > > +      || !gimple_call_lhs (sqrt_stmt))
> > > > +    return;
> > > > +
> > > > +  gcall *call = as_a <gcall *> (sqrt_stmt);
> > > Very minor, but:
> > > 
> > >    gcall *sqrt_stmt
> > >      = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
> > >    if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
> > >      return;
> > > 
> > > would avoid the need for the separate as_a<>, and would mean that
> > > we only call gimple_call_* on gcalls.
> > 
> > Ok.
> > 
> > > > +  if (has_other_use)
> > > > +    {
> > > > +      /* Using the two temporaries tmp1, tmp2 from above
> > > > +     the original x is now:
> > > > +     x = tmp1 * tmp2.  */
> > > > +      gcc_assert (mult_ssa_name);
> > > > +      gcc_assert (sqr_ssa_name);
> > > > +      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
> > > > +
> > > > +      tree new_ssa_name
> > > > +    = make_temp_ssa_name (TREE_TYPE (a), NULL,
> > > > "recip_sqrt_transformed");
> > > > +      gimple *new_stmt
> > > > +    = gimple_build_assign (new_ssa_name, MULT_EXPR,
> > > > +                   mult_ssa_name, sqr_ssa_name);
> > > > +      gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT);
> > > > +      gcc_assert (gsi_stmt (gsi2) == stmt);
> > > > +      gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name);
> > > > +      fold_stmt (&gsi2);
> > > > +      update_stmt (stmt);
> > > In this case we're replacing the statement in its original position,
> > > so there's no real need to use a temporary.  It seems better to
> > > change the rhs_code, rhs1 and rhs2 of stmt in-place, with the same
> > > lhs as before.
> > 
> > Yes, that's cleaner.
> > 
> > > > @@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun)
> > > >         if (optimize_bb_for_size_p (bb))
> > > >           continue;
> > > Seems unnecessary to skip the new optimisation when optimising for size.
> > > Like you say, it saves a multiplication overall.  Also:
> > 
> > Indeed.
> > 
> > > > +      if (flag_unsafe_math_optimizations)
> > > > +    {
> > > > +      for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
> > > > +           !gsi_end_p (gsi);
> > > > +           gsi_next (&gsi))
> > > > +        {
> > > > +          gimple *stmt = gsi_stmt (gsi);
> > > > +
> > > > +          if (gimple_has_lhs (stmt)
> > > > +          && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
> > > > +          && FLOAT_TYPE_P (TREE_TYPE (def))
> > > > +          && TREE_CODE (def) == SSA_NAME
> > > > +          && is_gimple_assign (stmt)
> > > > +          && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
> > > > +        optimize_recip_sqrt (&gsi, def);
> > > > +        }
> > > > +    }
> > > It looks like this could safely be done in one of the existing walks
> > > (e.g. the execute_cse_reciprocals_1 one, if we do this when optimising
> > > for size).
> > 
> > You're right. I've moved this into one of the walks above this.
> > 
> > Bootstrapped and tested on aarch64-none-linux-gnu and
> > x86_64-unknown-linux-gnu.
> > CC'ing richi as he's reviewed these kinds of patches in the past.
> > 
> > Is this ok for trunk?

I wonder how it interacts with execute_cse_reciprocals_1, given that it
introduces a division 1.0 / a which will not be processed by
execute_cse_reciprocals_1, since that function operates by walking uses of 'a'.
That may be just a missed optimization of course.
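
For example (a purely hypothetical case):

  x = 1.0 / sqrt (a);
  r1 = x * x;   /* rewritten to 1.0 / a by the new code  */
  r2 = b / a;   /* a pre-existing division by a  */

where the rewritten 1.0 / a and the b / a division might otherwise have been
candidates for sharing a single reciprocal.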

+  if (has_other_use)
+    {
+      /* Using the two temporaries tmp1, tmp2 from above
+        the original x is now:
+        x = tmp1 * tmp2.  */
+      gcc_assert (mult_ssa_name);
+      gcc_assert (sqr_ssa_name);
+
+      gimple_assign_set_rhs1 (stmt, mult_ssa_name);
+      gimple_assign_set_rhs2 (stmt, sqr_ssa_name);
+      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
+      fold_stmt_inplace (def_gsi);
+      update_stmt (stmt);
+    }

so you are leaving the original stmt in place unchanged even if it is
not used?  Why?  Note that with -fnon-call-exceptions this stmt may
throw, so you should arrange to code-generate 1./a in place of the
original division to preserve EH behavior.  Watch out because then
with other uses you have to find a place to insert its computation.

+      if (is_square_of (stmt2, x))
+       {
+         if (!sqr_stmts.contains (stmt2))
+           sqr_stmts.safe_push (stmt2);
+       }

this is quadratic in the number of square stmts...  please consider
making sqr_stmts a bitmap of SSA defs (so the stmt you have now
is then SSA_NAME_DEF_STMT (ssa_name (bitmap-element))).
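
i.e. roughly:

  auto_bitmap sqr_ssa_names;
  /* ... when recording a squaring use:  */
  bitmap_set_bit (sqr_ssa_names,
                  SSA_NAME_VERSION (gimple_assign_lhs (stmt2)));
  /* ... and when rewriting the squaring uses:  */
  bitmap_iterator bi;
  unsigned int i;
  EXECUTE_IF_SET_IN_BITMAP (sqr_ssa_names, 0, i, bi)
    {
      gimple *sqr_stmt = SSA_NAME_DEF_STMT (ssa_name (i));
      /* rewrite sqr_stmt as before.  */
    }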

You do not seem to restrict placement of the use stmts but insert
before the 1/sqrt(a) stmt.  That possibly hoists the multiplications
where they are not needed.  Consider

  x = 1./sqrt(a);
  if (l)
    l1 = x * 3.;
  else if (l2)
    l1 = x * x;
  else if (l3)
    l1 = a * x;

or similar where on the path to x * 3. you now perform two extra
multiplications.

Your testcases do not cover the case of other uses at all.  Or of
EH.

Richard.


> > Thanks,
> > Kyrill
> > 
> > 2018-08-23  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> > 
> >     * tree-ssa-math-opts.c (is_mult_by): New function.
> >     (is_square_of): Use the above.
> >     (optimize_recip_sqrt): New function.
> >     (pass_cse_reciprocals::execute): Use the above.
> > 
> > 2018-08-23  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> > 
> >     * gcc.dg/recip_sqrt_mult_1.c: New test.
> >     * gcc.dg/recip_sqrt_mult_2.c: Likewise.
> >     * gcc.dg/recip_sqrt_mult_3.c: Likewise.
> > 
> > > Thanks,
> > > Richard
> > 
> 
>
Kyrill Tkachov Sept. 4, 2018, 12:23 p.m. UTC | #5
Hi Richard,

On 31/08/18 12:07, Richard Biener wrote:
 > On Thu, 30 Aug 2018, Kyrill Tkachov wrote:
 >
 >> Ping.
 >>
 >> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg01496.html
 >>
 >> Thanks,
 >> Kyrill
 >>
 >> On 23/08/18 18:09, Kyrill Tkachov wrote:
 >>> Hi Richard,
 >>>
 >>> On 23/08/18 11:13, Richard Sandiford wrote:
 >>>> Kyrill  Tkachov <kyrylo.tkachov@foss.arm.com> writes:
 >>>>> Hi all,
 >>>>>
 >>>>> This patch aims to optimise sequences involving uses of 1.0 / sqrt (a)
 >>>>> under -freciprocal-math and -funsafe-math-optimizations.
 >>>>> In particular consider:
 >>>>>
 >>>>> x = 1.0 / sqrt (a);
 >>>>> r1 = x * x;  // same as 1.0 / a
 >>>>> r2 = a * x; // same as sqrt (a)
 >>>>>
 >>>>> If x, r1 and r2 are all used further on in the code, this can be
 >>>>> transformed into:
 >>>>> tmp1 = 1.0 / a
 >>>>> tmp2 = sqrt (a)
 >>>>> tmp3 = tmp1 * tmp2
 >>>>> x = tmp3
 >>>>> r1 = tmp1
 >>>>> r2 = tmp2
 >>>> Nice optimisation :-)  Someone who knows the pass better should review,
 >>>> but:
 >>>
 >>> Thanks for the review.
 >>>
 >>>> There seems to be an implicit assumption that this is a win even
 >>>> when the r1 and r2 assignments are only conditionally executed.
 >>>> That's probably true, but it might be worth saying explicitly.
 >>>
 >>> I'll admit I had not considered that case.
 >>> I think it won't make a difference in practice, as the really expensive
 >>> operations here
 >>> are the sqrt and the division and they are on the executed path in either
 >>> case and them
 >>> becoming independent should be a benefit of its own.
 >>>
 >>>>> +/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
 >>>>> +
 >>>>> +static inline bool
 >>>>> +is_mult_by (gimple *use_stmt, tree def, tree a)
 >>>>> +{
 >>>>> +  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
 >>>>> +      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
 >>>>> +    {
 >>>>> +      tree op0 = gimple_assign_rhs1 (use_stmt);
 >>>>> +      tree op1 = gimple_assign_rhs2 (use_stmt);
 >>>>> +
 >>>>> +      return (op0 == def && op1 == a)
 >>>>> +          || (op0 == a && op1 == def);
 >>>>> +    }
 >>>>> +  return 0;
 >>>>> +}
 >>>> Seems like is_square_of could now be a light-weight wrapper around this.
 >>>
 >>> Indeed, I've done the wrapping now.
 >>>
 >>>>> @@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator
 >>>>> *def_gsi, tree def)
 >>>>>     occ_head = NULL;
 >>>>>   }
 >>>>>   +/* Transform sequences like
 >>>>> +   x = 1.0 / sqrt (a);
 >>>>> +   r1 = x * x;
 >>>>> +   r2 = a * x;
 >>>>> +   into:
 >>>>> +   tmp1 = 1.0 / a;
 >>>>> +   tmp2 = sqrt (a);
 >>>>> +   tmp3 = tmp1 * tmp2;
 >>>>> +   x = tmp3;
 >>>>> +   r1 = tmp1;
 >>>>> +   r2 = tmp2;
 >>>>> +   depending on the uses of x, r1, r2.  This removes one multiplication
 >>>>> and
 >>>>> +   allows the sqrt and division operations to execute in parallel.
 >>>>> +   DEF_GSI is the gsi of the initial division by sqrt that defines
 >>>>> +   DEF (x in the example above).  */
 >>>>> +
 >>>>> +static void
 >>>>> +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
 >>>>> +{
 >>>>> +  use_operand_p use_p;
 >>>>> +  imm_use_iterator use_iter;
 >>>>> +  gimple *stmt = gsi_stmt (*def_gsi);
 >>>>> +  tree x = def;
 >>>>> +  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
 >>>>> +  tree div_rhs1 = gimple_assign_rhs1 (stmt);
 >>>>> +
 >>>>> +  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
 >>>>> +      || TREE_CODE (div_rhs1) != REAL_CST
 >>>>> +      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
 >>>>> +    return;
 >>>>> +
 >>>>> +  gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name);
 >>>>> +  if (!is_gimple_call (sqrt_stmt)
 >>>>> +      || !gimple_call_lhs (sqrt_stmt))
 >>>>> +    return;
 >>>>> +
 >>>>> +  gcall *call = as_a <gcall *> (sqrt_stmt);
 >>>> Very minor, but:
 >>>>
 >>>>    gcall *sqrt_stmt
 >>>>      = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
 >>>>    if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
 >>>>      return;
 >>>>
 >>>> would avoid the need for the separate as_a<>, and would mean that
 >>>> we only call gimple_call_* on gcalls.
 >>>
 >>> Ok.
 >>>
 >>>>> +  if (has_other_use)
 >>>>> +    {
 >>>>> +      /* Using the two temporaries tmp1, tmp2 from above
 >>>>> +     the original x is now:
 >>>>> +     x = tmp1 * tmp2.  */
 >>>>> +      gcc_assert (mult_ssa_name);
 >>>>> +      gcc_assert (sqr_ssa_name);
 >>>>> +      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
 >>>>> +
 >>>>> +      tree new_ssa_name
 >>>>> +    = make_temp_ssa_name (TREE_TYPE (a), NULL,
 >>>>> "recip_sqrt_transformed");
 >>>>> +      gimple *new_stmt
 >>>>> +    = gimple_build_assign (new_ssa_name, MULT_EXPR,
 >>>>> +                   mult_ssa_name, sqr_ssa_name);
 >>>>> +      gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT);
 >>>>> +      gcc_assert (gsi_stmt (gsi2) == stmt);
 >>>>> +      gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name);
 >>>>> +      fold_stmt (&gsi2);
 >>>>> +      update_stmt (stmt);
 >>>> In this case we're replacing the statement in its original position,
 >>>> so there's no real need to use a temporary.  It seems better to
 >>>> change the rhs_code, rhs1 and rhs2 of stmt in-place, with the same
 >>>> lhs as before.
 >>>
 >>> Yes, that's cleaner.
 >>>
 >>>>> @@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun)
 >>>>>         if (optimize_bb_for_size_p (bb))
 >>>>>           continue;
 >>>> Seems unnecessary to skip the new optimisation when optimising for size.
 >>>> Like you say, it saves a multiplication overall. Also:
 >>>
 >>> Indeed.
 >>>
 >>>>> +      if (flag_unsafe_math_optimizations)
 >>>>> +    {
 >>>>> +      for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
 >>>>> +           !gsi_end_p (gsi);
 >>>>> +           gsi_next (&gsi))
 >>>>> +        {
 >>>>> +          gimple *stmt = gsi_stmt (gsi);
 >>>>> +
 >>>>> +          if (gimple_has_lhs (stmt)
 >>>>> +          && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 >>>>> +          && FLOAT_TYPE_P (TREE_TYPE (def))
 >>>>> +          && TREE_CODE (def) == SSA_NAME
 >>>>> +          && is_gimple_assign (stmt)
 >>>>> +          && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 >>>>> +        optimize_recip_sqrt (&gsi, def);
 >>>>> +        }
 >>>>> +    }
 >>>> It looks like this could safely be done in one of the existing walks
 >>>> (e.g. the execute_cse_reciprocals_1 one, if we do this when optimising
 >>>> for size).
 >>>
 >>> You're right. I've moved this into one of the walks above this.
 >>>
 >>> Bootstrapped and tested on aarch64-none-linux-gnu and
 >>> x86_64-unknown-linux-gnu.
 >>> CC'ing richi as he's reviewed these kinds of patches in the past.
 >>>
 >>> Is this ok for trunk?
 >
 > I wonder how it interacts with execute_cse_reciprocals_1, given that it
 > introduces a division 1.0 / a which will not be processed by
 > execute_cse_reciprocals_1, since that function operates by walking uses of 'a'.
 > That may be just a missed optimization of course.

Hmm, I believe right now it doesn't interact with execute_cse_reciprocals_1 as it's either one or the other.
I've left it as it is for now, but would you like us to call execute_cse_reciprocals_1 on the potentially transformed division?


 >
 > +  if (has_other_use)
 > +    {
 > +      /* Using the two temporaries tmp1, tmp2 from above
 > +        the original x is now:
 > +        x = tmp1 * tmp2.  */
 > +      gcc_assert (mult_ssa_name);
 > +      gcc_assert (sqr_ssa_name);
 > +
 > +      gimple_assign_set_rhs1 (stmt, mult_ssa_name);
 > +      gimple_assign_set_rhs2 (stmt, sqr_ssa_name);
 > +      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 > +      fold_stmt_inplace (def_gsi);
 > +      update_stmt (stmt);
 > +    }
 >
 > so you are leaving the original stmt in place unchanged even if it is
 > not used?  Why?  Note that with -fnon-call-exceptions this stmt may
 > throw, so you should arrange to code-generate 1./a in place of the
 > original division to preserve EH behavior.  Watch out because then
 > with other uses you have to find a place to insert its computation.

Ok. These are oversights on my part. I've updated the patch to modify the
original division in place. The multiplication is placed after it.


 >
 > +      if (is_square_of (stmt2, x))
 > +       {
 > +         if (!sqr_stmts.contains (stmt2))
 > +           sqr_stmts.safe_push (stmt2);
 > +       }
 >
 > this is quadratic in the number of square stmts...  please consider
 > making sqr_stmts a bitmap of SSA defs (so the stmt you have now
 > is then SSA_NAME_DEF_STMT (ssa_name (bitmap-element))).
 >

Done. In practice I didn't see there being more than one such use; I expect them
to be CSE'd, but maybe not if they're in different basic blocks...

 > You do not seem to restrict placement of the use stmts but insert
 > before the 1/sqrt(a) stmt.  That possibly hoists the multiplications
 > where they are not needed.  Consider
 >
 >   x = 1./sqrt(a);
 >   if (l)
 >     l1 = x * 3.;
 >   else if (l2)
 >     l1 = x * x;
 >   else if (l3)
 >     l1 = a * x;
 >
 > or similar where on the path to x * 3. you now perform two extra
 > multiplications.
 >

Ok, I've restricted the optimisation somewhat.
Now if there's another use it won't perform the transformation unless there is already
a multiplication present on the main path. That way it won't introduce multiplications
on paths where there aren't any.


 > Your testcases do not cover the case of other uses at all.  Or of
 > EH.

gcc.dg/recip_sqrt_mult_1.c should handle the other uses case as tmp is a global
being written to. I've added more testcases involving uses in different basic blocks
and a g++.dg testcase with -fnon-call-exceptions.
I'm not sure what functionality to test, though, apart from checking that it doesn't ICE
and does the transformations I expect.

Bootstrapped and tested on aarch64-none-linux-gnu.
Is this version better?

Thanks,
Kyrill


2018-09-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * tree-ssa-math-opts.c (is_mult_by): New function.
     (is_square_of): Use the above.
     (optimize_recip_sqrt): New function.
     (pass_cse_reciprocals::execute): Use the above.

2018-09-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * gcc.dg/recip_sqrt_mult_1.c: New test.
     * gcc.dg/recip_sqrt_mult_2.c: Likewise.
     * gcc.dg/recip_sqrt_mult_3.c: Likewise.
     * gcc.dg/recip_sqrt_mult_4.c: Likewise.
     * gcc.dg/recip_sqrt_mult_5.c: Likewise.
     * g++.dg/recip_sqrt_mult_1.C: Likewise.
diff --git a/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C
new file mode 100644
index 0000000000000000000000000000000000000000..11d9c6f758f1529d8ed4cadf85010f6ce379c195
--- /dev/null
+++ b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fnon-call-exceptions -fdump-tree-recip" } */
+
+double res, res2, tmp;
+void
+foo1 (double a, double b)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+    res = tmp * tmp;
+    res2 = a * tmp;
+  }
+  catch (...)
+    { ; }
+}
+
+void
+foo4 (double a, double b, int c, int d)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+  }
+  catch (...)
+    {
+      if (c)
+	res = tmp * tmp;
+
+      if (d)
+	res2 = a * tmp;
+    }
+}
+
+void
+foo5 (double a, double b, int c, int d)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+    res = tmp * tmp;
+
+    if (d)
+      res2 = a * tmp;
+  }
+  catch (...)
+    { ; }
+}
+
+/* { dg-final { scan-tree-dump-times "Optimizing reciprocal sqrt multiplications" 2 "recip" } } */
+/* { dg-final { scan-tree-dump-times "Replacing squaring multiplication" 2 "recip" } } */
+/* { dg-final { scan-tree-dump-times "Replacing original division" 2 "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..188390a4ecffca1b21f05c4f19abecfb7ebd188f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+double res, res2, tmp;
+void
+foo (double a, double b)
+{
+  tmp = 1.0 / __builtin_sqrt (a);
+  res = tmp * tmp;
+  res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..c5fc3de7b657b1769e76254b4bc874e0595e43ef
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+float
+foo (float a)
+{
+  float tmp = 1.0f / __builtin_sqrtf (a);
+  return a * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not " / " "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..e7d185ba7e22cfc7cca72296d5ccc544f24fdb14
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+double
+foo (double a)
+{
+  double tmp = 1.0f / __builtin_sqrt (a);
+  return tmp * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not "__builtin_sqrt" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..e3005f2feb6f4bacbb6eafc0155e196cb866fcdf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+/* The main path doesn't have any multiplications.
+   Avoid introducing them in the recip pass.  */
+
+double res, res2, tmp;
+void
+foo (double a, double b, int c, int d)
+{
+  tmp = 1.0 / __builtin_sqrt (a);
+  if (c)
+    res = tmp * tmp;
+
+  if (d)
+    res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump-not "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump-not "Replacing original division" "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c
new file mode 100644
index 0000000000000000000000000000000000000000..e871f0fcd4feb1687f9815e4babf4d0667a15ea8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+/* We want to do the recip_sqrt transformations here as there is already
+   a multiplication on the main path.  */
+
+double res, res2, tmp;
+void
+foo (double a, double b, int c, int d)
+{
+  tmp = 1.0 / __builtin_sqrt (a);
+  res = tmp * tmp;
+
+  if (d)
+    res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 25378da6f4ab27dbce51e10461efc18cc416a4d7..297afcd7a8f9cfc5f1edc1a762371b2a127c66db 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -337,9 +337,9 @@ is_division_by (gimple *use_stmt, tree def)
 	 && gimple_assign_rhs1 (use_stmt) != def;
 }
 
-/* Return whether USE_STMT is DEF * DEF.  */
+/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
 static inline bool
-is_square_of (gimple *use_stmt, tree def)
+is_mult_by (gimple *use_stmt, tree def, tree a)
 {
   if (gimple_code (use_stmt) == GIMPLE_ASSIGN
       && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
@@ -347,11 +347,19 @@ is_square_of (gimple *use_stmt, tree def)
       tree op0 = gimple_assign_rhs1 (use_stmt);
       tree op1 = gimple_assign_rhs2 (use_stmt);
 
-      return op0 == op1 && op0 == def;
+      return (op0 == def && op1 == a)
+	      || (op0 == a && op1 == def);
     }
   return 0;
 }
 
+/* Return whether USE_STMT is DEF * DEF.  */
+static inline bool
+is_square_of (gimple *use_stmt, tree def)
+{
+  return is_mult_by (use_stmt, def, def);
+}
+
 /* Return whether USE_STMT is a floating-point division by
    DEF * DEF.  */
 static inline bool
@@ -526,6 +534,195 @@ free_bb (struct occurrence *occ)
     }
 }
 
+/* Transform sequences like
+   t = sqrt (a)
+   x = 1.0 / t;
+   r1 = x * x;
+   r2 = a * x;
+   into:
+   t = sqrt (a)
+   r1 = 1.0 / a;
+   r2 = t;
+   x = r1 * r2;
+   depending on the uses of x, r1, r2.  This removes one multiplication and
+   allows the sqrt and division operations to execute in parallel.
+   DEF_GSI is the gsi of the initial division by sqrt that defines
+   DEF (x in the example above).  */
+
+static void
+optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
+{
+  use_operand_p use_p;
+  imm_use_iterator use_iter;
+  gimple *stmt = gsi_stmt (*def_gsi);
+  tree x = def;
+  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
+  tree div_rhs1 = gimple_assign_rhs1 (stmt);
+
+  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
+      || TREE_CODE (div_rhs1) != REAL_CST
+      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
+    return;
+
+  gcall *sqrt_stmt
+    = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
+
+  if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
+    return;
+
+  switch (gimple_call_combined_fn (sqrt_stmt))
+    {
+    CASE_CFN_SQRT:
+    CASE_CFN_SQRT_FN:
+      break;
+
+    default:
+      return;
+    }
+  tree a = gimple_call_arg (sqrt_stmt, 0);
+
+  /* We have 'a' and 'x'.  Now analyze the uses of 'x'.  */
+
+  /* Statements that use x in x * x.  Since x appears twice use an SSA_NAME
+     bitmap to avoid recording the statement twice.  */
+  auto_bitmap sqr_ssa_names;
+  /* Statements that use x in a * x.  */
+  auto_vec<gimple *> mult_stmts;
+  bool has_other_use = false;
+  bool mult_on_main_path = false;
+
+  FOR_EACH_IMM_USE_FAST (use_p, use_iter, x)
+    {
+      gimple *stmt2 = USE_STMT (use_p);
+      if (is_gimple_debug (stmt2))
+	continue;
+      if (is_square_of (stmt2, x))
+	{
+	  bitmap_set_bit (sqr_ssa_names,
+			  SSA_NAME_VERSION (gimple_assign_lhs (stmt2)));
+	  if (gimple_bb (stmt2) == gimple_bb (stmt))
+	    mult_on_main_path = true;
+	}
+      else if (is_mult_by (stmt2, x, a))
+	{
+	  mult_stmts.safe_push (stmt2);
+	  if (gimple_bb (stmt2) == gimple_bb (stmt))
+	    mult_on_main_path = true;
+	}
+      else
+	has_other_use = true;
+    }
+
+  /* In the x * x and a * x cases we just rewire stmt operands or
+     remove multiplications.  In the has_other_use case we introduce
+     a multiplication so make sure we don't introduce a multiplication
+     on a path where there was none.  */
+  if (has_other_use && !mult_on_main_path)
+    return;
+
+  if (bitmap_empty_p (sqr_ssa_names) && mult_stmts.is_empty ())
+    return;
+
+  /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
+     to be able to compose it from the sqr and mult cases.  */
+  if (has_other_use && (bitmap_empty_p (sqr_ssa_names)
+			|| mult_stmts.is_empty ()))
+    return;
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
+      print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
+      print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+      fprintf (dump_file, "\n");
+    }
+
+  bool delete_div = !has_other_use;
+  tree sqr_ssa_name = NULL_TREE;
+  if (!bitmap_empty_p (sqr_ssa_names))
+    {
+      /* r1 = x * x.  Transform the original
+	 x = 1.0 / t
+	 into
+	 tmp1 = 1.0 / a
+	 r1 = tmp1.  */
+
+      sqr_ssa_name
+	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");
+
+      if (dump_file)
+	{
+	  fprintf (dump_file, "Replacing original division\n");
+	  print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+	  fprintf (dump_file, "with new division\n");
+	}
+      gimple_assign_set_lhs (stmt, sqr_ssa_name);
+      gimple_assign_set_rhs2 (stmt, a);
+      fold_stmt_inplace (def_gsi);
+      update_stmt (stmt);
+
+      if (dump_file)
+	print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+
+      delete_div = false;
+      gimple *sqr_stmt;
+      bitmap_iterator bi;
+      unsigned int i;
+      EXECUTE_IF_SET_IN_BITMAP (sqr_ssa_names, 0, i, bi)
+	{
+	  sqr_stmt = SSA_NAME_DEF_STMT (ssa_name (i));
+
+	  gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
+	  gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
+	  update_stmt (sqr_stmt);
+	}
+    }
+  if (!mult_stmts.is_empty ())
+    {
+      /* r2 = a * x.  Transform this into:
+	 r2 = t (The original sqrt (a)).  */
+      unsigned int i;
+      gimple *mult_stmt = NULL;
+      FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
+	{
+	  gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);
+
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Replacing squaring multiplication\n");
+	      print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
+	      fprintf (dump_file, "with assignment\n");
+	    }
+	  gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
+	  fold_stmt_inplace (&gsi2);
+	  update_stmt (mult_stmt);
+	  if (dump_file)
+	    print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
+	}
+    }
+
+  if (has_other_use)
+    {
+      /* Using the two temporaries tmp1, tmp2 from above
+	 the original x is now:
+	 x = tmp1 * tmp2.  */
+      gcc_assert (orig_sqrt_ssa_name);
+      gcc_assert (sqr_ssa_name);
+
+      gimple *new_stmt
+	= gimple_build_assign (x, MULT_EXPR,
+				orig_sqrt_ssa_name, sqr_ssa_name);
+      gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
+      update_stmt (stmt);
+    }
+  else if (delete_div)
+    {
+      /* Remove the original division.  */
+      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
+      gsi_remove (&gsi2, true);
+      release_defs (stmt);
+    }
+}
 
 /* Look for floating-point divisions among DEF's uses, and try to
    replace them by multiplications with the reciprocal.  Add
@@ -546,6 +743,7 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
   int sqrt_recip_count = 0;
 
   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME);
+
   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 
   /* If DEF is a square (x * x), count the number of divisions by x.
@@ -756,7 +954,14 @@ pass_cse_reciprocals::execute (function *fun)
 	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 	      && FLOAT_TYPE_P (TREE_TYPE (def))
 	      && TREE_CODE (def) == SSA_NAME)
-	    execute_cse_reciprocals_1 (&gsi, def);
+	    {
+	      if (flag_unsafe_math_optimizations
+		  && is_gimple_assign (stmt)
+		  && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
+		optimize_recip_sqrt (&gsi, def);
+	      else
+		execute_cse_reciprocals_1 (&gsi, def);
+	    }
 	}
 
       if (optimize_bb_for_size_p (bb))
Richard Biener Sept. 4, 2018, 2:31 p.m. UTC | #6
On Tue, 4 Sep 2018, Kyrill Tkachov wrote:

> Hi Richard,
> 
> On 31/08/18 12:07, Richard Biener wrote:
> > On Thu, 30 Aug 2018, Kyrill Tkachov wrote:
> >
> >> Ping.
> >>
> >> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg01496.html
> >>
> >> Thanks,
> >> Kyrill
> >>
> >> On 23/08/18 18:09, Kyrill Tkachov wrote:
> >>> Hi Richard,
> >>>
> >>> On 23/08/18 11:13, Richard Sandiford wrote:
> >>>> Kyrill  Tkachov <kyrylo.tkachov@foss.arm.com> writes:
> >>>>> Hi all,
> >>>>>
> >>>>> This patch aims to optimise sequences involving uses of 1.0 / sqrt (a)
> >>>>> under -freciprocal-math and -funsafe-math-optimizations.
> >>>>> In particular consider:
> >>>>>
> >>>>> x = 1.0 / sqrt (a);
> >>>>> r1 = x * x;  // same as 1.0 / a
> >>>>> r2 = a * x; // same as sqrt (a)
> >>>>>
> >>>>> If x, r1 and r2 are all used further on in the code, this can be
> >>>>> transformed into:
> >>>>> tmp1 = 1.0 / a
> >>>>> tmp2 = sqrt (a)
> >>>>> tmp3 = tmp1 * tmp2
> >>>>> x = tmp3
> >>>>> r1 = tmp1
> >>>>> r2 = tmp2
> >>>> Nice optimisation :-)  Someone who knows the pass better should review,
> >>>> but:
> >>>
> >>> Thanks for the review.
> >>>
> >>>> There seems to be an implicit assumption that this is a win even
> >>>> when the r1 and r2 assignments are only conditionally executed.
> >>>> That's probably true, but it might be worth saying explicitly.
> >>>
> >>> I'll admit I had not considered that case.
> >>> I think it won't make a difference in practice, as the really expensive
> >>> operations here
> >>> are the sqrt and the division and they are on the executed path in either
> >>> case and them
> >>> becoming independent should be a benefit of its own.
> >>>
> >>>>> +/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
> >>>>> +
> >>>>> +static inline bool
> >>>>> +is_mult_by (gimple *use_stmt, tree def, tree a)
> >>>>> +{
> >>>>> +  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
> >>>>> +      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
> >>>>> +    {
> >>>>> +      tree op0 = gimple_assign_rhs1 (use_stmt);
> >>>>> +      tree op1 = gimple_assign_rhs2 (use_stmt);
> >>>>> +
> >>>>> +      return (op0 == def && op1 == a)
> >>>>> +          || (op0 == a && op1 == def);
> >>>>> +    }
> >>>>> +  return 0;
> >>>>> +}
> >>>> Seems like is_square_of could now be a light-weight wrapper around this.
> >>>
> >>> Indeed, I've done the wrapping now.
> >>>
> >>>>> @@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator
> >>>>> *def_gsi, tree def)
> >>>>>     occ_head = NULL;
> >>>>>   }
> >>>>>   +/* Transform sequences like
> >>>>> +   x = 1.0 / sqrt (a);
> >>>>> +   r1 = x * x;
> >>>>> +   r2 = a * x;
> >>>>> +   into:
> >>>>> +   tmp1 = 1.0 / a;
> >>>>> +   tmp2 = sqrt (a);
> >>>>> +   tmp3 = tmp1 * tmp2;
> >>>>> +   x = tmp3;
> >>>>> +   r1 = tmp1;
> >>>>> +   r2 = tmp2;
> >>>>> +   depending on the uses of x, r1, r2.  This removes one multiplication
> >>>>> and
> >>>>> +   allows the sqrt and division operations to execute in parallel.
> >>>>> +   DEF_GSI is the gsi of the initial division by sqrt that defines
> >>>>> +   DEF (x in the example abovs).  */
> >>>>> +
> >>>>> +static void
> >>>>> +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
> >>>>> +{
> >>>>> +  use_operand_p use_p;
> >>>>> +  imm_use_iterator use_iter;
> >>>>> +  gimple *stmt = gsi_stmt (*def_gsi);
> >>>>> +  tree x = def;
> >>>>> +  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
> >>>>> +  tree div_rhs1 = gimple_assign_rhs1 (stmt);
> >>>>> +
> >>>>> +  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
> >>>>> +      || TREE_CODE (div_rhs1) != REAL_CST
> >>>>> +      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
> >>>>> +    return;
> >>>>> +
> >>>>> +  gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name);
> >>>>> +  if (!is_gimple_call (sqrt_stmt)
> >>>>> +      || !gimple_call_lhs (sqrt_stmt))
> >>>>> +    return;
> >>>>> +
> >>>>> +  gcall *call = as_a <gcall *> (sqrt_stmt);
> >>>> Very minor, but:
> >>>>
> >>>>    gcall *sqrt_stmt
> >>>>      = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
> >>>>    if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
> >>>>      return;
> >>>>
> >>>> would avoid the need for the separate as_a<>, and would mean that
> >>>> we only call gimple_call_* on gcalls.
> >>>
> >>> Ok.
> >>>
> >>>>> +  if (has_other_use)
> >>>>> +    {
> >>>>> +      /* Using the two temporaries tmp1, tmp2 from above
> >>>>> +     the original x is now:
> >>>>> +     x = tmp1 * tmp2.  */
> >>>>> +      gcc_assert (mult_ssa_name);
> >>>>> +      gcc_assert (sqr_ssa_name);
> >>>>> +      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
> >>>>> +
> >>>>> +      tree new_ssa_name
> >>>>> +    = make_temp_ssa_name (TREE_TYPE (a), NULL,
> >>>>> "recip_sqrt_transformed");
> >>>>> +      gimple *new_stmt
> >>>>> +    = gimple_build_assign (new_ssa_name, MULT_EXPR,
> >>>>> +                   mult_ssa_name, sqr_ssa_name);
> >>>>> +      gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT);
> >>>>> +      gcc_assert (gsi_stmt (gsi2) == stmt);
> >>>>> +      gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name);
> >>>>> +      fold_stmt (&gsi2);
> >>>>> +      update_stmt (stmt);
> >>>> In this case we're replacing the statement in its original position,
> >>>> so there's no real need to use a temporary.  It seems better to
> >>>> change the rhs_code, rhs1 and rhs2 of stmt in-place, with the same
> >>>> lhs as before.
> >>>
> >>> Yes, that's cleaner.
> >>>
> >>>>> @@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun)
> >>>>>         if (optimize_bb_for_size_p (bb))
> >>>>>           continue;
> >>>> Seems unnecessary to skip the new optimisation when optimising for size.
> >>>> Like you say, it saves a multiplication overall. Also:
> >>>
> >>> Indeed.
> >>>
> >>>>> +      if (flag_unsafe_math_optimizations)
> >>>>> +    {
> >>>>> +      for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
> >>>>> +           !gsi_end_p (gsi);
> >>>>> +           gsi_next (&gsi))
> >>>>> +        {
> >>>>> +          gimple *stmt = gsi_stmt (gsi);
> >>>>> +
> >>>>> +          if (gimple_has_lhs (stmt)
> >>>>> +          && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
> >>>>> +          && FLOAT_TYPE_P (TREE_TYPE (def))
> >>>>> +          && TREE_CODE (def) == SSA_NAME
> >>>>> +          && is_gimple_assign (stmt)
> >>>>> +          && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
> >>>>> +        optimize_recip_sqrt (&gsi, def);
> >>>>> +        }
> >>>>> +    }
> >>>> It looks like this could safely be done in one of the existing walks
> >>>> (e.g. the execute_cse_reciprocals_1 one, if we do this when optimising
> >>>> for size).
> >>>
> >>> You're right. I've moved this into one of the walks above this.
> >>>
> >>> Bootstrapped and tested on aarch64-none-linux-gnu and
> >>> x86_64-unknown-linux-gnu.
> >>> CC'ing richi as he's reviewed these kinds of patches in the past.
> >>>
> >>> Is this ok for trunk?
> >
> > I wonder how it interacts with execute_cse_reciprocals_1 given it
> > introduces a division 1.0 / a which will be not processed by
> > execute_cse_reciprocals_1 given that operates by walking uses of 'a'.
> > That may be just a missed optimization of course.
> 
> Hmm, I believe right now it doesn't interact with execute_cse_reciprocals_1 as
> it's either one or the other.
> I've left it as it is for now, but would you like us to call
> execute_cse_reciprocals_1 on the potentially transformed division?

I think that wouldn't "fix" it given execute_cse_reciprocals_1 works by
seeing all reciprocal uses of 'a', so it may have already seen two and
you introduced the third.  But we can leave "solving" this for
future enhancements (I'd avoid doing two passes over the IL just because
of said possibility right now).
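
To illustrate the kind of input this is about (a hypothetical example,
not one of the testcases in the patch; whether the existing reciprocal
CSE fires at all also depends on the target's
min_divisions_for_recip_mul threshold):

  /* Sketch only.  The divisions b / a and c / a are seen by
     execute_cse_reciprocals_1 as uses of 'a', but the 1.0 / a that
     optimize_recip_sqrt creates for tmp * tmp is introduced afterwards
     and is not revisited, so it is not combined with them.  */
  double r1, r2, r3;
  void
  bar (double a, double b, double c)
  {
    double tmp = 1.0 / __builtin_sqrt (a);
    r1 = tmp * tmp;   /* Becomes 1.0 / a after the new transform.  */
    r2 = b / a;
    r3 = c / a;
  }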

> 
> >
> > +  if (has_other_use)
> > +    {
> > +      /* Using the two temporaries tmp1, tmp2 from above
> > +        the original x is now:
> > +        x = tmp1 * tmp2.  */
> > +      gcc_assert (mult_ssa_name);
> > +      gcc_assert (sqr_ssa_name);
> > +
> > +      gimple_assign_set_rhs1 (stmt, mult_ssa_name);
> > +      gimple_assign_set_rhs2 (stmt, sqr_ssa_name);
> > +      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
> > +      fold_stmt_inplace (def_gsi);
> > +      update_stmt (stmt);
> > +    }
> >
> > so you are leaving the original stmt in place unchanged even if it is
> > not used?  Why?  Note that with -fno-call-exceptions this stmt may
> > throw, so you should arrange to code-generate 1./a in place of the
> > original division to preserve EH behavior.  Watch out because then
> > with other uses you have to find a place to insert its computation.
> 
> Ok. These are oversights on my part. I've updated the patch to modify the
> original division in place. The multiplication is placed after it.

If the division throws internally you can't insert after it, you
have to insert on the single non-EH edge outgoing from the BB
with the division.  Just try a C++ testcase with -fnon-call-exceptions
and a try {} catch(...) {} around.

Not sure if it's worth handling, so bailing out if
stmt_can_throw_internal (stmt) would be an option as well.
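
For reference, a rough, untested sketch of what inserting on the non-EH
edge could look like (illustrative only, not part of the patch; new_stmt
stands in for whatever statement would be built there):

  /* Sketch: find the single non-EH successor edge of the block holding
     the possibly-throwing division and insert the new statement there.  */
  edge e, nonthrowing = NULL;
  edge_iterator ei;
  FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->succs)
    if (!(e->flags & EDGE_EH))
      nonthrowing = e;
  if (nonthrowing)
    gsi_insert_on_edge_immediate (nonthrowing, new_stmt);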

> >
> > +      if (is_square_of (stmt2, x))
> > +       {
> > +         if (!sqr_stmts.contains (stmt2))
> > +           sqr_stmts.safe_push (stmt2);
> > +       }
> >
> > this is quadratic in the number of square stmts...  please consider
> > making sqr_stmts a bitmap of SSA defs (so the stmt you have now
> > is then SSA_NAME_DEF_STMT (ssa_name (bitmap-element))).
> >
> 
> Done. In practice I didn't see there being more than one such use, I expect
> them
> to be CSE'd, but maybe if they're in different basic blocks...

Oh, it just occurred to me you could use FOR_EACH_IMM_USE_STMT ()
which will only get you each stmt once... ;)  Sorry for the misleading
bitmap suggestion.

> > You do not seem to restrict placement of the use stmts but insert
> > before the 1/sqrt(a) stmt.  That possibly hoists the multiplications
> > where they are not needed.  Consider
> >
> >   x = 1./sqrt(a);
> >   if (l)
> >     l1 = x * 3.;
> >   else if (l2)
> >     l1 = x * x;
> >   else if (l3)
> >     l1 = a * x;
> >
> > or similar where on the path to x * 3. you now perform two extra
> > multiplications.
> >
> 
> Ok, I've restricted the optimisation somewhat.
> Now if there's an other use it won't perform the transformation unless there
> is already
> a multiplication present on the main path. That way it won't introduce
> multiplications
> on paths where there aren't any.

OK.

> 
> > Your testcases do not cover the case of other uses at all.  Or of
> > EH.
> 
> gcc.dg/recip_sqrt_mult_1.c should handle the other uses case as tmp is a
> global
> being written to. I've added more testcases involving uses in different basic
> blocks
> and a g++.dg testcase with -fnon-call-exceptions.
> I'm not sure what functionality to test though apart from that it doesn't ICE
> and does
> the transformations I expect.

That's good enough I guess.  I see you do have a testcase with
try/catch so I wonder why foo4 doesn't ICE when you insert after
the throwing stmt...  maybe it doesn't throw?  Ah - they probably
fail your new mult_on_main_path test?

Richard.

> Bootstrapped and tested on aarch64-none-linux-gnu.
> Is this version better?
>
> Thanks,
> Kyrill
> 
> 
> 2018-09-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 
>     * tree-ssa-math-opts.c (is_mult_by): New function.
>     (is_square_of): Use the above.
>     (optimize_recip_sqrt): New function.
>     (pass_cse_reciprocals::execute): Use the above.
> 
> 2018-09-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 
>     * gcc.dg/recip_sqrt_mult_1.c: New test.
>     * gcc.dg/recip_sqrt_mult_2.c: Likewise.
>     * gcc.dg/recip_sqrt_mult_3.c: Likewise.
>     * gcc.dg/recip_sqrt_mult_4.c: Likewise.
>     * gcc.dg/recip_sqrt_mult_5.c: Likewise.
>     * g++.dg/recip_sqrt_mult_1.c: Likewise.
>
Kyrill Tkachov Sept. 4, 2018, 4:52 p.m. UTC | #7
On 04/09/18 15:31, Richard Biener wrote:
> On Tue, 4 Sep 2018, Kyrill Tkachov wrote:
>
>> Hi Richard,
>>
>> On 31/08/18 12:07, Richard Biener wrote:
>>> On Thu, 30 Aug 2018, Kyrill Tkachov wrote:
>>>
>>>> Ping.
>>>>
>>>> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg01496.html
>>>>
>>>> Thanks,
>>>> Kyrill
>>>>
>>>> On 23/08/18 18:09, Kyrill Tkachov wrote:
>>>>> Hi Richard,
>>>>>
>>>>> On 23/08/18 11:13, Richard Sandiford wrote:
>>>>>> Kyrill  Tkachov <kyrylo.tkachov@foss.arm.com> writes:
>>>>>>> Hi all,
>>>>>>>
>>>>>>> This patch aims to optimise sequences involving uses of 1.0 / sqrt (a)
>>>>>>> under -freciprocal-math and -funsafe-math-optimizations.
>>>>>>> In particular consider:
>>>>>>>
>>>>>>> x = 1.0 / sqrt (a);
>>>>>>> r1 = x * x;  // same as 1.0 / a
>>>>>>> r2 = a * x; // same as sqrt (a)
>>>>>>>
>>>>>>> If x, r1 and r2 are all used further on in the code, this can be
>>>>>>> transformed into:
>>>>>>> tmp1 = 1.0 / a
>>>>>>> tmp2 = sqrt (a)
>>>>>>> tmp3 = tmp1 * tmp2
>>>>>>> x = tmp3
>>>>>>> r1 = tmp1
>>>>>>> r2 = tmp2
>>>>>> Nice optimisation :-)  Someone who knows the pass better should review,
>>>>>> but:
>>>>> Thanks for the review.
>>>>>
>>>>>> There seems to be an implicit assumption that this is a win even
>>>>>> when the r1 and r2 assignments are only conditionally executed.
>>>>>> That's probably true, but it might be worth saying explicitly.
>>>>> I'll admit I had not considered that case.
>>>>> I think it won't make a difference in practice, as the really expensive
>>>>> operations here
>>>>> are the sqrt and the division and they are on the executed path in either
>>>>> case and them
>>>>> becoming independent should be a benefit of its own.
>>>>>
>>>>>>> +/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
>>>>>>> +
>>>>>>> +static inline bool
>>>>>>> +is_mult_by (gimple *use_stmt, tree def, tree a)
>>>>>>> +{
>>>>>>> +  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
>>>>>>> +      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
>>>>>>> +    {
>>>>>>> +      tree op0 = gimple_assign_rhs1 (use_stmt);
>>>>>>> +      tree op1 = gimple_assign_rhs2 (use_stmt);
>>>>>>> +
>>>>>>> +      return (op0 == def && op1 == a)
>>>>>>> +          || (op0 == a && op1 == def);
>>>>>>> +    }
>>>>>>> +  return 0;
>>>>>>> +}
>>>>>> Seems like is_square_of could now be a light-weight wrapper around this.
>>>>> Indeed, I've done the wrapping now.
>>>>>
>>>>>>> @@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator
>>>>>>> *def_gsi, tree def)
>>>>>>>      occ_head = NULL;
>>>>>>>    }
>>>>>>>    +/* Transform sequences like
>>>>>>> +   x = 1.0 / sqrt (a);
>>>>>>> +   r1 = x * x;
>>>>>>> +   r2 = a * x;
>>>>>>> +   into:
>>>>>>> +   tmp1 = 1.0 / a;
>>>>>>> +   tmp2 = sqrt (a);
>>>>>>> +   tmp3 = tmp1 * tmp2;
>>>>>>> +   x = tmp3;
>>>>>>> +   r1 = tmp1;
>>>>>>> +   r2 = tmp2;
>>>>>>> +   depending on the uses of x, r1, r2.  This removes one multiplication
>>>>>>> and
>>>>>>> +   allows the sqrt and division operations to execute in parallel.
>>>>>>> +   DEF_GSI is the gsi of the initial division by sqrt that defines
>>>>>>> +   DEF (x in the example abovs).  */
>>>>>>> +
>>>>>>> +static void
>>>>>>> +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
>>>>>>> +{
>>>>>>> +  use_operand_p use_p;
>>>>>>> +  imm_use_iterator use_iter;
>>>>>>> +  gimple *stmt = gsi_stmt (*def_gsi);
>>>>>>> +  tree x = def;
>>>>>>> +  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
>>>>>>> +  tree div_rhs1 = gimple_assign_rhs1 (stmt);
>>>>>>> +
>>>>>>> +  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
>>>>>>> +      || TREE_CODE (div_rhs1) != REAL_CST
>>>>>>> +      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
>>>>>>> +    return;
>>>>>>> +
>>>>>>> +  gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name);
>>>>>>> +  if (!is_gimple_call (sqrt_stmt)
>>>>>>> +      || !gimple_call_lhs (sqrt_stmt))
>>>>>>> +    return;
>>>>>>> +
>>>>>>> +  gcall *call = as_a <gcall *> (sqrt_stmt);
>>>>>> Very minor, but:
>>>>>>
>>>>>>     gcall *sqrt_stmt
>>>>>>       = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
>>>>>>     if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
>>>>>>       return;
>>>>>>
>>>>>> would avoid the need for the separate as_a<>, and would mean that
>>>>>> we only call gimple_call_* on gcalls.
>>>>> Ok.
>>>>>
>>>>>>> +  if (has_other_use)
>>>>>>> +    {
>>>>>>> +      /* Using the two temporaries tmp1, tmp2 from above
>>>>>>> +     the original x is now:
>>>>>>> +     x = tmp1 * tmp2.  */
>>>>>>> +      gcc_assert (mult_ssa_name);
>>>>>>> +      gcc_assert (sqr_ssa_name);
>>>>>>> +      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
>>>>>>> +
>>>>>>> +      tree new_ssa_name
>>>>>>> +    = make_temp_ssa_name (TREE_TYPE (a), NULL,
>>>>>>> "recip_sqrt_transformed");
>>>>>>> +      gimple *new_stmt
>>>>>>> +    = gimple_build_assign (new_ssa_name, MULT_EXPR,
>>>>>>> +                   mult_ssa_name, sqr_ssa_name);
>>>>>>> +      gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT);
>>>>>>> +      gcc_assert (gsi_stmt (gsi2) == stmt);
>>>>>>> +      gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name);
>>>>>>> +      fold_stmt (&gsi2);
>>>>>>> +      update_stmt (stmt);
>>>>>> In this case we're replacing the statement in its original position,
>>>>>> so there's no real need to use a temporary.  It seems better to
>>>>>> change the rhs_code, rhs1 and rhs2 of stmt in-place, with the same
>>>>>> lhs as before.
>>>>> Yes, that's cleaner.
>>>>>
>>>>>>> @@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun)
>>>>>>>          if (optimize_bb_for_size_p (bb))
>>>>>>>            continue;
>>>>>> Seems unnecessary to skip the new optimisation when optimising for size.
>>>>>> Like you say, it saves a multiplication overall. Also:
>>>>> Indeed.
>>>>>
>>>>>>> +      if (flag_unsafe_math_optimizations)
>>>>>>> +    {
>>>>>>> +      for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
>>>>>>> +           !gsi_end_p (gsi);
>>>>>>> +           gsi_next (&gsi))
>>>>>>> +        {
>>>>>>> +          gimple *stmt = gsi_stmt (gsi);
>>>>>>> +
>>>>>>> +          if (gimple_has_lhs (stmt)
>>>>>>> +          && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
>>>>>>> +          && FLOAT_TYPE_P (TREE_TYPE (def))
>>>>>>> +          && TREE_CODE (def) == SSA_NAME
>>>>>>> +          && is_gimple_assign (stmt)
>>>>>>> +          && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
>>>>>>> +        optimize_recip_sqrt (&gsi, def);
>>>>>>> +        }
>>>>>>> +    }
>>>>>> It looks like this could safely be done in one of the existing walks
>>>>>> (e.g. the execute_cse_reciprocals_1 one, if we do this when optimising
>>>>>> for size).
>>>>> You're right. I've moved this into one of the walks above this.
>>>>>
>>>>> Bootstrapped and tested on aarch64-none-linux-gnu and
>>>>> x86_64-unknown-linux-gnu.
>>>>> CC'ing richi as he's reviewed these kinds of patches in the past.
>>>>>
>>>>> Is this ok for trunk?
>>> I wonder how it interacts with execute_cse_reciprocals_1 given it
>>> introduces a division 1.0 / a which will be not processed by
>>> execute_cse_reciprocals_1 given that operates by walking uses of 'a'.
>>> That may be just a missed optimization of course.
>> Hmm, I believe right now it doesn't interact with execute_cse_reciprocals_1 as
>> it's either one or the other.
>> I've left it as it is for now, but would you like us to call
>> execute_cse_reciprocals_1 on the potentially transformed division?
> I think that wouldn't "fix" it given execute_cse_reciprocals_1 works by
> seeing all reciprocal uses of 'a' so it may have alrady seen two and
> you introduced the third.  But we can leave "solving" this for
> future enhacements (I'd avoid doing two passes over the IL just because
> of said possibility right now).
>
>>> +  if (has_other_use)
>>> +    {
>>> +      /* Using the two temporaries tmp1, tmp2 from above
>>> +        the original x is now:
>>> +        x = tmp1 * tmp2.  */
>>> +      gcc_assert (mult_ssa_name);
>>> +      gcc_assert (sqr_ssa_name);
>>> +
>>> +      gimple_assign_set_rhs1 (stmt, mult_ssa_name);
>>> +      gimple_assign_set_rhs2 (stmt, sqr_ssa_name);
>>> +      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
>>> +      fold_stmt_inplace (def_gsi);
>>> +      update_stmt (stmt);
>>> +    }
>>>
>>> so you are leaving the original stmt in place unchanged even if it is
>>> not used?  Why?  Note that with -fno-call-exceptions this stmt may
>>> throw, so you should arrange to code-generate 1./a in place of the
>>> original division to preserve EH behavior.  Watch out because then
>>> with other uses you have to find a place to insert its computation.
>> Ok. These are oversights on my part. I've updated the patch to modify the
>> original division in place. The multiplication is placed after it.
> If the division throws internally you can't insert after it, you
> have to insert on the single non-EH edge outgoing from the BB
> with the division.  Just try a C++ testcase with -fnon-call-exceptions
> and a try {} catch(...) {} around.
>
> Not sure if it's worth to handle so bailing out if
> stmt_can_throw_internal (stmt) would be an option as well.
>
>>> +      if (is_square_of (stmt2, x))
>>> +       {
>>> +         if (!sqr_stmts.contains (stmt2))
>>> +           sqr_stmts.safe_push (stmt2);
>>> +       }
>>>
>>> this is quadratic in the number of square stmts...  please consider
>>> making sqr_stmts a bitmap of SSA defs (so the stmt you have now
>>> is then SSA_NAME_DEF_STMT (ssa_name (bitmap-element))).
>>>
>> Done. In practice I didn't see there being more than one such use, I expect
>> them
>> to be CSE'd, but maybe if they're in different basic blocks...
> Oh, it just occured to me you could use FOR_EACH_IMM_USE_STMT ()
> which will only get you each stmt once... ;)  Sorry for the misleading
> bitmap suggestion.
>
>>> You do not seem to restrict placement of the use stmts but insert
>>> before the 1/sqrt(a) stmt.  That possibly hoists the multiplications
>>> where they are not needed.  Consider
>>>
>>>    x = 1./sqrt(a);
>>>    if (l)
>>>      l1 = x * 3.;
>>>    else if (l2)
>>>      l1 = x * x;
>>>    else if (l3)
>>>      l1 = a * x;
>>>
>>> or similar where on the path to x * 3. you now perform two extra
>>> multiplications.
>>>
>> Ok, I've restricted the optimisation somewhat.
>> Now if there's an other use it won't perform the transformation unless there
>> is already
>> a multiplication present on the main path. That way it won't introduce
>> multiplications
>> on paths where there aren't any.
> OK.
>
>>> Your testcases do not cover the case of other uses at all.  Or of
>>> EH.
>> gcc.dg/recip_sqrt_mult_1.c should handle the other uses case as tmp is a
>> global
>> being written to. I've added more testcases involving uses in different basic
>> blocks
>> and a g++.dg testcase with -fnon-call-exceptions.
>> I'm not sure what functionality to test though apart from that it doesn't ICE
>> and does
>> the transformations I expect.
> That's good enough I guess.  I see you do have a testcase with
> try/catch so I wonder why foo4 doesn't ICE when you insert after
> the throwing stmt...  maybe it doesn't throw?  Ah - they probably
> fail your new mult_on_main_path test?

Yeah. I did manage to reproduce an ICE eventually by removing that check
and enabling -ftrapping-math.

I think I'll just not enter this at all if stmt_can_throw_internal (stmt), like you suggested.
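
Concretely, the dispatch in pass_cse_reciprocals::execute would then
become something like:

  if (flag_unsafe_math_optimizations
      && is_gimple_assign (stmt)
      && !stmt_can_throw_internal (stmt)
      && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
    optimize_recip_sqrt (&gsi, def);
  else
    execute_cse_reciprocals_1 (&gsi, def);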

Thanks,
Kyrill

> Richard.
>
>> Bootstrapped and tested on aarch64-none-linux-gnu.
>> Is this version better?
>>
>> Thanks,
>> Kyrill
>>
>>
>> 2018-09-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>
>>      * tree-ssa-math-opts.c (is_mult_by): New function.
>>      (is_square_of): Use the above.
>>      (optimize_recip_sqrt): New function.
>>      (pass_cse_reciprocals::execute): Use the above.
>>
>> 2018-09-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>
>>      * gcc.dg/recip_sqrt_mult_1.c: New test.
>>      * gcc.dg/recip_sqrt_mult_2.c: Likewise.
>>      * gcc.dg/recip_sqrt_mult_3.c: Likewise.
>>      * gcc.dg/recip_sqrt_mult_4.c: Likewise.
>>      * gcc.dg/recip_sqrt_mult_5.c: Likewise.
>>      * g++.dg/recip_sqrt_mult_1.c: Likewise.
>>
Kyrill Tkachov Sept. 5, 2018, 11:38 a.m. UTC | #8
On 04/09/18 17:52, Kyrill Tkachov wrote:
 >
 > On 04/09/18 15:31, Richard Biener wrote:
 >> On Tue, 4 Sep 2018, Kyrill Tkachov wrote:
 >>> Hi Richard,
 >>>
 >>> On 31/08/18 12:07, Richard Biener wrote:
 >>>> On Thu, 30 Aug 2018, Kyrill Tkachov wrote:
 >>>>> Ping.
 >>>>>
 >>>>> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg01496.html
 >>>>>
 >>>>> Thanks,
 >>>>> Kyrill
 >>>>>
 >>>>> On 23/08/18 18:09, Kyrill Tkachov wrote:
 >>>>>> Hi Richard,
 >>>>>>
 >>>>>> On 23/08/18 11:13, Richard Sandiford wrote:
 >>>>>>> Kyrill  Tkachov <kyrylo.tkachov@foss.arm.com> writes:
 >>>>>>>> Hi all,
 >>>>>>>>
 >>>>>>>> This patch aims to optimise sequences involving uses of 1.0 / sqrt (a)
 >>>>>>>> under -freciprocal-math and -funsafe-math-optimizations.
 >>>>>>>> In particular consider:
 >>>>>>>>
 >>>>>>>> x = 1.0 / sqrt (a);
 >>>>>>>> r1 = x * x;  // same as 1.0 / a
 >>>>>>>> r2 = a * x; // same as sqrt (a)
 >>>>>>>>
 >>>>>>>> If x, r1 and r2 are all used further on in the code, this can be
 >>>>>>>> transformed into:
 >>>>>>>> tmp1 = 1.0 / a
 >>>>>>>> tmp2 = sqrt (a)
 >>>>>>>> tmp3 = tmp1 * tmp2
 >>>>>>>> x = tmp3
 >>>>>>>> r1 = tmp1
 >>>>>>>> r2 = tmp2
 >>>>>>> Nice optimisation :-)  Someone who knows the pass better should review,
 >>>>>>> but:
 >>>>>> Thanks for the review.
 >>>>>>> There seems to be an implicit assumption that this is a win even
 >>>>>>> when the r1 and r2 assignments are only conditionally executed.
 >>>>>>> That's probably true, but it might be worth saying explicitly.
 >>>>>> I'll admit I had not considered that case.
 >>>>>> I think it won't make a difference in practice, as the really expensive
 >>>>>> operations here
 >>>>>> are the sqrt and the division and they are on the executed path in either
 >>>>>> case and them
 >>>>>> becoming independent should be a benefit of its own.
 >>>>>>>> +/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
 >>>>>>>> +
 >>>>>>>> +static inline bool
 >>>>>>>> +is_mult_by (gimple *use_stmt, tree def, tree a)
 >>>>>>>> +{
 >>>>>>>> +  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
 >>>>>>>> +      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
 >>>>>>>> +    {
 >>>>>>>> +      tree op0 = gimple_assign_rhs1 (use_stmt);
 >>>>>>>> +      tree op1 = gimple_assign_rhs2 (use_stmt);
 >>>>>>>> +
 >>>>>>>> +      return (op0 == def && op1 == a)
 >>>>>>>> +          || (op0 == a && op1 == def);
 >>>>>>>> +    }
 >>>>>>>> +  return 0;
 >>>>>>>> +}
 >>>>>>> Seems like is_square_of could now be a light-weight wrapper around this.
 >>>>>> Indeed, I've done the wrapping now.
 >>>>>>>> @@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator
 >>>>>>>> *def_gsi, tree def)
 >>>>>>>>      occ_head = NULL;
 >>>>>>>>    }
 >>>>>>>>    +/* Transform sequences like
 >>>>>>>> +   x = 1.0 / sqrt (a);
 >>>>>>>> +   r1 = x * x;
 >>>>>>>> +   r2 = a * x;
 >>>>>>>> +   into:
 >>>>>>>> +   tmp1 = 1.0 / a;
 >>>>>>>> +   tmp2 = sqrt (a);
 >>>>>>>> +   tmp3 = tmp1 * tmp2;
 >>>>>>>> +   x = tmp3;
 >>>>>>>> +   r1 = tmp1;
 >>>>>>>> +   r2 = tmp2;
 >>>>>>>> +   depending on the uses of x, r1, r2.  This removes one multiplication
 >>>>>>>> and
 >>>>>>>> +   allows the sqrt and division operations to execute in parallel.
 >>>>>>>> +   DEF_GSI is the gsi of the initial division by sqrt that defines
 >>>>>>>> +   DEF (x in the example abovs). */
 >>>>>>>> +
 >>>>>>>> +static void
 >>>>>>>> +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
 >>>>>>>> +{
 >>>>>>>> +  use_operand_p use_p;
 >>>>>>>> +  imm_use_iterator use_iter;
 >>>>>>>> +  gimple *stmt = gsi_stmt (*def_gsi);
 >>>>>>>> +  tree x = def;
 >>>>>>>> +  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
 >>>>>>>> +  tree div_rhs1 = gimple_assign_rhs1 (stmt);
 >>>>>>>> +
 >>>>>>>> +  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
 >>>>>>>> +      || TREE_CODE (div_rhs1) != REAL_CST
 >>>>>>>> +      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
 >>>>>>>> +    return;
 >>>>>>>> +
 >>>>>>>> +  gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name);
 >>>>>>>> +  if (!is_gimple_call (sqrt_stmt)
 >>>>>>>> +      || !gimple_call_lhs (sqrt_stmt))
 >>>>>>>> +    return;
 >>>>>>>> +
 >>>>>>>> +  gcall *call = as_a <gcall *> (sqrt_stmt);
 >>>>>>> Very minor, but:
 >>>>>>>
 >>>>>>>     gcall *sqrt_stmt
 >>>>>>>       = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
 >>>>>>>     if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
 >>>>>>>       return;
 >>>>>>>
 >>>>>>> would avoid the need for the separate as_a<>, and would mean that
 >>>>>>> we only call gimple_call_* on gcalls.
 >>>>>> Ok.
 >>>>>>>> +  if (has_other_use)
 >>>>>>>> +    {
 >>>>>>>> +      /* Using the two temporaries tmp1, tmp2 from above
 >>>>>>>> +     the original x is now:
 >>>>>>>> +     x = tmp1 * tmp2.  */
 >>>>>>>> +      gcc_assert (mult_ssa_name);
 >>>>>>>> +      gcc_assert (sqr_ssa_name);
 >>>>>>>> +      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
 >>>>>>>> +
 >>>>>>>> +      tree new_ssa_name
 >>>>>>>> +    = make_temp_ssa_name (TREE_TYPE (a), NULL,
 >>>>>>>> "recip_sqrt_transformed");
 >>>>>>>> +      gimple *new_stmt
 >>>>>>>> +    = gimple_build_assign (new_ssa_name, MULT_EXPR,
 >>>>>>>> +                   mult_ssa_name, sqr_ssa_name);
 >>>>>>>> +      gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT);
 >>>>>>>> +      gcc_assert (gsi_stmt (gsi2) == stmt);
 >>>>>>>> + gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name);
 >>>>>>>> +      fold_stmt (&gsi2);
 >>>>>>>> +      update_stmt (stmt);
 >>>>>>> In this case we're replacing the statement in its original position,
 >>>>>>> so there's no real need to use a temporary.  It seems better to
 >>>>>>> change the rhs_code, rhs1 and rhs2 of stmt in-place, with the same
 >>>>>>> lhs as before.
 >>>>>> Yes, that's cleaner.
 >>>>>>>> @@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun)
 >>>>>>>>          if (optimize_bb_for_size_p (bb))
 >>>>>>>>            continue;
 >>>>>>> Seems unnecessary to skip the new optimisation when optimising for size.
 >>>>>>> Like you say, it saves a multiplication overall. Also:
 >>>>>> Indeed.
 >>>>>>>> +      if (flag_unsafe_math_optimizations)
 >>>>>>>> +    {
 >>>>>>>> +      for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
 >>>>>>>> +           !gsi_end_p (gsi);
 >>>>>>>> +           gsi_next (&gsi))
 >>>>>>>> +        {
 >>>>>>>> +          gimple *stmt = gsi_stmt (gsi);
 >>>>>>>> +
 >>>>>>>> +          if (gimple_has_lhs (stmt)
 >>>>>>>> +          && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 >>>>>>>> +          && FLOAT_TYPE_P (TREE_TYPE (def))
 >>>>>>>> +          && TREE_CODE (def) == SSA_NAME
 >>>>>>>> +          && is_gimple_assign (stmt)
 >>>>>>>> +          && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 >>>>>>>> +        optimize_recip_sqrt (&gsi, def);
 >>>>>>>> +        }
 >>>>>>>> +    }
 >>>>>>> It looks like this could safely be done in one of the existing walks
 >>>>>>> (e.g. the execute_cse_reciprocals_1 one, if we do this when optimising
 >>>>>>> for size).
 >>>>>> You're right. I've moved this into one of the walks above this.
 >>>>>>
 >>>>>> Bootstrapped and tested on aarch64-none-linux-gnu and
 >>>>>> x86_64-unknown-linux-gnu.
 >>>>>> CC'ing richi as he's reviewed these kinds of patches in the past.
 >>>>>>
 >>>>>> Is this ok for trunk?
 >>>> I wonder how it interacts with execute_cse_reciprocals_1 given it
 >>>> introduces a division 1.0 / a which will be not processed by
 >>>> execute_cse_reciprocals_1 given that operates by walking uses of 'a'.
 >>>> That may be just a missed optimization of course.
 >>> Hmm, I believe right now it doesn't interact with execute_cse_reciprocals_1 as
 >>> it's either one or the other.
 >>> I've left it as it is for now, but would you like us to call
 >>> execute_cse_reciprocals_1 on the potentially transformed division?
 >> I think that wouldn't "fix" it given execute_cse_reciprocals_1 works by
 >> seeing all reciprocal uses of 'a' so it may have alrady seen two and
 >> you introduced the third.  But we can leave "solving" this for
 >> future enhacements (I'd avoid doing two passes over the IL just because
 >> of said possibility right now).
 >>>> +  if (has_other_use)
 >>>> +    {
 >>>> +      /* Using the two temporaries tmp1, tmp2 from above
 >>>> +        the original x is now:
 >>>> +        x = tmp1 * tmp2.  */
 >>>> +      gcc_assert (mult_ssa_name);
 >>>> +      gcc_assert (sqr_ssa_name);
 >>>> +
 >>>> +      gimple_assign_set_rhs1 (stmt, mult_ssa_name);
 >>>> +      gimple_assign_set_rhs2 (stmt, sqr_ssa_name);
 >>>> +      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 >>>> +      fold_stmt_inplace (def_gsi);
 >>>> +      update_stmt (stmt);
 >>>> +    }
 >>>>
 >>>> so you are leaving the original stmt in place unchanged even if it is
 >>>> not used?  Why?  Note that with -fno-call-exceptions this stmt may
 >>>> throw, so you should arrange to code-generate 1./a in place of the
 >>>> original division to preserve EH behavior.  Watch out because then
 >>>> with other uses you have to find a place to insert its computation.
 >>> Ok. These are oversights on my part. I've updated the patch to modify the
 >>> original division in place. The multiplication is placed after it.
 >> If the division throws internally you can't insert after it, you
 >> have to insert on the single non-EH edge outgoing from the BB
 >> with the division.  Just try a C++ testcase with -fnon-call-exceptions
 >> and a try {} catch(...) {} around.
 >>
 >> Not sure if it's worth to handle so bailing out if
 >> stmt_can_throw_internal (stmt) would be an option as well.
 >>>> +      if (is_square_of (stmt2, x))
 >>>> +       {
 >>>> +         if (!sqr_stmts.contains (stmt2))
 >>>> +           sqr_stmts.safe_push (stmt2);
 >>>> +       }
 >>>>
 >>>> this is quadratic in the number of square stmts... please consider
 >>>> making sqr_stmts a bitmap of SSA defs (so the stmt you have now
 >>>> is then SSA_NAME_DEF_STMT (ssa_name (bitmap-element))).
 >>> Done. In practice I didn't see there being more than one such use, I expect
 >>> them
 >>> to be CSE'd, but maybe if they're in different basic blocks...
 >> Oh, it just occured to me you could use FOR_EACH_IMM_USE_STMT ()
 >> which will only get you each stmt once... ;)  Sorry for the misleading
 >> bitmap suggestion.
 >>>> You do not seem to restrict placement of the use stmts but insert
 >>>> before the 1/sqrt(a) stmt.  That possibly hoists the multiplications
 >>>> where they are not needed.  Consider
 >>>>
 >>>>    x = 1./sqrt(a);
 >>>>    if (l)
 >>>>      l1 = x * 3.;
 >>>>    else if (l2)
 >>>>      l1 = x * x;
 >>>>    else if (l3)
 >>>>      l1 = a * x;
 >>>>
 >>>> or similar where on the path to x * 3. you now perform two extra
 >>>> multiplications.
 >>> Ok, I've restricted the optimisation somewhat.
 >>> Now if there's an other use it won't perform the transformation unless there
 >>> is already
 >>> a multiplication present on the main path. That way it won't introduce
 >>> multiplications
 >>> on paths where there aren't any.
 >> OK.
 >>>> Your testcases do not cover the case of other uses at all.  Or of
 >>>> EH.
 >>> gcc.dg/recip_sqrt_mult_1.c should handle the other uses case as tmp is a
 >>> global
 >>> being written to. I've added more testcases involving uses in different basic
 >>> blocks
 >>> and a g++.dg testcase with -fnon-call-exceptions.
 >>> I'm not sure what functionality to test though apart from that it doesn't ICE
 >>> and does
 >>> the transformations I expect.
 >> That's good enough I guess.  I see you do have a testcase with
 >> try/catch so I wonder why foo4 doesn't ICE when you insert after
 >> the throwing stmt...  maybe it doesn't throw?  Ah - they probably
 >> fail your new mult_on_main_path test?
 >
 > Yeah. I did manage to reproduce an ICE eventually by removing that check
 > and enabling -ftrapping-math.
 >
 > I think I'll just not enter this at all if stmt_can_throw_internal (stmt) like you suggested.


And here is the version that does that. It also reverts to using a vector for the sqr_stmts for consistency
and uses FOR_EACH_IMM_USE_STMT to iterate over the use statements.

Bootstrapped and tested on aarch64-none-linux-gnu.

Ok?

Thanks,
Kyrill

2018-09-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * tree-ssa-math-opts.c (is_mult_by): New function.
     (is_square_of): Use the above.
     (optimize_recip_sqrt): New function.
     (pass_cse_reciprocals::execute): Use the above.

2018-09-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * gcc.dg/recip_sqrt_mult_1.c: New test.
     * gcc.dg/recip_sqrt_mult_2.c: Likewise.
     * gcc.dg/recip_sqrt_mult_3.c: Likewise.
     * gcc.dg/recip_sqrt_mult_4.c: Likewise.
     * gcc.dg/recip_sqrt_mult_5.c: Likewise.
     * g++.dg/recip_sqrt_mult_1.C: Likewise.
     * g++.dg/recip_sqrt_mult_2.C: Likewise.
diff --git a/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C
new file mode 100644
index 0000000000000000000000000000000000000000..11d9c6f758f1529d8ed4cadf85010f6ce379c195
--- /dev/null
+++ b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fnon-call-exceptions -fdump-tree-recip" } */
+
+double res, res2, tmp;
+void
+foo1 (double a, double b)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+    res = tmp * tmp;
+    res2 = a * tmp;
+  }
+  catch (...)
+    { ; }
+}
+
+void
+foo4 (double a, double b, int c, int d)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+  }
+  catch (...)
+    {
+      if (c)
+	res = tmp * tmp;
+
+      if (d)
+	res2 = a * tmp;
+    }
+}
+
+void
+foo5 (double a, double b, int c, int d)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+    res = tmp * tmp;
+
+    if (d)
+      res2 = a * tmp;
+  }
+  catch (...)
+    { ; }
+}
+
+/* { dg-final { scan-tree-dump-times "Optimizing reciprocal sqrt multiplications" 2 "recip" } } */
+/* { dg-final { scan-tree-dump-times "Replacing squaring multiplication" 2 "recip" } } */
+/* { dg-final { scan-tree-dump-times "Replacing original division" 2 "recip" } } */
diff --git a/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C b/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C
new file mode 100644
index 0000000000000000000000000000000000000000..cca12caf4d79bfa69933d9b8fce41f38bb5b7a19
--- /dev/null
+++ b/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-options "-w -Ofast -fnon-call-exceptions -ftrapping-math -fdump-tree-recip" } */
+
+/* Check that the recip_sqrt optimization does not trigger here; if it
+   did, it would ICE due to the EH info.  */
+
+
+double res, res2, tmp;
+void
+foo1 (double a, double b)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+    res = tmp * tmp;
+    res2 = a * tmp;
+  }
+  catch (...)
+    { ; }
+}
+
+void
+foo4 (double a, double b, int c, int d)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+  }
+  catch (...)
+    {
+      if (c)
+	res = tmp * tmp;
+
+      if (d)
+	res2 = a * tmp;
+    }
+}
+
+void
+foo5 (double a, double b, int c, int d)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+    res = tmp * tmp;
+
+    if (d)
+      res2 = a * tmp;
+  }
+  catch (...)
+    { ; }
+}
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..188390a4ecffca1b21f05c4f19abecfb7ebd188f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+double res, res2, tmp;
+void
+foo (double a, double b)
+{
+  tmp = 1.0 / __builtin_sqrt (a);
+  res = tmp * tmp;
+  res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..c5fc3de7b657b1769e76254b4bc874e0595e43ef
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+float
+foo (float a)
+{
+  float tmp = 1.0f / __builtin_sqrtf (a);
+  return a * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not " / " "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..e7d185ba7e22cfc7cca72296d5ccc544f24fdb14
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+double
+foo (double a)
+{
+  double tmp = 1.0f / __builtin_sqrt (a);
+  return tmp * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not "__builtin_sqrt" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..e3005f2feb6f4bacbb6eafc0155e196cb866fcdf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+/* The main path doesn't have any multiplications.
+   Avoid introducing them in the recip pass.  */
+
+double res, res2, tmp;
+void
+foo (double a, double b, int c, int d)
+{
+  tmp = 1.0 / __builtin_sqrt (a);
+  if (c)
+    res = tmp * tmp;
+
+  if (d)
+    res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump-not "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump-not "Replacing original division" "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c
new file mode 100644
index 0000000000000000000000000000000000000000..e871f0fcd4feb1687f9815e4babf4d0667a15ea8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+/* We want to do the recip_sqrt transformations here since there is
+   already a multiplication on the main path.  */
+
+double res, res2, tmp;
+void
+foo (double a, double b, int c, int d)
+{
+  tmp = 1.0 / __builtin_sqrt (a);
+  res = tmp * tmp;
+
+  if (d)
+    res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 25378da6f4ab27dbce51e10461efc18cc416a4d7..19bff5c3c3715f3e194e1c091ae01f6c4d6d2ce8 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -337,9 +337,9 @@ is_division_by (gimple *use_stmt, tree def)
 	 && gimple_assign_rhs1 (use_stmt) != def;
 }
 
-/* Return whether USE_STMT is DEF * DEF.  */
+/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
 static inline bool
-is_square_of (gimple *use_stmt, tree def)
+is_mult_by (gimple *use_stmt, tree def, tree a)
 {
   if (gimple_code (use_stmt) == GIMPLE_ASSIGN
       && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
@@ -347,11 +347,19 @@ is_square_of (gimple *use_stmt, tree def)
       tree op0 = gimple_assign_rhs1 (use_stmt);
       tree op1 = gimple_assign_rhs2 (use_stmt);
 
-      return op0 == op1 && op0 == def;
+      return (op0 == def && op1 == a)
+	      || (op0 == a && op1 == def);
     }
   return 0;
 }
 
+/* Return whether USE_STMT is DEF * DEF.  */
+static inline bool
+is_square_of (gimple *use_stmt, tree def)
+{
+  return is_mult_by (use_stmt, def, def);
+}
+
 /* Return whether USE_STMT is a floating-point division by
    DEF * DEF.  */
 static inline bool
@@ -526,6 +534,188 @@ free_bb (struct occurrence *occ)
     }
 }
 
+/* Transform sequences like
+   t = sqrt (a)
+   x = 1.0 / t;
+   r1 = x * x;
+   r2 = a * x;
+   into:
+   t = sqrt (a)
+   r1 = 1.0 / a;
+   r2 = t;
+   x = r1 * r2;
+   depending on the uses of x, r1, r2.  This removes one multiplication and
+   allows the sqrt and division operations to execute in parallel.
+   DEF_GSI is the gsi of the initial division by sqrt that defines
+   DEF (x in the example above).  */
+
+static void
+optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
+{
+  gimple *use_stmt;
+  imm_use_iterator use_iter;
+  gimple *stmt = gsi_stmt (*def_gsi);
+  tree x = def;
+  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
+  tree div_rhs1 = gimple_assign_rhs1 (stmt);
+
+  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
+      || TREE_CODE (div_rhs1) != REAL_CST
+      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
+    return;
+
+  gcall *sqrt_stmt
+    = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
+
+  if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
+    return;
+
+  switch (gimple_call_combined_fn (sqrt_stmt))
+    {
+    CASE_CFN_SQRT:
+    CASE_CFN_SQRT_FN:
+      break;
+
+    default:
+      return;
+    }
+  tree a = gimple_call_arg (sqrt_stmt, 0);
+
+  /* We have 'a' and 'x'.  Now analyze the uses of 'x'.  */
+
+  /* Statements that use x in x * x.  */
+  auto_vec<gimple *> sqr_stmts;
+  /* Statements that use x in a * x.  */
+  auto_vec<gimple *> mult_stmts;
+  bool has_other_use = false;
+  bool mult_on_main_path = false;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x)
+    {
+      if (is_gimple_debug (use_stmt))
+	continue;
+      if (is_square_of (use_stmt, x))
+	{
+	  sqr_stmts.safe_push (use_stmt);
+	  if (gimple_bb (use_stmt) == gimple_bb (stmt))
+	    mult_on_main_path = true;
+	}
+      else if (is_mult_by (use_stmt, x, a))
+	{
+	  mult_stmts.safe_push (use_stmt);
+	  if (gimple_bb (use_stmt) == gimple_bb (stmt))
+	    mult_on_main_path = true;
+	}
+      else
+	has_other_use = true;
+    }
+
+  /* In the x * x and a * x cases we just rewire stmt operands or
+     remove multiplications.  In the has_other_use case we introduce
+     a multiplication so make sure we don't introduce a multiplication
+     on a path where there was none.  */
+  if (has_other_use && !mult_on_main_path)
+    return;
+
+  if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
+    return;
+
+  /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
+     to be able to compose it from the sqr and mult cases.  */
+  if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
+    return;
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
+      print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
+      print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+      fprintf (dump_file, "\n");
+    }
+
+  bool delete_div = !has_other_use;
+  tree sqr_ssa_name = NULL_TREE;
+  if (!sqr_stmts.is_empty ())
+    {
+      /* r1 = x * x.  Transform the original
+	 x = 1.0 / t
+	 into
+	 tmp1 = 1.0 / a
+	 r1 = tmp1.  */
+
+      sqr_ssa_name
+	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");
+
+      if (dump_file)
+	{
+	  fprintf (dump_file, "Replacing original division\n");
+	  print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+	  fprintf (dump_file, "with new division\n");
+	}
+      gimple_assign_set_lhs (stmt, sqr_ssa_name);
+      gimple_assign_set_rhs2 (stmt, a);
+      fold_stmt_inplace (def_gsi);
+      update_stmt (stmt);
+
+      if (dump_file)
+	print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+
+      delete_div = false;
+      gimple *sqr_stmt;
+      unsigned int i;
+      FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
+	{
+	  gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
+	  gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
+	  update_stmt (sqr_stmt);
+	}
+    }
+  if (!mult_stmts.is_empty ())
+    {
+      /* r2 = a * x.  Transform this into:
+	 r2 = t (The original sqrt (a)).  */
+      unsigned int i;
+      gimple *mult_stmt = NULL;
+      FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
+	{
+	  gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);
+
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Replacing squaring multiplication\n");
+	      print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
+	      fprintf (dump_file, "with assignment\n");
+	    }
+	  gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
+	  fold_stmt_inplace (&gsi2);
+	  update_stmt (mult_stmt);
+	  if (dump_file)
+	    print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
+	}
+    }
+
+  if (has_other_use)
+    {
+      /* Using the two temporaries tmp1, tmp2 from above
+	 the original x is now:
+	 x = tmp1 * tmp2.  */
+      gcc_assert (orig_sqrt_ssa_name);
+      gcc_assert (sqr_ssa_name);
+
+      gimple *new_stmt
+	= gimple_build_assign (x, MULT_EXPR,
+				orig_sqrt_ssa_name, sqr_ssa_name);
+      gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
+      update_stmt (stmt);
+    }
+  else if (delete_div)
+    {
+      /* Remove the original division.  */
+      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
+      gsi_remove (&gsi2, true);
+      release_defs (stmt);
+    }
+}
 
 /* Look for floating-point divisions among DEF's uses, and try to
    replace them by multiplications with the reciprocal.  Add
@@ -756,7 +946,15 @@ pass_cse_reciprocals::execute (function *fun)
 	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 	      && FLOAT_TYPE_P (TREE_TYPE (def))
 	      && TREE_CODE (def) == SSA_NAME)
-	    execute_cse_reciprocals_1 (&gsi, def);
+	    {
+	      if (flag_unsafe_math_optimizations
+		  && is_gimple_assign (stmt)
+		  && !stmt_can_throw_internal (stmt)
+		  && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
+		optimize_recip_sqrt (&gsi, def);
+	      else
+		execute_cse_reciprocals_1 (&gsi, def);
+	    }
 	}
 
       if (optimize_bb_for_size_p (bb))
Richard Biener Sept. 5, 2018, 11:42 a.m. UTC | #9
On Wed, 5 Sep 2018, Kyrill Tkachov wrote:

> On 04/09/18 17:52, Kyrill Tkachov wrote:
> >
> > On 04/09/18 15:31, Richard Biener wrote:
> >> On Tue, 4 Sep 2018, Kyrill Tkachov wrote:
> >>> Hi Richard,
> >>>
> >>> On 31/08/18 12:07, Richard Biener wrote:
> >>>> On Thu, 30 Aug 2018, Kyrill Tkachov wrote:
> >>>>> Ping.
> >>>>>
> >>>>> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg01496.html
> >>>>>
> >>>>> Thanks,
> >>>>> Kyrill
> >>>>>
> >>>>> On 23/08/18 18:09, Kyrill Tkachov wrote:
> >>>>>> Hi Richard,
> >>>>>>
> >>>>>> On 23/08/18 11:13, Richard Sandiford wrote:
> >>>>>>> Kyrill  Tkachov <kyrylo.tkachov@foss.arm.com> writes:
> >>>>>>>> Hi all,
> >>>>>>>>
> >>>>>>>> This patch aims to optimise sequences involving uses of 1.0 / sqrt
> (a)
> >>>>>>>> under -freciprocal-math and -funsafe-math-optimizations.
> >>>>>>>> In particular consider:
> >>>>>>>>
> >>>>>>>> x = 1.0 / sqrt (a);
> >>>>>>>> r1 = x * x;  // same as 1.0 / a
> >>>>>>>> r2 = a * x; // same as sqrt (a)
> >>>>>>>>
> >>>>>>>> If x, r1 and r2 are all used further on in the code, this can be
> >>>>>>>> transformed into:
> >>>>>>>> tmp1 = 1.0 / a
> >>>>>>>> tmp2 = sqrt (a)
> >>>>>>>> tmp3 = tmp1 * tmp2
> >>>>>>>> x = tmp3
> >>>>>>>> r1 = tmp1
> >>>>>>>> r2 = tmp2
> >>>>>>> Nice optimisation :-)  Someone who knows the pass better should
> review,
> >>>>>>> but:
> >>>>>> Thanks for the review.
> >>>>>>> There seems to be an implicit assumption that this is a win even
> >>>>>>> when the r1 and r2 assignments are only conditionally executed.
> >>>>>>> That's probably true, but it might be worth saying explicitly.
> >>>>>> I'll admit I had not considered that case.
> >>>>>> I think it won't make a difference in practice, as the really expensive
> >>>>>> operations here
> >>>>>> are the sqrt and the division and they are on the executed path in
> either
> >>>>>> case and them
> >>>>>> becoming independent should be a benefit of its own.
> >>>>>>>> +/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
> >>>>>>>> +
> >>>>>>>> +static inline bool
> >>>>>>>> +is_mult_by (gimple *use_stmt, tree def, tree a)
> >>>>>>>> +{
> >>>>>>>> +  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
> >>>>>>>> +      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
> >>>>>>>> +    {
> >>>>>>>> +      tree op0 = gimple_assign_rhs1 (use_stmt);
> >>>>>>>> +      tree op1 = gimple_assign_rhs2 (use_stmt);
> >>>>>>>> +
> >>>>>>>> +      return (op0 == def && op1 == a)
> >>>>>>>> +          || (op0 == a && op1 == def);
> >>>>>>>> +    }
> >>>>>>>> +  return 0;
> >>>>>>>> +}
> >>>>>>> Seems like is_square_of could now be a light-weight wrapper around this.
> >>>>>> Indeed, I've done the wrapping now.
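
(For reference, the wrapping agreed on here presumably ends up looking roughly like the sketch below; this is an assumption based on the ChangeLog entry "(is_square_of): Use the above." rather than a quoted hunk.)

/* Return TRUE if USE_STMT is a square of DEF, i.e. DEF * DEF.  */

static inline bool
is_square_of (gimple *use_stmt, tree def)
{
  return is_mult_by (use_stmt, def, def);
}
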
> >>>>>>>> @@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator
> >>>>>>>> *def_gsi, tree def)
> >>>>>>>>      occ_head = NULL;
> >>>>>>>>    }
> >>>>>>>>    +/* Transform sequences like
> >>>>>>>> +   x = 1.0 / sqrt (a);
> >>>>>>>> +   r1 = x * x;
> >>>>>>>> +   r2 = a * x;
> >>>>>>>> +   into:
> >>>>>>>> +   tmp1 = 1.0 / a;
> >>>>>>>> +   tmp2 = sqrt (a);
> >>>>>>>> +   tmp3 = tmp1 * tmp2;
> >>>>>>>> +   x = tmp3;
> >>>>>>>> +   r1 = tmp1;
> >>>>>>>> +   r2 = tmp2;
> >>>>>>>> +   depending on the uses of x, r1, r2.  This removes one multiplication and
> >>>>>>>> +   allows the sqrt and division operations to execute in parallel.
> >>>>>>>> +   DEF_GSI is the gsi of the initial division by sqrt that defines
> >>>>>>>> +   DEF (x in the example above).  */
> >>>>>>>> +
> >>>>>>>> +static void
> >>>>>>>> +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
> >>>>>>>> +{
> >>>>>>>> +  use_operand_p use_p;
> >>>>>>>> +  imm_use_iterator use_iter;
> >>>>>>>> +  gimple *stmt = gsi_stmt (*def_gsi);
> >>>>>>>> +  tree x = def;
> >>>>>>>> +  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
> >>>>>>>> +  tree div_rhs1 = gimple_assign_rhs1 (stmt);
> >>>>>>>> +
> >>>>>>>> +  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
> >>>>>>>> +      || TREE_CODE (div_rhs1) != REAL_CST
> >>>>>>>> +      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
> >>>>>>>> +    return;
> >>>>>>>> +
> >>>>>>>> +  gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name);
> >>>>>>>> +  if (!is_gimple_call (sqrt_stmt)
> >>>>>>>> +      || !gimple_call_lhs (sqrt_stmt))
> >>>>>>>> +    return;
> >>>>>>>> +
> >>>>>>>> +  gcall *call = as_a <gcall *> (sqrt_stmt);
> >>>>>>> Very minor, but:
> >>>>>>>
> >>>>>>>     gcall *sqrt_stmt
> >>>>>>>       = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
> >>>>>>>     if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
> >>>>>>>       return;
> >>>>>>>
> >>>>>>> would avoid the need for the separate as_a<>, and would mean that
> >>>>>>> we only call gimple_call_* on gcalls.
> >>>>>> Ok.
> >>>>>>>> +  if (has_other_use)
> >>>>>>>> +    {
> >>>>>>>> +      /* Using the two temporaries tmp1, tmp2 from above
> >>>>>>>> +     the original x is now:
> >>>>>>>> +     x = tmp1 * tmp2.  */
> >>>>>>>> +      gcc_assert (mult_ssa_name);
> >>>>>>>> +      gcc_assert (sqr_ssa_name);
> >>>>>>>> +      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
> >>>>>>>> +
> >>>>>>>> +      tree new_ssa_name
> >>>>>>>> +    = make_temp_ssa_name (TREE_TYPE (a), NULL,
> >>>>>>>> "recip_sqrt_transformed");
> >>>>>>>> +      gimple *new_stmt
> >>>>>>>> +    = gimple_build_assign (new_ssa_name, MULT_EXPR,
> >>>>>>>> +                   mult_ssa_name, sqr_ssa_name);
> >>>>>>>> +      gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT);
> >>>>>>>> +      gcc_assert (gsi_stmt (gsi2) == stmt);
> >>>>>>>> +      gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name);
> >>>>>>>> +      fold_stmt (&gsi2);
> >>>>>>>> +      update_stmt (stmt);
> >>>>>>> In this case we're replacing the statement in its original position,
> >>>>>>> so there's no real need to use a temporary.  It seems better to
> >>>>>>> change the rhs_code, rhs1 and rhs2 of stmt in-place, with the same
> >>>>>>> lhs as before.
> >>>>>> Yes, that's cleaner.
> >>>>>>>> @@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun)
> >>>>>>>>          if (optimize_bb_for_size_p (bb))
> >>>>>>>>            continue;
> >>>>>>> Seems unnecessary to skip the new optimisation when optimising for size.
> >>>>>>> Like you say, it saves a multiplication overall. Also:
> >>>>>> Indeed.
> >>>>>>>> +      if (flag_unsafe_math_optimizations)
> >>>>>>>> +    {
> >>>>>>>> +      for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
> >>>>>>>> +           !gsi_end_p (gsi);
> >>>>>>>> +           gsi_next (&gsi))
> >>>>>>>> +        {
> >>>>>>>> +          gimple *stmt = gsi_stmt (gsi);
> >>>>>>>> +
> >>>>>>>> +          if (gimple_has_lhs (stmt)
> >>>>>>>> +          && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
> >>>>>>>> +          && FLOAT_TYPE_P (TREE_TYPE (def))
> >>>>>>>> +          && TREE_CODE (def) == SSA_NAME
> >>>>>>>> +          && is_gimple_assign (stmt)
> >>>>>>>> +          && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
> >>>>>>>> +        optimize_recip_sqrt (&gsi, def);
> >>>>>>>> +        }
> >>>>>>>> +    }
> >>>>>>> It looks like this could safely be done in one of the existing walks
> >>>>>>> (e.g. the execute_cse_reciprocals_1 one, if we do this when optimising
> >>>>>>> for size).
> >>>>>> You're right. I've moved this into one of the walks above this.
> >>>>>>
> >>>>>> Bootstrapped and tested on aarch64-none-linux-gnu and
> >>>>>> x86_64-unknown-linux-gnu.
> >>>>>> CC'ing richi as he's reviewed these kinds of patches in the past.
> >>>>>>
> >>>>>> Is this ok for trunk?
> >>>> I wonder how it interacts with execute_cse_reciprocals_1 given it
> >>>> introduces a division 1.0 / a which will be not processed by
> >>>> execute_cse_reciprocals_1 given that operates by walking uses of 'a'.
> >>>> That may be just a missed optimization of course.
> >>> Hmm, I believe right now it doesn't interact with execute_cse_reciprocals_1 as
> >>> it's either one or the other.
> >>> I've left it as it is for now, but would you like us to call
> >>> execute_cse_reciprocals_1 on the potentially transformed division?
> >> I think that wouldn't "fix" it given execute_cse_reciprocals_1 works by
> >> seeing all reciprocal uses of 'a' so it may have alrady seen two and
> >> you introduced the third.  But we can leave "solving" this for
> >> future enhacements (I'd avoid doing two passes over the IL just because
> >> of said possibility right now).
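
(To make the interaction concrete, a hypothetical input of the shape being discussed; this is an illustrative sketch, not one of the testcases in the patch:)

double
foo_interaction (double a, double b)
{
  double x = 1.0 / __builtin_sqrt (a);
  double r1 = x * x;   /* rewritten to use a fresh 1.0 / a  */
  double r2 = b / a;   /* an independent division by a  */
  return r1 + r2;
}

Here the reciprocal introduced for r1 may appear only after the uses of a have already been walked, so whether it ends up shared with the b / a reciprocal is exactly the missed optimisation being left for later.
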
> >>>> +  if (has_other_use)
> >>>> +    {
> >>>> +      /* Using the two temporaries tmp1, tmp2 from above
> >>>> +        the original x is now:
> >>>> +        x = tmp1 * tmp2.  */
> >>>> +      gcc_assert (mult_ssa_name);
> >>>> +      gcc_assert (sqr_ssa_name);
> >>>> +
> >>>> +      gimple_assign_set_rhs1 (stmt, mult_ssa_name);
> >>>> +      gimple_assign_set_rhs2 (stmt, sqr_ssa_name);
> >>>> +      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
> >>>> +      fold_stmt_inplace (def_gsi);
> >>>> +      update_stmt (stmt);
> >>>> +    }
> >>>>
> >>>> so you are leaving the original stmt in place unchanged even if it is
> >>>> not used?  Why?  Note that with -fnon-call-exceptions this stmt may
> >>>> throw, so you should arrange to code-generate 1./a in place of the
> >>>> original division to preserve EH behavior.  Watch out because then
> >>>> with other uses you have to find a place to insert its computation.
> >>> Ok. These are oversights on my part. I've updated the patch to modify the
> >>> original division in place. The multiplication is placed after it.
> >> If the division throws internally you can't insert after it, you
> >> have to insert on the single non-EH edge outgoing from the BB
> >> with the division.  Just try a C++ testcase with -fnon-call-exceptions
> >> and a try {} catch(...) {} around.
> >>
> >> Not sure if it's worth to handle so bailing out if
> >> stmt_can_throw_internal (stmt) would be an option as well.
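
(A minimal sketch of the -fnon-call-exceptions situation being discussed, with a made-up function name; with -ftrapping-math the division below may throw internally, which is why simply bailing out on stmt_can_throw_internal is the easy option:)

double res, res2;

void
foo_eh (double a)
{
  try
    {
      double x = 1.0 / __builtin_sqrt (a);
      res = x * x;
      res2 = a * x;
    }
  catch (...)
    {
      res = res2 = 0.0;
    }
}
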
> >>>> +      if (is_square_of (stmt2, x))
> >>>> +       {
> >>>> +         if (!sqr_stmts.contains (stmt2))
> >>>> +           sqr_stmts.safe_push (stmt2);
> >>>> +       }
> >>>>
> >>>> this is quadratic in the number of square stmts... please consider
> >>>> making sqr_stmts a bitmap of SSA defs (so the stmt you have now
> >>>> is then SSA_NAME_DEF_STMT (ssa_name (bitmap-element))).
> >>> Done. In practice I didn't see there being more than one such use, I expect them
> >>> to be CSE'd, but maybe if they're in different basic blocks...
> >> Oh, it just occurred to me you could use FOR_EACH_IMM_USE_STMT ()
> >> which will only get you each stmt once... ;)  Sorry for the misleading
> >> bitmap suggestion.
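
(A rough sketch of the use-collection loop with FOR_EACH_IMM_USE_STMT, mirroring the vector-based loop elsewhere in the patch; the revised hunk itself is not quoted here, so treat this as an approximation:)

  gimple *use_stmt;
  imm_use_iterator use_iter;
  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x)
    {
      if (is_gimple_debug (use_stmt))
	continue;
      /* Each user statement is visited once, so no duplicate check
	 is needed before pushing.  */
      if (is_square_of (use_stmt, x))
	sqr_stmts.safe_push (use_stmt);
      else if (is_mult_by (use_stmt, x, a))
	mult_stmts.safe_push (use_stmt);
      else
	has_other_use = true;
    }
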
> >>>> You do not seem to restrict placement of the use stmts but insert
> >>>> before the 1/sqrt(a) stmt.  That possibly hoists the multiplications
> >>>> where they are not needed.  Consider
> >>>>
> >>>>    x = 1./sqrt(a);
> >>>>    if (l)
> >>>>      l1 = x * 3.;
> >>>>    else if (l2)
> >>>>      l1 = x * x;
> >>>>    else if (l3)
> >>>>      l1 = a * x;
> >>>>
> >>>> or similar where on the path to x * 3. you now perform two extra
> >>>> multiplications.
> >>> Ok, I've restricted the optimisation somewhat.
> >>> Now if there's another use it won't perform the transformation unless there is already
> >>> a multiplication present on the main path. That way it won't introduce
> >>> multiplications
> >>> on paths where there aren't any.
> >> OK.
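
(Spelled out as a compilable unit, roughly the shape this restriction is about; hypothetical code, not one of the added recip_sqrt_mult_4.c/5.c tests:)

double
foo_cond (double a, int l, int l2, int l3)
{
  double l1 = 0.0;
  double x = 1.0 / __builtin_sqrt (a);
  if (l)
    l1 = x * 3.0;
  else if (l2)
    l1 = x * x;
  else if (l3)
    l1 = a * x;
  return l1;
}

The x * 3.0 use counts as an "other" use and there is no x * x or a * x multiplication on the unconditional path, so the restricted transformation leaves this sequence alone instead of hoisting extra multiplications in front of the branches.
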
> >>>> Your testcases do not cover the case of other uses at all.  Or of
> >>>> EH.
> >>> gcc.dg/recip_sqrt_mult_1.c should handle the other uses case as tmp is a global
> >>> being written to. I've added more testcases involving uses in different basic blocks
> >>> and a g++.dg testcase with -fnon-call-exceptions.
> >>> I'm not sure what functionality to test though apart from that it doesn't ICE and does
> >>> the transformations I expect.
> >> That's good enough I guess.  I see you do have a testcase with
> >> try/catch so I wonder why foo4 doesn't ICE when you insert after
> >> the throwing stmt...  maybe it doesn't throw?  Ah - they probably
> >> fail your new mult_on_main_path test?
> >
> > Yeah. I did manage to reproduce an ICE eventually by removing that check
> > and enabling -ftrapping-math.
> >
> > I think I'll just not enter this at all if stmt_can_throw_internal (stmt) like you suggested.
> 
> 
> And here is the version that does that. It also reverts to using a vector for
> the sqr_stmts for consistency
> and uses FOR_EACH_IMM_USE_STMT to iterate over the use statements.
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok?

OK.

Thanks,
Richard.

> Thanks,
> Kyrill
> 
> 2018-09-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 
>     * tree-ssa-math-opts.c (is_mult_by): New function.
>     (is_square_of): Use the above.
>     (optimize_recip_sqrt): New function.
>     (pass_cse_reciprocals::execute): Use the above.
> 
> 2018-09-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 
>     * gcc.dg/recip_sqrt_mult_1.c: New test.
>     * gcc.dg/recip_sqrt_mult_2.c: Likewise.
>     * gcc.dg/recip_sqrt_mult_3.c: Likewise.
>     * gcc.dg/recip_sqrt_mult_4.c: Likewise.
>     * gcc.dg/recip_sqrt_mult_5.c: Likewise.
>     * g++.dg/recip_sqrt_mult_1.C: Likewise.
>     * g++.dg/recip_sqrt_mult_2.C: Likewise.
H.J. Lu Sept. 11, 2018, 5:04 p.m. UTC | #10
On Wed, Sep 5, 2018 at 4:38 AM, Kyrill  Tkachov
<kyrylo.tkachov@foss.arm.com> wrote:
> And here is the version that does that. It also reverts to using a vector
> for the sqr_stmts for consistency
> and uses FOR_EACH_IMM_USE_STMT to iterate over the use statements.
>
> Bootstrapped and tested on aarch64-none-linux-gnu.
>
> Ok?
>
> Thanks,
> Kyrill
>
> 2018-09-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
>     * tree-ssa-math-opts.c (is_mult_by): New function.
>     (is_square_of): Use the above.
>     (optimize_recip_sqrt): New function.
>     (pass_cse_reciprocals::execute): Use the above.
>
> 2018-09-03  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
>     * gcc.dg/recip_sqrt_mult_1.c: New test.
>     * gcc.dg/recip_sqrt_mult_2.c: Likewise.
>     * gcc.dg/recip_sqrt_mult_3.c: Likewise.
>     * gcc.dg/recip_sqrt_mult_4.c: Likewise.
>     * gcc.dg/recip_sqrt_mult_5.c: Likewise.
>     * g++.dg/recip_sqrt_mult_1.C: Likewise.
>     * g++.dg/recip_sqrt_mult_2.C: Likewise.

This caused:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87283
diff mbox series

Patch

commit ea11c9eb018abf4e21c61b8a7305291b0d9a7ec8
Author: kyrtka01 <kyrylo.tkachov@arm.com>
Date:   Tue Aug 21 13:54:07 2018 +0100

    Optimise sqrt reciprocal multiplications

diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
new file mode 100644
index 0000000..86429e4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+double res, res2, tmp;
+void
+foo (double a, double b)
+{
+  tmp = 1.0 / __builtin_sqrt (a);
+  res = tmp * tmp;
+  res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing multiplication" "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
new file mode 100644
index 0000000..c5fc3de
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+float
+foo (float a)
+{
+  float tmp = 1.0f / __builtin_sqrtf (a);
+  return a * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not " / " "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
new file mode 100644
index 0000000..e7d185b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+double
+foo (double a)
+{
+  double tmp = 1.0f / __builtin_sqrt (a);
+  return tmp * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not "__builtin_sqrt" "optimized" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index a90d9d2..b25beaf 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -352,6 +352,23 @@  is_square_of (gimple *use_stmt, tree def)
   return 0;
 }
 
+/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
+
+static inline bool
+is_mult_by (gimple *use_stmt, tree def, tree a)
+{
+  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
+      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
+    {
+      tree op0 = gimple_assign_rhs1 (use_stmt);
+      tree op1 = gimple_assign_rhs2 (use_stmt);
+
+      return (op0 == def && op1 == a)
+	      || (op0 == a && op1 == def);
+    }
+  return 0;
+}
+
 /* Return whether USE_STMT is a floating-point division by
    DEF * DEF.  */
 static inline bool
@@ -652,6 +669,180 @@  execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
   occ_head = NULL;
 }
 
+/* Transform sequences like
+   x = 1.0 / sqrt (a);
+   r1 = x * x;
+   r2 = a * x;
+   into:
+   tmp1 = 1.0 / a;
+   tmp2 = sqrt (a);
+   tmp3 = tmp1 * tmp2;
+   x = tmp3;
+   r1 = tmp1;
+   r2 = tmp2;
+   depending on the uses of x, r1, r2.  This removes one multiplication and
+   allows the sqrt and division operations to execute in parallel.
+   DEF_GSI is the gsi of the initial division by sqrt that defines
+   DEF (x in the example above).  */
+
+static void
+optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
+{
+  use_operand_p use_p;
+  imm_use_iterator use_iter;
+  gimple *stmt = gsi_stmt (*def_gsi);
+  tree x = def;
+  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
+  tree div_rhs1 = gimple_assign_rhs1 (stmt);
+
+  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
+      || TREE_CODE (div_rhs1) != REAL_CST
+      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
+    return;
+
+  gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name);
+  if (!is_gimple_call (sqrt_stmt)
+      || !gimple_call_lhs (sqrt_stmt))
+    return;
+
+  gcall *call = as_a <gcall *> (sqrt_stmt);
+  switch (gimple_call_combined_fn (call))
+    {
+    CASE_CFN_SQRT:
+    CASE_CFN_SQRT_FN:
+      break;
+
+    default:
+      return;
+    }
+  tree a = gimple_call_arg (sqrt_stmt, 0);
+
+  /* We have 'a' and 'x'.  Now analyze the uses of 'x'.  */
+
+  /* Statements that use x in x * x.  */
+  auto_vec<gimple *> sqr_stmts;
+  /* Statements that use x in a * x.  */
+  auto_vec<gimple *> mult_stmts;
+  bool has_other_use = false;
+
+  FOR_EACH_IMM_USE_FAST (use_p, use_iter, x)
+    {
+      gimple *stmt2 = USE_STMT (use_p);
+      if (is_gimple_debug (stmt2))
+	continue;
+      if (is_square_of (stmt2, x))
+	{
+	  if (!sqr_stmts.contains (stmt2))
+	    sqr_stmts.safe_push (stmt2);
+	}
+      else if (is_mult_by (stmt2, x, a))
+	mult_stmts.safe_push (stmt2);
+      else
+	has_other_use = true;
+    }
+  if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
+    return;
+
+  /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
+     to be able to compose it from the sqr and mult cases.  */
+  if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
+    return;
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
+      print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
+      print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+      fprintf (dump_file, "\n");
+    }
+
+  tree mult_ssa_name = NULL_TREE;
+  tree sqr_ssa_name = NULL_TREE;
+  if (!sqr_stmts.is_empty ())
+    {
+      /* r1 = x * x.  Transform this into:
+	 tmp1 = 1.0 / a
+	 r1 = tmp1.  */
+
+      sqr_ssa_name
+	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");
+      gimple *new_stmt
+	= gimple_build_assign (sqr_ssa_name, RDIV_EXPR,
+				build_real (TREE_TYPE (a), dconst1), a);
+      gsi_insert_before (def_gsi, new_stmt, GSI_NEW_STMT);
+
+      unsigned int i;
+      gimple *sqr_stmt;
+      FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
+	{
+	  gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
+
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Replacing squaring multiplication\n");
+	      print_gimple_stmt (dump_file, sqr_stmt, 0, TDF_NONE);
+	      fprintf (dump_file, "with\n");
+	      print_gimple_stmt (dump_file, new_stmt, 0, TDF_NONE);
+	      fprintf (dump_file, "\n");
+	    }
+	  gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
+	  fold_stmt_inplace (&gsi2);
+	  update_stmt (sqr_stmt);
+	}
+    }
+  if (!mult_stmts.is_empty ())
+    {
+      /* r2 = a * x.  Transform this into:
+	 tmp2 = sqrt (a)
+	 r2 = tmp2.  */
+      mult_ssa_name
+	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_mult");
+      gimple *new_stmt
+	= gimple_build_assign (mult_ssa_name, orig_sqrt_ssa_name);
+      gsi_insert_before (def_gsi, new_stmt, GSI_NEW_STMT);
+
+      unsigned int i;
+      gimple *mult_stmt = NULL;
+      FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
+	{
+	  gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);
+
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Replacing multiplication\n");
+	      print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
+	      fprintf (dump_file, "with\n");
+	      print_gimple_stmt (dump_file, new_stmt, 0, TDF_NONE);
+	      fprintf (dump_file, "\n");
+	    }
+	  gimple_assign_set_rhs_from_tree (&gsi2, mult_ssa_name);
+	  fold_stmt_inplace (&gsi2);
+	  update_stmt (mult_stmt);
+	}
+    }
+
+  if (has_other_use)
+    {
+      /* Using the two temporaries tmp1, tmp2 from above
+	 the original x is now:
+	 x = tmp1 * tmp2.  */
+      gcc_assert (mult_ssa_name);
+      gcc_assert (sqr_ssa_name);
+      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
+
+      tree new_ssa_name
+	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_transformed");
+      gimple *new_stmt
+	= gimple_build_assign (new_ssa_name, MULT_EXPR,
+			       mult_ssa_name, sqr_ssa_name);
+      gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT);
+      gcc_assert (gsi_stmt (gsi2) == stmt);
+      gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name);
+      fold_stmt (&gsi2);
+      update_stmt (stmt);
+    }
+}
+
 /* Return an internal function that implements the reciprocal of CALL,
    or IFN_LAST if there is no such function that the target supports.  */
 
@@ -762,6 +953,23 @@  pass_cse_reciprocals::execute (function *fun)
       if (optimize_bb_for_size_p (bb))
         continue;
 
+      if (flag_unsafe_math_optimizations)
+	{
+	  for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
+	       !gsi_end_p (gsi);
+	       gsi_next (&gsi))
+	    {
+	      gimple *stmt = gsi_stmt (gsi);
+
+	      if (gimple_has_lhs (stmt)
+		  && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
+		  && FLOAT_TYPE_P (TREE_TYPE (def))
+		  && TREE_CODE (def) == SSA_NAME
+		  && is_gimple_assign (stmt)
+		  && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
+		optimize_recip_sqrt (&gsi, def);
+	    }
+	}
       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
 	   gsi_next (&gsi))