Patchwork RFA: Fix tree-optimization/55524

login
register
mail settings
Submitter Joern Rennecke
Date April 8, 2013, 3:10 p.m.
Message ID <20130408111056.gvvtnyc8w04cww8g-nzlynne@webmail.spamcop.net>
Download mbox | patch
Permalink /patch/234804/
State New
Headers show

Comments

Joern Rennecke - April 8, 2013, 3:10 p.m.
This is basically the same patch as attached to the PR, except that I
have changed the goto-loop into a do-while loop with a new comment;
this caused the need for a lot of reformatting.

bootstrapped & regtested on i686-pc-linux-gnu.
2013-04-08  Joern Rennecke  <joern.rennecke@embecosm.com>

	* tree-ssa-math-opts.c (mult_to_fma_pass): New file static struct.
	(convert_mult_to_fma): In first pass, don't use an fms construct
	when we don't have an fms operation, but fnma.
	(execute_optimize_widening_mul): Add a second pass if
	convert_mult_to_fma requests it.
Richard Guenther - April 9, 2013, 8:47 a.m.
On Mon, Apr 8, 2013 at 5:10 PM, Joern Rennecke
<joern.rennecke@embecosm.com> wrote:
> This is basically the same patch as attached to the PR, except that I
> have changed the goto-loop into a do-while loop with a new comment;
> this caused the need for a lot of reformatting.

Can you please include a testcase that shows the effect of this?

> bootstrapped & regtested on i686-pc-linux-gnu.
>
> 2013-04-08  Joern Rennecke  <joern.rennecke@embecosm.com>
>
>         * tree-ssa-math-opts.c (mult_to_fma_pass): New file static struct.
>         (convert_mult_to_fma): In first pass, don't use an fms construct
>         when we don't have an fms operation, but fmna.

it's fnma I believe.

>         (execute_optimize_widening_mul): Add a second pass if
>         convert_mult_to_fma requests it.
>
> Index: gcc/tree-ssa-math-opts.c
> ===================================================================
> --- gcc/tree-ssa-math-opts.c    (revision 197578)
> +++ gcc/tree-ssa-math-opts.c    (working copy)
> @@ -2461,6 +2461,12 @@ convert_plusminus_to_widen (gimple_stmt_
>    return true;
>  }
>
> +static struct
> +{
> +  bool second_pass;
> +  bool retry_request;
> +} mult_to_fma_pass;
> +
>  /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
>     with uses in additions and subtractions to form fused multiply-add
>     operations.  Returns true if successful and MUL_STMT should be removed.
> */
> @@ -2570,6 +2576,22 @@ convert_mult_to_fma (gimple mul_stmt, tr
>           return false;
>         }
>
> +      /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
> +        by a MULT_EXPR that we'll visit later, we might be able to
> +        get a more profitable match with fnma.
> +        OTOH, if we don't, a negate / fma pair has likely lower latency
> +        than a mult / subtract pair.  */

This makes it sound that this is purely an ordering issue, thus for

  x = a * b;
  y = c * d;
  z = x - y;

instead of

  y = c * d;
  z = a * b + (-y);

you want to generate

  x = a * b;
  z = -(c * d + (-x));

I fail to see why you need two passes for this rather than considering the
case that the immediate use stmt of the multiplication we start from
combines another multiplication with a MINUS_EXPR.  That is ...

> +      if (use_code == MINUS_EXPR && !negate_p
> +         && gimple_assign_rhs1 (use_stmt) == result
> +         && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
> +         && optab_handler (fnma_optab, TYPE_MODE (type)) !=
> CODE_FOR_nothing
> +         && mult_to_fma_pass.second_pass == false)

see if that is the case here and simply not do the transform.  A second pass
will not recover from that without destroying the fnma pattern (testcase?)

Richard.

> +       {
> +         /* ??? Could make setting of retry_request dependent on some
> +            rtx_cost measure we evaluate beforehand.  */
> +         mult_to_fma_pass.retry_request = true;
> +         return false;
> +       }
>        /* We can't handle a * b + a * b.  */
>        if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
>         return false;
> @@ -2657,76 +2679,89 @@ execute_optimize_widening_mul (void)
>
>    memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
>
> -  FOR_EACH_BB (bb)
> -    {
> -      gimple_stmt_iterator gsi;
>
> -      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
> -        {
> -         gimple stmt = gsi_stmt (gsi);
> -         enum tree_code code;
> +  /* We may run one or two passes.  In the first pass, if we have fnma,
> +     but not fms, we don't synthesize fms so that we can get the maximum
> +     matches for fnma.  If we have therefore skipped opportunities to
> +     synthesize fms, we'll run a second pass where we use any such
> +     opportunities that still remain.  */
> +  mult_to_fma_pass.retry_request = false;
> +  do
> +    {
> +      mult_to_fma_pass.second_pass = mult_to_fma_pass.retry_request;
> +      FOR_EACH_BB (bb)
> +       {
> +         gimple_stmt_iterator gsi;
>
> -         if (is_gimple_assign (stmt))
> +         for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
>             {
> -             code = gimple_assign_rhs_code (stmt);
> -             switch (code)
> +             gimple stmt = gsi_stmt (gsi);
> +             enum tree_code code;
> +
> +             if (is_gimple_assign (stmt))
>                 {
> -               case MULT_EXPR:
> -                 if (!convert_mult_to_widen (stmt, &gsi)
> -                     && convert_mult_to_fma (stmt,
> -                                             gimple_assign_rhs1 (stmt),
> -                                             gimple_assign_rhs2 (stmt)))
> +                 code = gimple_assign_rhs_code (stmt);
> +                 switch (code)
>                     {
> -                     gsi_remove (&gsi, true);
> -                     release_defs (stmt);
> -                     continue;
> -                   }
> -                 break;
> -
> -               case PLUS_EXPR:
> -               case MINUS_EXPR:
> -                 convert_plusminus_to_widen (&gsi, stmt, code);
> -                 break;
> +                   case MULT_EXPR:
> +                     if (!convert_mult_to_widen (stmt, &gsi)
> +                         && convert_mult_to_fma (stmt,
> +                                                 gimple_assign_rhs1 (stmt),
> +                                                 gimple_assign_rhs2
> (stmt)))
> +                       {
> +                         gsi_remove (&gsi, true);
> +                         release_defs (stmt);
> +                         continue;
> +                       }
> +                     break;
> +
> +                   case PLUS_EXPR:
> +                   case MINUS_EXPR:
> +                     convert_plusminus_to_widen (&gsi, stmt, code);
> +                     break;
>
> -               default:;
> +                   default:;
> +                   }
>                 }
> -           }
> -         else if (is_gimple_call (stmt)
> -                  && gimple_call_lhs (stmt))
> -           {
> -             tree fndecl = gimple_call_fndecl (stmt);
> -             if (fndecl
> -                 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
> +             else if (is_gimple_call (stmt)
> +                      && gimple_call_lhs (stmt))
>                 {
> -                 switch (DECL_FUNCTION_CODE (fndecl))
> +                 tree fndecl = gimple_call_fndecl (stmt);
> +                 if (fndecl
> +                     && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
>                     {
> -                     case BUILT_IN_POWF:
> -                     case BUILT_IN_POW:
> -                     case BUILT_IN_POWL:
> -                       if (TREE_CODE (gimple_call_arg (stmt, 1)) ==
> REAL_CST
> -                           && REAL_VALUES_EQUAL
> -                                (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
> -                                 dconst2)
> -                           && convert_mult_to_fma (stmt,
> -                                                   gimple_call_arg (stmt,
> 0),
> -                                                   gimple_call_arg (stmt,
> 0)))
> -                         {
> -                           unlink_stmt_vdef (stmt);
> -                           if (gsi_remove (&gsi, true)
> -                               && gimple_purge_dead_eh_edges (bb))
> -                             cfg_changed = true;
> -                           release_defs (stmt);
> -                           continue;
> -                         }
> +                     switch (DECL_FUNCTION_CODE (fndecl))
> +                       {
> +                       case BUILT_IN_POWF:
> +                       case BUILT_IN_POW:
> +                       case BUILT_IN_POWL:
> +                         if ((TREE_CODE (gimple_call_arg (stmt, 1))
> +                              == REAL_CST)
> +                             && (REAL_VALUES_EQUAL
> +                                 (TREE_REAL_CST (gimple_call_arg (stmt,
> 1)),
> +                                  dconst2))
> +                             && (convert_mult_to_fma
> +                                 (stmt, gimple_call_arg (stmt, 0),
> +                                  gimple_call_arg (stmt, 0))))
> +                           {
> +                             unlink_stmt_vdef (stmt);
> +                             if (gsi_remove (&gsi, true)
> +                                 && gimple_purge_dead_eh_edges (bb))
> +                               cfg_changed = true;
> +                             release_defs (stmt);
> +                             continue;
> +                           }
>                           break;
>
> -                     default:;
> +                         default:;
> +                       }
>                     }
>                 }
> +             gsi_next (&gsi);
>             }
> -         gsi_next (&gsi);
>         }
>      }
> +  while (!mult_to_fma_pass.second_pass && mult_to_fma_pass.retry_request);
>
>    statistics_counter_event (cfun, "widening multiplications inserted",
>                             widen_mul_stats.widen_mults_inserted);
>
Joern Rennecke - April 9, 2013, 2:53 p.m.
Quoting Richard Biener <richard.guenther@gmail.com>:

Oops, I missed your final comment when I wrote my first reply.

> I fail to see why you need two passes for this rather than considering the
> case that the immediate use stmt of the multiplication we start from
> combines another multiplication with a MINUS_EXPR.  That is ...
>
>> +      if (use_code == MINUS_EXPR && !negate_p
>> +         && gimple_assign_rhs1 (use_stmt) == result
>> +         && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
>> +         && optab_handler (fnma_optab, TYPE_MODE (type)) !=
>> CODE_FOR_nothing
>> +         && mult_to_fma_pass.second_pass == false)
>
> see if that is the case here and simply not do the transform.

In other words, I would have to take gimple_assign_rhs2 of use_stmt, then
find the statement that sets that value, check if this is a multiply,
and that it is not expanded as a widening multiply (thus duplicating most of
the code in execute_optimize_widening_mul), and also check that if there are
any other uses of the multiply result, that these are likewise reduced to
fma somehow.  At which point we hit recursion.  This could easily double
the size of this source file.  Also, we are alternating between forward
and backward searches, which could lead to quadratic complexity where
we had linear otherwise.

> A second pass
> will not recover from that without destroying the fnma pattern (testcase?)

The second pass is there to generate the fms where we didn't generate fnma.
If fnma (as negate (fma)) has been generated, it will use up the MINUS_EXPR,
so that fms cannot be formed anymore.
This arrangement is a lot simpler than going for a depth-first decision.
Richard Guenther - April 9, 2013, 3:09 p.m.
On Tue, Apr 9, 2013 at 4:53 PM, Joern Rennecke
<joern.rennecke@embecosm.com> wrote:
> Quoting Richard Biener <richard.guenther@gmail.com>:
>
> Oops, I missed your final comment when I wrote my first reply.
>
>
>> I fail to see why you need two passes for this rather than considering the
>> case that the immediate use stmt of the multiplication we start from
>> combines another multiplication with a MINUS_EXPR.  That is ...
>>
>>> +      if (use_code == MINUS_EXPR && !negate_p
>>> +         && gimple_assign_rhs1 (use_stmt) == result
>>> +         && optab_handler (fms_optab, TYPE_MODE (type)) ==
>>> CODE_FOR_nothing
>>> +         && optab_handler (fnma_optab, TYPE_MODE (type)) !=
>>> CODE_FOR_nothing
>>> +         && mult_to_fma_pass.second_pass == false)
>>
>>
>> see if that is the case here and simply not do the transform.
>
>
> In other words, I would have to take gimple_assign_rhs2 of use_stmt, then
> find the statement that sets that value, check if this is a multiply,
> and that it is not expanded as a widening multiply (thus duplicating most of
> the code in execute_optimize_widening_mul), and also check that if there are
> any other uses of the multiply result, that these are likewise reduced to
> fma somehow.  At which point we hit recursion.  This could easily double
> the size of this source file.  Also, we are alternating between forward
> and backward searches, which could lead to quadratic complexity where
> we had linear otherwise.

I don't see that.  It's merely a complication of optimal handling of
a * b +- c * d vs. just a * b +- c.  The pass does simple pattern matching
only, not doing a global optimal transform, so adding another special-case
is reasonable.  Special-casing just for single-use 2nd multiplication
simplifies the cases for example.

Richard.

>> A second pass
>> will not recover from that without destroying the fnma pattern (testcase?)
>
>
> The second pass is there to generate the fms where we didn't generate fnma.
> If fnma (as negate (fma)) has been generated, it will use up the MINUS_EXPR,
> so that fms cannot be formed anymore.
> This arrangement is a lot simpler than going for a depth-first decision.

Patch

Index: gcc/tree-ssa-math-opts.c
===================================================================
--- gcc/tree-ssa-math-opts.c	(revision 197578)
+++ gcc/tree-ssa-math-opts.c	(working copy)
@@ -2461,6 +2461,12 @@  convert_plusminus_to_widen (gimple_stmt_
   return true;
 }
 
+static struct 
+{
+  bool second_pass;
+  bool retry_request;
+} mult_to_fma_pass;
+
 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
    with uses in additions and subtractions to form fused multiply-add
    operations.  Returns true if successful and MUL_STMT should be removed.  */
@@ -2570,6 +2576,22 @@  convert_mult_to_fma (gimple mul_stmt, tr
 	  return false;
 	}
 
+      /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
+	 by a MULT_EXPR that we'll visit later, we might be able to
+	 get a more profitable match with fnma.
+	 OTOH, if we don't, a negate / fma pair has likely lower latency
+	 than a mult / subtract pair.  */
+      if (use_code == MINUS_EXPR && !negate_p
+	  && gimple_assign_rhs1 (use_stmt) == result
+	  && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
+	  && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing
+	  && mult_to_fma_pass.second_pass == false)
+	{
+	  /* ??? Could make setting of retry_request dependent on some
+	     rtx_cost measure we evaluate beforehand.  */
+	  mult_to_fma_pass.retry_request = true;
+	  return false;
+	}
       /* We can't handle a * b + a * b.  */
       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
 	return false;
@@ -2657,76 +2679,89 @@  execute_optimize_widening_mul (void)
 
   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
 
-  FOR_EACH_BB (bb)
-    {
-      gimple_stmt_iterator gsi;
 
-      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
-        {
-	  gimple stmt = gsi_stmt (gsi);
-	  enum tree_code code;
+  /* We may run one or two passes.  In the first pass, if we have fnma,
+     but not fms, we don't synthesize fms so that we can get the maximum
+     matches for fnma.  If we have therefore skipped opportunities to
+     synthesize fms, we'll run a second pass where we use any such
+     opportunities that still remain.  */
+  mult_to_fma_pass.retry_request = false;
+  do
+    {
+      mult_to_fma_pass.second_pass = mult_to_fma_pass.retry_request;
+      FOR_EACH_BB (bb)
+	{
+	  gimple_stmt_iterator gsi;
 
-	  if (is_gimple_assign (stmt))
+	  for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
 	    {
-	      code = gimple_assign_rhs_code (stmt);
-	      switch (code)
+	      gimple stmt = gsi_stmt (gsi);
+	      enum tree_code code;
+
+	      if (is_gimple_assign (stmt))
 		{
-		case MULT_EXPR:
-		  if (!convert_mult_to_widen (stmt, &gsi)
-		      && convert_mult_to_fma (stmt,
-					      gimple_assign_rhs1 (stmt),
-					      gimple_assign_rhs2 (stmt)))
+		  code = gimple_assign_rhs_code (stmt);
+		  switch (code)
 		    {
-		      gsi_remove (&gsi, true);
-		      release_defs (stmt);
-		      continue;
-		    }
-		  break;
-
-		case PLUS_EXPR:
-		case MINUS_EXPR:
-		  convert_plusminus_to_widen (&gsi, stmt, code);
-		  break;
+		    case MULT_EXPR:
+		      if (!convert_mult_to_widen (stmt, &gsi)
+			  && convert_mult_to_fma (stmt,
+						  gimple_assign_rhs1 (stmt),
+						  gimple_assign_rhs2 (stmt)))
+			{
+			  gsi_remove (&gsi, true);
+			  release_defs (stmt);
+			  continue;
+			}
+		      break;
+
+		    case PLUS_EXPR:
+		    case MINUS_EXPR:
+		      convert_plusminus_to_widen (&gsi, stmt, code);
+		      break;
 
-		default:;
+		    default:;
+		    }
 		}
-	    }
-	  else if (is_gimple_call (stmt)
-		   && gimple_call_lhs (stmt))
-	    {
-	      tree fndecl = gimple_call_fndecl (stmt);
-	      if (fndecl
-		  && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
+	      else if (is_gimple_call (stmt)
+		       && gimple_call_lhs (stmt))
 		{
-		  switch (DECL_FUNCTION_CODE (fndecl))
+		  tree fndecl = gimple_call_fndecl (stmt);
+		  if (fndecl
+		      && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
 		    {
-		      case BUILT_IN_POWF:
-		      case BUILT_IN_POW:
-		      case BUILT_IN_POWL:
-			if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
-			    && REAL_VALUES_EQUAL
-			         (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
-				  dconst2)
-			    && convert_mult_to_fma (stmt,
-						    gimple_call_arg (stmt, 0),
-						    gimple_call_arg (stmt, 0)))
-			  {
-			    unlink_stmt_vdef (stmt);
-			    if (gsi_remove (&gsi, true)
-				&& gimple_purge_dead_eh_edges (bb))
-			      cfg_changed = true;
-			    release_defs (stmt);
-			    continue;
-			  }
+		      switch (DECL_FUNCTION_CODE (fndecl))
+			{
+			case BUILT_IN_POWF:
+			case BUILT_IN_POW:
+			case BUILT_IN_POWL:
+			  if ((TREE_CODE (gimple_call_arg (stmt, 1))
+			       == REAL_CST)
+			      && (REAL_VALUES_EQUAL
+				  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
+				   dconst2))
+			      && (convert_mult_to_fma
+				  (stmt, gimple_call_arg (stmt, 0),
+				   gimple_call_arg (stmt, 0))))
+			    {
+			      unlink_stmt_vdef (stmt);
+			      if (gsi_remove (&gsi, true)
+				  && gimple_purge_dead_eh_edges (bb))
+				cfg_changed = true;
+			      release_defs (stmt);
+			      continue;
+			    }
 			  break;
 
-		      default:;
+			  default:;
+			}
 		    }
 		}
+	      gsi_next (&gsi);
 	    }
-	  gsi_next (&gsi);
 	}
     }
+  while (!mult_to_fma_pass.second_pass && mult_to_fma_pass.retry_request);
 
   statistics_counter_event (cfun, "widening multiplications inserted",
 			    widen_mul_stats.widen_mults_inserted);