Add FMA_EXPR, un-cse multiplications before expansion

Message ID 4CC1FE2F.4020802@redhat.com
State New

Commit Message

Richard Henderson Oct. 22, 2010, 9:12 p.m. UTC
On 10/21/2010 02:41 AM, Richard Guenther wrote:
> +   /* Leave single uses to the RTL combiner; we otherwise regress
> +      in some circumstances.  */
> +   if (single_imm_use (mul_result, &use_p, &use_stmt))
> +     return false;

Like what?  We can't honor those PAREN_EXPR rules in rtl, and
we can't leave this to combine without that.

Ignoring that, consider the following addition to your patch,
which handles the negate during expansion via 4 optabs.


r~

Patch

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d80be88..b61a3b5 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1834,7 +1834,7 @@ 
 
 ;; Intrinsic FMA operations.
 
-;; The standard name for fma is only available with SSE math enabled.
+;; The standard names for fma are only available with SSE math enabled.
 (define_expand "fma<mode>4"
   [(set (match_operand:FMAMODE 0 "register_operand")
 	(fma:FMAMODE
@@ -1844,6 +1844,33 @@ 
   "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH"
   "")
 
+(define_expand "fms<mode>4"
+  [(set (match_operand:FMAMODE 0 "register_operand")
+	(fma:FMAMODE
+	  (match_operand:FMAMODE 1 "nonimmediate_operand")
+	  (match_operand:FMAMODE 2 "nonimmediate_operand")
+	  (neg:FMAMODE (match_operand:FMAMODE 3 "nonimmediate_operand"))))]
+  "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH"
+  "")
+
+(define_expand "fnma<mode>4"
+  [(set (match_operand:FMAMODE 0 "register_operand")
+	(fma:FMAMODE
+	  (neg:FMAMODE (match_operand:FMAMODE 1 "nonimmediate_operand"))
+	  (match_operand:FMAMODE 2 "nonimmediate_operand")
+	  (match_operand:FMAMODE 3 "nonimmediate_operand")))]
+  "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH"
+  "")
+
+(define_expand "fnms<mode>4"
+  [(set (match_operand:FMAMODE 0 "register_operand")
+	(fma:FMAMODE
+	  (neg:FMAMODE (match_operand:FMAMODE 1 "nonimmediate_operand"))
+	  (match_operand:FMAMODE 2 "nonimmediate_operand")
+	  (neg:FMAMODE (match_operand:FMAMODE 3 "nonimmediate_operand"))))]
+  "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH"
+  "")
+
 ;; The builtin for fma4intrin.h is not constrained by SSE math enabled.
 (define_expand "fma4i_fmadd_<mode>"
   [(set (match_operand:FMAMODE 0 "register_operand")
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 6de4f36..8418564 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -3958,6 +3958,36 @@  pattern is used to implement the @code{fma}, @code{fmaf}, and
 multiply followed by the add if the machine does not perform a
 rounding step between the operations.
 
+@cindex @code{fms@var{m}4} instruction pattern
+@item @samp{fms@var{m}4}
+Like @code{fma@var{m}4}, except that operand 3 is subtracted from the
+product instead of added to the product.  This is represented
+in the rtl as
+
+@smallexample
+(fma:@var{m} @var{op1} @var{op2} (neg:@var{m} @var{op3}))
+@end smallexample
+
+@cindex @code{fnma@var{m}4} instruction pattern
+@item @samp{fnma@var{m}4}
+Like @code{fma@var{m}4} except that the intermediate product
+is negated before being added to operand 3.  This is represented
+in the rtl as
+
+@smallexample
+(fma:@var{m} (neg:@var{m} @var{op1}) @var{op2} @var{op3})
+@end smallexample
+
+@cindex @code{fnms@var{m}4} instruction pattern
+@item @samp{fnms@var{m}4}
+Like @code{fms@var{m}4} except that the intermediate product
+is negated before operand 3 is subtracted.  This is represented
+in the rtl as
+
+@smallexample
+(fma:@var{m} (neg:@var{m} @var{op1}) @var{op2} (neg:@var{m} @var{op3}))
+@end smallexample
+
 @cindex @code{min@var{m}3} instruction pattern
 @cindex @code{max@var{m}3} instruction pattern
 @item @samp{smin@var{m}3}, @samp{smax@var{m}3}
diff --git a/gcc/expr.c b/gcc/expr.c
index 56f6eda..f29f6dc 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -7254,7 +7254,7 @@  expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
   int ignore;
   bool reduce_bit_field;
   location_t loc = ops->location;
-  tree treeop0, treeop1;
+  tree treeop0, treeop1, treeop2;
 #define REDUCE_BIT_FIELD(expr)	(reduce_bit_field			  \
 				 ? reduce_to_bit_field_precision ((expr), \
 								  target, \
@@ -7267,6 +7267,7 @@  expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
 
   treeop0 = ops->op0;
   treeop1 = ops->op1;
+  treeop2 = ops->op2;
 
   /* We should be called only on simple (binary or unary) expressions,
      exactly those that are valid in gimple expressions that aren't
@@ -7624,7 +7625,7 @@  expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
     case WIDEN_MULT_PLUS_EXPR:
     case WIDEN_MULT_MINUS_EXPR:
       expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
-      op2 = expand_normal (ops->op2);
+      op2 = expand_normal (treeop2);
       target = expand_widen_pattern_expr (ops, op0, op1, op2,
 					  target, unsignedp);
       return target;
@@ -7711,6 +7712,46 @@  expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
       expand_operands (treeop0, treeop1, subtarget, &op0, &op1, EXPAND_NORMAL);
       return REDUCE_BIT_FIELD (expand_mult (mode, op0, op1, target, unsignedp));
 
+    case FMA_EXPR:
+      {
+	optab opt = fma_optab;
+	gimple def0, def2;
+
+	def0 = get_def_for_expr (treeop0, NEGATE_EXPR);
+	def2 = get_def_for_expr (treeop2, NEGATE_EXPR);
+
+	op0 = op2 = NULL;
+
+	if (def0 && def2
+	    && optab_handler (fnms_optab, mode) != CODE_FOR_nothing)
+	  {
+	    opt = fnms_optab;
+	    op0 = expand_normal (gimple_assign_rhs1 (def0));
+	    op2 = expand_normal (gimple_assign_rhs1 (def2));
+	  }
+	else if (def0
+		 && optab_handler (fnma_optab, mode) != CODE_FOR_nothing)
+	  {
+	    opt = fnma_optab;
+	    op0 = expand_normal (gimple_assign_rhs1 (def0));
+	  }
+	else if (def2
+		 && optab_handler (fms_optab, mode) != CODE_FOR_nothing)
+	  {
+	    opt = fms_optab;
+	    op2 = expand_normal (gimple_assign_rhs1 (def2));
+	  }
+
+	if (op0 == NULL)
+	  op0 = expand_expr (treeop0, subtarget, VOIDmode, EXPAND_NORMAL);
+	if (op2 == NULL)
+	  op2 = expand_normal (treeop2);
+	op1 = expand_normal (treeop1);
+
+	return expand_ternary_op (TYPE_MODE (type), opt,
+				  op0, op1, op2, target, 0);
+      }
+
     case MULT_EXPR:
       /* If this is a fixed-point operation, then we cannot use the code
 	 below because "expand_mult" doesn't support sat/no-sat fixed-point
diff --git a/gcc/genopinit.c b/gcc/genopinit.c
index 6e0a714..eee9ef8 100644
--- a/gcc/genopinit.c
+++ b/gcc/genopinit.c
@@ -160,6 +160,9 @@  static const char * const optabs[] =
   "set_optab_handler (floor_optab, $A, CODE_FOR_$(floor$a2$))",
   "set_convert_optab_handler (lfloor_optab, $B, $A, CODE_FOR_$(lfloor$F$a$I$b2$))",
   "set_optab_handler (fma_optab, $A, CODE_FOR_$(fma$a4$))",
+  "set_optab_handler (fms_optab, $A, CODE_FOR_$(fms$a4$))",
+  "set_optab_handler (fnma_optab, $A, CODE_FOR_$(fnma$a4$))",
+  "set_optab_handler (fnms_optab, $A, CODE_FOR_$(fnms$a4$))",
   "set_optab_handler (ceil_optab, $A, CODE_FOR_$(ceil$a2$))",
   "set_convert_optab_handler (lceil_optab, $B, $A, CODE_FOR_$(lceil$F$a$I$b2$))",
   "set_optab_handler (round_optab, $A, CODE_FOR_$(round$a2$))",
diff --git a/gcc/gimple-pretty-print.c b/gcc/gimple-pretty-print.c
index c74dd0e..057f35b 100644
--- a/gcc/gimple-pretty-print.c
+++ b/gcc/gimple-pretty-print.c
@@ -400,6 +400,14 @@  dump_ternary_rhs (pretty_printer *buffer, gimple gs, int spc, int flags)
       pp_character (buffer, '>');
       break;
 
+    case FMA_EXPR:
+      dump_generic_node (buffer, gimple_assign_rhs1 (gs), spc, flags, false);
+      pp_string (buffer, " * ");
+      dump_generic_node (buffer, gimple_assign_rhs2 (gs), spc, flags, false);
+      pp_string (buffer, " + ");
+      dump_generic_node (buffer, gimple_assign_rhs3 (gs), spc, flags, false);
+      break;
+
     default:
       gcc_unreachable ();
     }
diff --git a/gcc/gimple.c b/gcc/gimple.c
index dea0b83..c008ec0 100644
--- a/gcc/gimple.c
+++ b/gcc/gimple.c
@@ -2530,7 +2530,8 @@  get_gimple_rhs_num_ops (enum tree_code code)
       || (SYM) == TRUTH_XOR_EXPR) ? GIMPLE_BINARY_RHS			    \
    : (SYM) == TRUTH_NOT_EXPR ? GIMPLE_UNARY_RHS				    \
    : ((SYM) == WIDEN_MULT_PLUS_EXPR					    \
-      || (SYM) == WIDEN_MULT_MINUS_EXPR) ? GIMPLE_TERNARY_RHS		    \
+      || (SYM) == WIDEN_MULT_MINUS_EXPR					    \
+      || (SYM) == FMA_EXPR) ? GIMPLE_TERNARY_RHS			    \
    : ((SYM) == COND_EXPR						    \
       || (SYM) == CONSTRUCTOR						    \
       || (SYM) == OBJ_TYPE_REF						    \
diff --git a/gcc/optabs.h b/gcc/optabs.h
index 8b9c9a7..c4dfa60 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -192,6 +192,9 @@  enum optab_index
   OTI_atan2,
   /* Floating multiply/add */
   OTI_fma,
+  OTI_fms,
+  OTI_fnma,
+  OTI_fnms,
 
   /* Move instruction.  */
   OTI_mov,
@@ -435,6 +438,9 @@  enum optab_index
 #define pow_optab (&optab_table[OTI_pow])
 #define atan2_optab (&optab_table[OTI_atan2])
 #define fma_optab (&optab_table[OTI_fma])
+#define fms_optab (&optab_table[OTI_fms])
+#define fnma_optab (&optab_table[OTI_fnma])
+#define fnms_optab (&optab_table[OTI_fnms])
 
 #define mov_optab (&optab_table[OTI_mov])
 #define movstrict_optab (&optab_table[OTI_movstrict])
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index bffa679..2eefd90 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -3655,6 +3655,20 @@  verify_gimple_assign_ternary (gimple stmt)
 	}
       break;
 
+    case FMA_EXPR:
+      if (!useless_type_conversion_p (lhs_type, rhs1_type)
+	  || !useless_type_conversion_p (lhs_type, rhs2_type)
+	  || !useless_type_conversion_p (lhs_type, rhs3_type))
+	{
+	  error ("type mismatch in fused multiply-add expression");
+	  debug_generic_expr (lhs_type);
+	  debug_generic_expr (rhs1_type);
+	  debug_generic_expr (rhs2_type);
+	  debug_generic_expr (rhs3_type);
+	  return true;
+	}
+      break;
+
     default:
       gcc_unreachable ();
     }
diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c
index 133d916..96e0eb4 100644
--- a/gcc/tree-inline.c
+++ b/gcc/tree-inline.c
@@ -3284,6 +3284,7 @@  estimate_operator_cost (enum tree_code code, eni_weights *weights,
     case POINTER_PLUS_EXPR:
     case MINUS_EXPR:
     case MULT_EXPR:
+    case FMA_EXPR:
 
     case ADDR_SPACE_CONVERT_EXPR:
     case FIXED_CONVERT_EXPR:
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index a814f6f..4ee5080 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -1494,6 +1494,112 @@  convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
   return true;
 }
 
+/* Combine the multiplication at MUL_STMT with uses in additions and
+   subtractions to form fused multiply-add operations.  Returns true
+   if successful and MUL_STMT should be removed.  */
+
+static bool
+convert_mult_to_fma (gimple mul_stmt)
+{
+  tree mul_result = gimple_assign_lhs (mul_stmt);
+  tree type = TREE_TYPE (mul_result);
+  gimple use_stmt, fma_stmt;
+  use_operand_p use_p;
+  imm_use_iterator imm_iter;
+
+  /* If the target doesn't support it, don't generate it.
+     ???  We have no way of querying support for the various variants
+     with negated operands, so for the following we simply assume
+     they are all available ((-a)*b+c, a*b-c and (-a)*b-c).  */
+  if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
+    return false;
+
+  /* We don't want to do bitfield reduction ops.  */
+  if (INTEGRAL_TYPE_P (type)
+      && (TYPE_PRECISION (type)
+	  != GET_MODE_PRECISION (TYPE_MODE (type))))
+    return false;
+
+  /* Leave single uses to the RTL combiner; we otherwise regress
+     in some circumstances.  */
+  if (single_imm_use (mul_result, &use_p, &use_stmt))
+    return false;
+
+  /* Make sure that the multiplication statement becomes dead after
+     the transformation, i.e. that all uses are transformed to FMAs.
+     This means we assume that an FMA operation has the same cost
+     as an addition.  */
+  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
+    {
+      enum tree_code use_code;
+
+      use_stmt = USE_STMT (use_p);
+
+      if (!is_gimple_assign (use_stmt))
+	return false;
+      use_code = gimple_assign_rhs_code (use_stmt);
+      /* ???  Handle NEGATE_EXPR.  */
+      if (use_code != PLUS_EXPR
+	  && use_code != MINUS_EXPR)
+	return false;
+
+      /* We can't handle a * b + a * b.  */
+      if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
+	return false;
+
+      /* For now restrict this operation to single basic blocks.  In theory
+	 we would want to support sinking the multiplication in
+	 m = a*b;
+	 if ()
+	   ma = m + c;
+	 else
+	   d = m;
+	 to form an FMA in the then block and sink the multiplication to the
+	 else block.  */
+      if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
+	return false;
+    }
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
+    {
+      tree addop, mulop1;
+      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
+
+      mulop1 = gimple_assign_rhs1 (mul_stmt);
+      if (gimple_assign_rhs1 (use_stmt) == mul_result)
+	{
+	  addop = gimple_assign_rhs2 (use_stmt);
+	  /* a * b - c -> a * b + (-c)  */
+	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
+	    addop = force_gimple_operand_gsi (&gsi,
+					      build1 (NEGATE_EXPR,
+						      type, addop),
+					      true, NULL_TREE, true,
+					      GSI_SAME_STMT);
+	}
+      else
+	{
+	  addop = gimple_assign_rhs1 (use_stmt);
+	  /* a - b * c -> (-b) * c + a */
+	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
+	    mulop1 = force_gimple_operand_gsi (&gsi,
+					       build1 (NEGATE_EXPR,
+						       type, mulop1),
+					       true, NULL_TREE, true,
+					       GSI_SAME_STMT);
+	}
+
+      fma_stmt = gimple_build_assign_with_ops3 (FMA_EXPR,
+						gimple_assign_lhs (use_stmt),
+						mulop1,
+						gimple_assign_rhs2 (mul_stmt),
+						addop);
+      gsi_replace (&gsi, fma_stmt, true);
+    }
+
+  return true;
+}
+
 /* Find integer multiplications where the operands are extended from
    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
    where appropriate.  */
@@ -1501,31 +1607,45 @@  convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
 static unsigned int
 execute_optimize_widening_mul (void)
 {
-  bool changed = false;
   basic_block bb;
 
   FOR_EACH_BB (bb)
     {
       gimple_stmt_iterator gsi;
 
-      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
         {
 	  gimple stmt = gsi_stmt (gsi);
 	  enum tree_code code;
 
-	  if (!is_gimple_assign (stmt))
-	    continue;
+	  if (is_gimple_assign (stmt))
+	    {
+	      code = gimple_assign_rhs_code (stmt);
+	      switch (code)
+		{
+		case MULT_EXPR:
+		  if (!convert_mult_to_widen (stmt)
+		      && convert_mult_to_fma (stmt))
+		    {
+		      gsi_remove (&gsi, true);
+		      release_defs (stmt);
+		      continue;
+		    }
+		  break;
 
-	  code = gimple_assign_rhs_code (stmt);
-	  if (code == MULT_EXPR)
-	    changed |= convert_mult_to_widen (stmt);
-	  else if (code == PLUS_EXPR || code == MINUS_EXPR)
-	    changed |= convert_plusminus_to_widen (&gsi, stmt, code);
+		case PLUS_EXPR:
+		case MINUS_EXPR:
+		  convert_plusminus_to_widen (&gsi, stmt, code);
+		  break;
+
+		default:;
+		}
+	    }
+	  gsi_next (&gsi);
 	}
     }
 
-  return (changed ? TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
-	  | TODO_verify_stmts : 0);
+  return 0;
 }
 
 static bool
@@ -1549,6 +1669,9 @@  struct gimple_opt_pass pass_optimize_widening_mul =
   0,					/* properties_provided */
   0,					/* properties_destroyed */
   0,					/* todo_flags_start */
-  0                                     /* todo_flags_finish */
+  TODO_verify_ssa
+  | TODO_verify_stmts
+  | TODO_dump_func
+  | TODO_update_ssa                     /* todo_flags_finish */
  }
 };
diff --git a/gcc/tree.def b/gcc/tree.def
index 24729e8..791d699 100644
--- a/gcc/tree.def
+++ b/gcc/tree.def
@@ -1092,6 +1092,12 @@  DEFTREECODE (WIDEN_MULT_PLUS_EXPR, "widen_mult_plus_expr", tcc_expression, 3)
    is subtracted from t3.  */
 DEFTREECODE (WIDEN_MULT_MINUS_EXPR, "widen_mult_plus_expr", tcc_expression, 3)
 
+/* Fused multiply-add.
+   All operands and the result are of the same type.  No intermediate
+   rounding is performed after multiplying operand one with operand two
+   before adding operand three.  */
+DEFTREECODE (FMA_EXPR, "fma_expr", tcc_expression, 3)
+
 /* Whole vector left/right shift in bits.
    Operand 0 is a vector to be shifted.
    Operand 1 is an integer shift amount in bits.  */
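
As a sketch of the GIMPLE-level effect (illustrative only, not part
of the patch): given a multiplication with two eligible uses in the
same block,

  m_1 = a_2 * b_3;
  t_4 = m_1 + c_5;
  u_6 = d_7 - m_1;

convert_mult_to_fma rewrites both uses, the negate for the MINUS
case is forced into a separate statement, and the multiplication
becomes dead:

  t_4 = a_2 * b_3 + c_5;
  n_8 = -a_2;
  u_6 = n_8 * b_3 + d_7;

where the first and last statements are FMA_EXPRs as printed by the
new dump_ternary_rhs code.  A single-use multiplication is left to
combine, per the check quoted at the top.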