Patchwork [ia64] Add fma insn patterns

login
register
mail settings
Submitter Richard Henderson
Date Oct. 18, 2010, 6:40 p.m.
Message ID <4CBC9481.808@redhat.com>
Download mbox | patch
Permalink /patch/68227/
State New
Headers show

Comments

Richard Henderson - Oct. 18, 2010, 6:40 p.m.
As with powerpc, as distinguished from that achievable with -mfused-madd.

Tested on ia64-linux.  Steve, can you make sure I got the dejaspoo right
for hpux?  Given that we don't have a __builtin_fma variant that operates
on __float80 as opposed to long double, I can't see that it's possible to
test that case there.


r~
Richard Henderson - Oct. 18, 2010, 7:15 p.m.
> +;; This insn is officially "-(a * b) + c" which is "-((a * b) - c)",
> +;; except for the sign of a zero result.  E.g.
> +;;	-(1 * 0) + 0 => -(0) + 0 => -0 + 0 => +0
> +;;	-((1 * 0) - 0) => -(0 - 0) => -(0) => -0
> +(define_insn "*nfmssf4"
> +  [(set (match_operand:SF 0 "fr_register_operand" "=f")
> +	(neg:SF
> +	 (fma:SF (match_operand:SF 1 "fr_reg_or_fp01_operand" "fG")
> +		 (match_operand:SF 2 "fr_reg_or_fp01_operand" "fG")
> +		 (neg:SF
> +		  (match_operand:SF 3 "fr_reg_or_signed_fp01_operand" "fZ")))))]
> +  "!flag_signed_zeros"
> +  "fnma.s %0 = %F1, %F2, %F3"
> +  [(set_attr "itanium_class" "fmac")])

I'm not sure why I didn't see it before but

  -(a * b) + c

simplifies better to 

  (-a) * b + c

i.e.

  [(set (match_operand 0)
	(fma (neg (match_operand 1))
	     (match_operand 2)
	     (match_operand 3)))]

I could either replace the pattern above (assuming that some simplification
routine could be induced to handle it), or leave it and add the new pattern.

Thoughts?


r~
Richard Guenther - Oct. 19, 2010, 2:31 p.m.
On Mon, 18 Oct 2010, Richard Henderson wrote:

> > +;; This insn is officially "-(a * b) + c" which is "-((a * b) - c)",
> > +;; except for the sign of a zero result.  E.g.
> > +;;	-(1 * 0) + 0 => -(0) + 0 => -0 + 0 => +0
> > +;;	-((1 * 0) - 0) => -(0 - 0) => -(0) => -0
> > +(define_insn "*nfmssf4"
> > +  [(set (match_operand:SF 0 "fr_register_operand" "=f")
> > +	(neg:SF
> > +	 (fma:SF (match_operand:SF 1 "fr_reg_or_fp01_operand" "fG")
> > +		 (match_operand:SF 2 "fr_reg_or_fp01_operand" "fG")
> > +		 (neg:SF
> > +		  (match_operand:SF 3 "fr_reg_or_signed_fp01_operand" "fZ")))))]
> > +  "!flag_signed_zeros"
> > +  "fnma.s %0 = %F1, %F2, %F3"
> > +  [(set_attr "itanium_class" "fmac")])
> 
> I'm not sure why I didn't see it before but
> 
>   -(a * b) + c
> 
> simplifies better to 
> 
>   (-a) * b + c
> 
> i.e.
> 
>   [(set (match_operand 0)
> 	(fma (neg (match_operand 1))
> 	     (match_operand 2)
> 	     (match_operand 3)))]
> 
> I could either replace the pattern above (assuming that some simplification
> routine could be induced to handle it), or leave it and add the new pattern.
> 
> Thoughts?

On trees I also will end up with (-a) * b + c and expect the expander
to grok it.  So that indeed sounds better.  Though I can't convince
expand / combine to generate vfmsubps from

float r[256];  /* Test case: r = x * y - z, element by element.  */
float x[256];
float y[256];
float z[256];
void foo (void)
{
  int i;
  for (i = 0; i < 256; ++i)
    {
      r[i] = x[i] * y[i] - z[i];  /* the multiply-subtract wanted as vfmsubps */
    }
}

it seems that expand already generates

(insn 31 30 32 4 (set (reg:V8SF 99)
        (xor:V8SF (reg:V8SF 97)
            (reg:V8SF 100))) t.c:10 -1
     (expr_list:REG_EQUAL (neg:V8SF (reg:V8SF 97))
        (nil)))

(insn 32 31 33 4 (set (reg:V8SF 98)
        (fma:V8SF (reg:V8SF 89 [ vect_var_.15 ])
            (reg:V8SF 88 [ vect_var_.10 ])
            (reg:V8SF 99))) t.c:10 -1
     (nil))

from my gen_rtx_NEG.  Well, preliminary patch below.

Richard.


Index: gcc/tree.def
===================================================================
*** gcc/tree.def.orig	2010-10-14 14:00:03.000000000 +0200
--- gcc/tree.def	2010-10-19 14:36:28.000000000 +0200
*************** DEFTREECODE (WIDEN_MULT_PLUS_EXPR, "wide
*** 1092,1097 ****
--- 1092,1103 ----
     is subtracted from t3.  */
  DEFTREECODE (WIDEN_MULT_MINUS_EXPR, "widen_mult_plus_expr", tcc_expression, 3)
  
+ /* Fused multiply-add.
+    All operands and the result are of the same type.  No intermediate
+    rounding is performed after multiplying operand one with operand two
+    before adding operand three.  */
+ DEFTREECODE (FMA_EXPR, "fma_expr", tcc_expression, 3)
+ 
  /* Whole vector left/right shift in bits.
     Operand 0 is a vector to be shifted.
     Operand 1 is an integer shift amount in bits.  */
Index: gcc/expr.c
===================================================================
*** gcc/expr.c.orig	2010-10-14 10:33:08.000000000 +0200
--- gcc/expr.c	2010-10-19 16:12:55.000000000 +0200
*************** expand_expr_real_2 (sepops ops, rtx targ
*** 7254,7260 ****
    int ignore;
    bool reduce_bit_field;
    location_t loc = ops->location;
!   tree treeop0, treeop1;
  #define REDUCE_BIT_FIELD(expr)	(reduce_bit_field			  \
  				 ? reduce_to_bit_field_precision ((expr), \
  								  target, \
--- 7254,7260 ----
    int ignore;
    bool reduce_bit_field;
    location_t loc = ops->location;
!   tree treeop0, treeop1, treeop2;
  #define REDUCE_BIT_FIELD(expr)	(reduce_bit_field			  \
  				 ? reduce_to_bit_field_precision ((expr), \
  								  target, \
*************** expand_expr_real_2 (sepops ops, rtx targ
*** 7267,7272 ****
--- 7267,7273 ----
  
    treeop0 = ops->op0;
    treeop1 = ops->op1;
+   treeop2 = ops->op2;
  
    /* We should be called only on simple (binary or unary) expressions,
       exactly those that are valid in gimple expressions that aren't
*************** expand_expr_real_2 (sepops ops, rtx targ
*** 7624,7630 ****
      case WIDEN_MULT_PLUS_EXPR:
      case WIDEN_MULT_MINUS_EXPR:
        expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
!       op2 = expand_normal (ops->op2);
        target = expand_widen_pattern_expr (ops, op0, op1, op2,
  					  target, unsignedp);
        return target;
--- 7625,7631 ----
      case WIDEN_MULT_PLUS_EXPR:
      case WIDEN_MULT_MINUS_EXPR:
        expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
!       op2 = expand_normal (treeop2);
        target = expand_widen_pattern_expr (ops, op0, op1, op2,
  					  target, unsignedp);
        return target;
*************** expand_expr_real_2 (sepops ops, rtx targ
*** 7711,7716 ****
--- 7712,7743 ----
        expand_operands (treeop0, treeop1, subtarget, &op0, &op1, EXPAND_NORMAL);
        return REDUCE_BIT_FIELD (expand_mult (mode, op0, op1, target, unsignedp));
  
+     case FMA_EXPR:
+       {
+ 	gimple def;
+ 	def = get_def_for_expr (treeop0, NEGATE_EXPR);
+ 	if (def)
+ 	  {
+ 	    op0 = expand_normal (gimple_assign_rhs1 (def));
+ 	    op0 = force_reg (mode, op0);
+ 	    op0 = gen_rtx_NEG (mode, op0);
+ 	  }
+ 	else
+ 	  op0 = expand_expr (treeop0, subtarget, VOIDmode, EXPAND_NORMAL);
+ 	op1 = expand_normal (treeop1);
+ 	def = get_def_for_expr (treeop2, NEGATE_EXPR);
+ 	if (def)
+ 	  {
+ 	    op2 = expand_normal (gimple_assign_rhs1 (def));
+ 	    op2 = force_reg (mode, op2);
+ 	    op2 = gen_rtx_NEG (mode, op2);
+ 	  }
+ 	else
+ 	  op2 = expand_normal (treeop2);
+ 	return expand_ternary_op (TYPE_MODE (type), fma_optab,
+ 				  op0, op1, op2, target, 0);
+       }
+ 
      case MULT_EXPR:
        /* If this is a fixed-point operation, then we cannot use the code
  	 below because "expand_mult" doesn't support sat/no-sat fixed-point
Index: gcc/gimple.c
===================================================================
*** gcc/gimple.c.orig	2010-10-14 14:00:03.000000000 +0200
--- gcc/gimple.c	2010-10-19 14:36:28.000000000 +0200
*************** get_gimple_rhs_num_ops (enum tree_code c
*** 2528,2534 ****
        || (SYM) == TRUTH_XOR_EXPR) ? GIMPLE_BINARY_RHS			    \
     : (SYM) == TRUTH_NOT_EXPR ? GIMPLE_UNARY_RHS				    \
     : ((SYM) == WIDEN_MULT_PLUS_EXPR					    \
!       || (SYM) == WIDEN_MULT_MINUS_EXPR) ? GIMPLE_TERNARY_RHS		    \
     : ((SYM) == COND_EXPR						    \
        || (SYM) == CONSTRUCTOR						    \
        || (SYM) == OBJ_TYPE_REF						    \
--- 2528,2535 ----
        || (SYM) == TRUTH_XOR_EXPR) ? GIMPLE_BINARY_RHS			    \
     : (SYM) == TRUTH_NOT_EXPR ? GIMPLE_UNARY_RHS				    \
     : ((SYM) == WIDEN_MULT_PLUS_EXPR					    \
!       || (SYM) == WIDEN_MULT_MINUS_EXPR					    \
!       || (SYM) == FMA_EXPR) ? GIMPLE_TERNARY_RHS			    \
     : ((SYM) == COND_EXPR						    \
        || (SYM) == CONSTRUCTOR						    \
        || (SYM) == OBJ_TYPE_REF						    \
Index: gcc/tree-cfg.c
===================================================================
*** gcc/tree-cfg.c.orig	2010-10-19 12:24:32.000000000 +0200
--- gcc/tree-cfg.c	2010-10-19 14:36:28.000000000 +0200
*************** verify_gimple_assign_ternary (gimple stm
*** 3749,3754 ****
--- 3749,3768 ----
  	}
        break;
  
+     case FMA_EXPR:
+       if (!useless_type_conversion_p (lhs_type, rhs1_type)
+ 	  || !useless_type_conversion_p (lhs_type, rhs2_type)
+ 	  || !useless_type_conversion_p (lhs_type, rhs3_type))
+ 	{
+ 	  error ("type mismatch in fused multiply-add expression");
+ 	  debug_generic_expr (lhs_type);
+ 	  debug_generic_expr (rhs1_type);
+ 	  debug_generic_expr (rhs2_type);
+ 	  debug_generic_expr (rhs3_type);
+ 	  return true;
+ 	} 
+       break;
+ 
      default:
        gcc_unreachable ();
      }
Index: gcc/tree-inline.c
===================================================================
*** gcc/tree-inline.c.orig	2010-10-19 12:24:32.000000000 +0200
--- gcc/tree-inline.c	2010-10-19 14:36:28.000000000 +0200
*************** estimate_operator_cost (enum tree_code c
*** 3284,3289 ****
--- 3284,3290 ----
      case POINTER_PLUS_EXPR:
      case MINUS_EXPR:
      case MULT_EXPR:
+     case FMA_EXPR:
  
      case ADDR_SPACE_CONVERT_EXPR:
      case FIXED_CONVERT_EXPR:
Index: gcc/gimple-pretty-print.c
===================================================================
*** gcc/gimple-pretty-print.c.orig	2010-08-30 15:45:07.000000000 +0200
--- gcc/gimple-pretty-print.c	2010-10-19 15:02:41.000000000 +0200
*************** dump_ternary_rhs (pretty_printer *buffer
*** 400,405 ****
--- 400,413 ----
        pp_character (buffer, '>');
        break;
  
+     case FMA_EXPR:
+       dump_generic_node (buffer, gimple_assign_rhs1 (gs), spc, flags, false);
+       pp_string (buffer, " * ");
+       dump_generic_node (buffer, gimple_assign_rhs2 (gs), spc, flags, false);
+       pp_string (buffer, " + ");
+       dump_generic_node (buffer, gimple_assign_rhs3 (gs), spc, flags, false);
+       break;
+ 
      default:
        gcc_unreachable ();
      }
Index: gcc/tree-ssa-math-opts.c
===================================================================
*** gcc/tree-ssa-math-opts.c.orig	2010-09-24 13:58:07.000000000 +0200
--- gcc/tree-ssa-math-opts.c	2010-10-19 16:15:07.000000000 +0200
*************** convert_plusminus_to_widen (gimple_stmt_
*** 1494,1499 ****
--- 1494,1601 ----
    return true;
  }
  
+ /* Combine the multiplication at MUL_STMT with uses in additions and
+    subtractions to form fused multiply-add operations.  Returns true
+    if successful.  */
+ 
+ static bool
+ convert_mult_to_fma (gimple mul_stmt)
+ {
+   tree mul_result = gimple_assign_lhs (mul_stmt);
+   tree type = TREE_TYPE (mul_result);
+   gimple use_stmt, fma_stmt;
+   use_operand_p use_p;
+   imm_use_iterator imm_iter;
+ 
+   /* If the target doesn't support it, don't generate it.  */
+   if (optab_handler (fma_optab, TYPE_MODE (type))
+       == CODE_FOR_nothing)
+     return false;
+ 
+   /* We don't want to do bitfield reduction ops.  */
+   if (INTEGRAL_TYPE_P (type)
+       && (TYPE_PRECISION (type)
+ 	  != GET_MODE_PRECISION (TYPE_MODE (type))))
+     return false;
+ 
+   /* Make sure that the multiplication statement becomes dead after
+      the transformation, thus that all uses are transformed to FMAs.
+      This means we assume that an FMA operation has the same cost
+      as an addition.  */
+   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
+     {
+       enum tree_code use_code;
+ 
+       use_stmt = USE_STMT (use_p);
+ 
+       if (!is_gimple_assign (use_stmt))
+ 	return false;
+       use_code = gimple_assign_rhs_code (use_stmt);
+       /* ???  Handle MINUS_EXPR and NEGATE_EXPR.  We have no way to
+          query HW support for this though.  */
+       if (use_code != PLUS_EXPR
+ 	  && use_code != MINUS_EXPR)
+ 	return false;
+ 
+       /* We can't handle a * b + a * b.  */
+       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
+ 	return false;
+ 
+       /* For now restrict this operation to single basic blocks.  In theory
+ 	 we would want to support sinking the multiplication in
+ 	 m = a*b;
+ 	 if ()
+ 	 ma = m + c;
+ 	 else
+ 	 d = m;
+ 	 to form a fma in the then block and sink the multiplication to the
+ 	 else block.  */
+       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
+ 	return false;
+     }
+ 
+   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
+     {
+       tree addop, mulop1;
+       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
+ 
+       mulop1 = gimple_assign_rhs1 (mul_stmt);
+       if (gimple_assign_rhs1 (use_stmt) == mul_result)
+ 	{
+ 	  addop = gimple_assign_rhs2 (use_stmt);
+ 	  /* a * b - c -> a * b + (-c)  */
+ 	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
+ 	    addop = force_gimple_operand_gsi (&gsi,
+ 					      build1 (NEGATE_EXPR,
+ 						      type, addop),
+ 					      true, NULL_TREE, true,
+ 					      GSI_SAME_STMT);
+ 	}
+       else
+ 	{
+ 	  addop = gimple_assign_rhs1 (use_stmt);
+ 	  /* a - b * c -> (-b) * c + a */
+ 	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
+ 	    mulop1 = force_gimple_operand_gsi (&gsi,
+ 					       build1 (NEGATE_EXPR,
+ 						       type, mulop1),
+ 					       true, NULL_TREE, true,
+ 					       GSI_SAME_STMT);
+ 	}
+ 
+       fma_stmt = gimple_build_assign_with_ops3 (FMA_EXPR,
+ 						gimple_assign_lhs (use_stmt),
+ 						mulop1,
+ 						gimple_assign_rhs2 (mul_stmt),
+ 						addop);
+       gsi_replace (&gsi, fma_stmt, true);
+     }
+ 
+   /* ???  Remove the current statement if it is unused.  */
+ 
+   return true;
+ }
+ 
  /* Find integer multiplications where the operands are extended from
     smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
     where appropriate.  */
*************** convert_plusminus_to_widen (gimple_stmt_
*** 1501,1507 ****
  static unsigned int
  execute_optimize_widening_mul (void)
  {
-   bool changed = false;
    basic_block bb;
  
    FOR_EACH_BB (bb)
--- 1603,1608 ----
*************** execute_optimize_widening_mul (void)
*** 1518,1531 ****
  
  	  code = gimple_assign_rhs_code (stmt);
  	  if (code == MULT_EXPR)
! 	    changed |= convert_mult_to_widen (stmt);
  	  else if (code == PLUS_EXPR || code == MINUS_EXPR)
! 	    changed |= convert_plusminus_to_widen (&gsi, stmt, code);
  	}
      }
  
!   return (changed ? TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
! 	  | TODO_verify_stmts : 0);
  }
  
  static bool
--- 1619,1634 ----
  
  	  code = gimple_assign_rhs_code (stmt);
  	  if (code == MULT_EXPR)
! 	    {
! 	      if (!convert_mult_to_widen (stmt))
! 		convert_mult_to_fma (stmt);
! 	    }
  	  else if (code == PLUS_EXPR || code == MINUS_EXPR)
! 	    convert_plusminus_to_widen (&gsi, stmt, code);
  	}
      }
  
!   return 0;
  }
  
  static bool
*************** struct gimple_opt_pass pass_optimize_wid
*** 1549,1554 ****
    0,					/* properties_provided */
    0,					/* properties_destroyed */
    0,					/* todo_flags_start */
!   0                                     /* todo_flags_finish */
   }
  };
--- 1652,1660 ----
    0,					/* properties_provided */
    0,					/* properties_destroyed */
    0,					/* todo_flags_start */
!   TODO_verify_ssa
!   | TODO_verify_stmts
!   | TODO_dump_func
!   | TODO_update_ssa                     /* todo_flags_finish */
   }
  };
Steve Ellcey - Oct. 19, 2010, 3:49 p.m.
On Mon, 2010-10-18 at 11:40 -0700, Richard Henderson wrote:
> As with powerpc, as distinguished from that achievable with -mfused-madd.
> 
> Tested on ia64-linux.  Steve, can you make sure I got the dejaspoo right
> for hpux?  Given that we don't have a __builtin_fma variant that operates
> on __float80 as opposed to long double, I can't see that it's possible to
> test that case there.
> 
> 
> r~

Yes, the HP-UX stuff looks good.  I applied the patch to my local tree
and ran a bootstrap and test last night and everything passed.

Steve Ellcey
sje@cup.hp.com

Patch

Index: testsuite/gcc.target/ia64/builtin-fma-1.c
===================================================================
--- testsuite/gcc.target/ia64/builtin-fma-1.c	(revision 0)
+++ testsuite/gcc.target/ia64/builtin-fma-1.c	(revision 0)
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+/* Don't confuse the fma insn with the fma in the filename.  */
+/* { dg-final { scan-assembler-times "fma\\." 2 } } */
+/* { dg-final { scan-assembler-times "fms" 4 } } */
+/* { dg-final { scan-assembler-times "fnma." 0 } } */
+
+#ifndef __FP_FAST_FMAF
+# error "__FP_FAST_FMAF should be defined"
+#endif
+#ifndef __FP_FAST_FMA
+# error "__FP_FAST_FMA should be defined"
+#endif
+
+float f0(float x, float y, float z) { return __builtin_fmaf(x,y,z); }
+float f1(float x, float y, float z) { return __builtin_fmaf(x,y,-z); }
+float f2(float x, float y, float z) { return -__builtin_fmaf(x,y,-z); }
+
+double d0(double x, double y, double z) { return __builtin_fma(x,y,z); }
+double d1(double x, double y, double z) { return __builtin_fma(x,y,-z); }
+double d2(double x, double y, double z) { return -__builtin_fma(x,y,-z); }
Index: testsuite/gcc.target/ia64/builtin-fma-2.c
===================================================================
--- testsuite/gcc.target/ia64/builtin-fma-2.c	(revision 0)
+++ testsuite/gcc.target/ia64/builtin-fma-2.c	(revision 0)
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O -fno-signed-zeros" } */
+/* Don't confuse the fma insn with the fma in the filename.  */
+/* { dg-final { scan-assembler-times "fma\\." 2 } } */
+/* { dg-final { scan-assembler-times "fms" 2 } } */
+/* { dg-final { scan-assembler-times "fnma" 2 } } */
+
+float f0(float x, float y, float z) { return __builtin_fmaf(x,y,z); }
+float f1(float x, float y, float z) { return __builtin_fmaf(x,y,-z); }
+float f2(float x, float y, float z) { return -__builtin_fmaf(x,y,-z); }
+
+double d0(double x, double y, double z) { return __builtin_fma(x,y,z); }
+double d1(double x, double y, double z) { return __builtin_fma(x,y,-z); }
+double d2(double x, double y, double z) { return -__builtin_fma(x,y,-z); }
Index: testsuite/gcc.target/ia64/builtin-fma-3.c
===================================================================
--- testsuite/gcc.target/ia64/builtin-fma-3.c	(revision 0)
+++ testsuite/gcc.target/ia64/builtin-fma-3.c	(revision 0)
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-skip-if "128-bit long double" { *-*-hpux* } { "*" } { "" } } */
+/* { dg-options "-O" } */
+/* Don't confuse the fma insn with the fma in the filename.  */
+/* { dg-final { scan-assembler-times "fma\[ 	\]" 1 } } */
+/* { dg-final { scan-assembler-times "fms" 2 } } */
+/* { dg-final { scan-assembler-times "fnma" 0 } } */
+
+#ifndef __FP_FAST_FMAL
+# error "__FP_FAST_FMAL should be defined"
+#endif
+
+typedef long double LD;
+
+LD f0(LD x, LD y, LD z) { return __builtin_fmal(x,y,z); }
+LD f1(LD x, LD y, LD z) { return __builtin_fmal(x,y,-z); }
+LD f2(LD x, LD y, LD z) { return -__builtin_fmal(x,y,-z); }
Index: testsuite/gcc.target/ia64/builtin-fma-4.c
===================================================================
--- testsuite/gcc.target/ia64/builtin-fma-4.c	(revision 0)
+++ testsuite/gcc.target/ia64/builtin-fma-4.c	(revision 0)
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-skip-if "128-bit long double" { *-*-hpux* } { "*" } { "" } } */
+/* { dg-options "-O -fno-signed-zeros" } */
+/* Don't confuse the fma insn with the fma in the filename.  */
+/* { dg-final { scan-assembler-times "fma\[ 	\]" 1 } } */
+/* { dg-final { scan-assembler-times "fms" 1 } } */
+/* { dg-final { scan-assembler-times "fnma" 1 } } */
+
+#ifndef __FP_FAST_FMAL
+# error "__FP_FAST_FMAL should be defined"
+#endif
+
+typedef long double LD;
+
+LD f0(LD x, LD y, LD z) { return __builtin_fmal(x,y,z); }
+LD f1(LD x, LD y, LD z) { return __builtin_fmal(x,y,-z); }
+LD f2(LD x, LD y, LD z) { return -__builtin_fmal(x,y,-z); }
Index: config/ia64/ia64.md
===================================================================
--- config/ia64/ia64.md	(revision 165650)
+++ config/ia64/ia64.md	(working copy)
@@ -2791,6 +2791,41 @@ 
   "TARGET_FUSED_MADD"
   "fnma.s %0 = %F1, %F2, %F3"
   [(set_attr "itanium_class" "fmac")])
+
+;; Official C99 versions of the fmaf family of operations.
+(define_insn "fmasf4"
+  [(set (match_operand:SF 0 "fr_register_operand" "=f")
+	(fma:SF (match_operand:SF 1 "fr_reg_or_fp01_operand" "fG")
+		(match_operand:SF 2 "fr_reg_or_fp01_operand" "fG")
+		(match_operand:SF 3 "fr_reg_or_signed_fp01_operand" "fZ")))]
+  ""
+  "fma.s %0 = %F1, %F2, %F3"
+  [(set_attr "itanium_class" "fmac")])
+
+(define_insn "*fmssf4"
+  [(set (match_operand:SF 0 "fr_register_operand" "=f")
+	(fma:SF (match_operand:SF 1 "fr_reg_or_fp01_operand" "fG")
+		(match_operand:SF 2 "fr_reg_or_fp01_operand" "fG")
+		(neg:SF
+		  (match_operand:SF 3 "fr_reg_or_signed_fp01_operand" "fZ"))))]
+  ""
+  "fms.s %0 = %F1, %F2, %F3"
+  [(set_attr "itanium_class" "fmac")])
+
+;; This insn is officially "-(a * b) + c" which is "-((a * b) - c)",
+;; except for the sign of a zero result.  E.g.
+;;	-(1 * 0) + 0 => -(0) + 0 => -0 + 0 => +0
+;;	-((1 * 0) - 0) => -(0 - 0) => -(0) => -0
+(define_insn "*nfmssf4"
+  [(set (match_operand:SF 0 "fr_register_operand" "=f")
+	(neg:SF
+	 (fma:SF (match_operand:SF 1 "fr_reg_or_fp01_operand" "fG")
+		 (match_operand:SF 2 "fr_reg_or_fp01_operand" "fG")
+		 (neg:SF
+		  (match_operand:SF 3 "fr_reg_or_signed_fp01_operand" "fZ")))))]
+  "!flag_signed_zeros"
+  "fnma.s %0 = %F1, %F2, %F3"
+  [(set_attr "itanium_class" "fmac")])
 
 ;; ::::::::::::::::::::
 ;; ::
@@ -2977,6 +3012,38 @@ 
   "TARGET_FUSED_MADD"
   "fnma.s %0 = %F1, %F2, %F3"
   [(set_attr "itanium_class" "fmac")])
+
+;; Official C99 versions of the fma family of operations.
+(define_insn "fmadf4"
+  [(set (match_operand:DF 0 "fr_register_operand" "=f")
+	(fma:DF (match_operand:DF 1 "fr_reg_or_fp01_operand" "fG")
+		(match_operand:DF 2 "fr_reg_or_fp01_operand" "fG")
+		(match_operand:DF 3 "fr_reg_or_signed_fp01_operand" "fZ")))]
+  ""
+  "fma.d %0 = %F1, %F2, %F3"
+  [(set_attr "itanium_class" "fmac")])
+
+(define_insn "*fmsdf4"
+  [(set (match_operand:DF 0 "fr_register_operand" "=f")
+	(fma:DF (match_operand:DF 1 "fr_reg_or_fp01_operand" "fG")
+		(match_operand:DF 2 "fr_reg_or_fp01_operand" "fG")
+		(neg:DF
+		  (match_operand:DF 3 "fr_reg_or_signed_fp01_operand" "fZ"))))]
+  ""
+  "fms.d %0 = %F1, %F2, %F3"
+  [(set_attr "itanium_class" "fmac")])
+
+;; See comment for nfmssf4.
+(define_insn "*nfmsdf4"
+  [(set (match_operand:DF 0 "fr_register_operand" "=f")
+	(neg:DF
+	 (fma:DF (match_operand:DF 1 "fr_reg_or_fp01_operand" "fG")
+		 (match_operand:DF 2 "fr_reg_or_fp01_operand" "fG")
+		 (neg:DF
+		  (match_operand:DF 3 "fr_reg_or_signed_fp01_operand" "fZ")))))]
+  "!flag_signed_zeros"
+  "fnma.d %0 = %F1, %F2, %F3"
+  [(set_attr "itanium_class" "fmac")])
 
 ;; ::::::::::::::::::::
 ;; ::
@@ -3234,6 +3301,38 @@ 
   "TARGET_FUSED_MADD"
   "fnma.d %0 = %F1, %F2, %F3"
   [(set_attr "itanium_class" "fmac")])
+
+;; Official C99 versions of the fmal family of operations.
+(define_insn "fmaxf4"
+  [(set (match_operand:XF 0 "fr_register_operand" "=f")
+	(fma:XF (match_operand:XF 1 "fr_reg_or_fp01_operand" "fG")
+		(match_operand:XF 2 "fr_reg_or_fp01_operand" "fG")
+		(match_operand:XF 3 "fr_reg_or_signed_fp01_operand" "fZ")))]
+  ""
+  "fma %0 = %F1, %F2, %F3"
+  [(set_attr "itanium_class" "fmac")])
+
+(define_insn "*fmsxf4"
+  [(set (match_operand:XF 0 "fr_register_operand" "=f")
+	(fma:XF (match_operand:XF 1 "fr_reg_or_fp01_operand" "fG")
+		(match_operand:XF 2 "fr_reg_or_fp01_operand" "fG")
+		(neg:XF
+		  (match_operand:XF 3 "fr_reg_or_signed_fp01_operand" "fZ"))))]
+  ""
+  "fms %0 = %F1, %F2, %F3"
+  [(set_attr "itanium_class" "fmac")])
+
+;; See comment for nfmssf4.
+(define_insn "*nfmsxf4"
+  [(set (match_operand:XF 0 "fr_register_operand" "=f")
+	(neg:XF
+	 (fma:XF (match_operand:XF 1 "fr_reg_or_fp01_operand" "fG")
+		 (match_operand:XF 2 "fr_reg_or_fp01_operand" "fG")
+		 (neg:XF
+		  (match_operand:XF 3 "fr_reg_or_signed_fp01_operand" "fZ")))))]
+  "!flag_signed_zeros"
+  "fnma %0 = %F1, %F2, %F3"
+  [(set_attr "itanium_class" "fmac")])
 
 ;; ::::::::::::::::::::
 ;; ::