Patchwork RFC for patch to add C99 fma/fmaf/fmal builtins

login
register
mail settings
Submitter Michael Meissner
Date Oct. 4, 2010, 9:02 p.m.
Message ID <20101004210212.GA25595@hungry-tiger.westford.ibm.com>
Download mbox | patch
Permalink /patch/66739/
State New
Headers show

Comments

Michael Meissner - Oct. 4, 2010, 9:02 p.m.
I forgot to include the patch.
Richard Henderson - Oct. 4, 2010, 9:13 p.m.
On 10/04/2010 02:02 PM, Michael Meissner wrote:
> +   (UNSPEC_FMA			59)

I think it would be better to create an FMA rtx code,
since essentially every port will have to add this
unspec code in order to implement this.


r~
Richard Guenther - Oct. 4, 2010, 10:06 p.m.
On Mon, Oct 4, 2010 at 11:13 PM, Richard Henderson <rth@redhat.com> wrote:
> On 10/04/2010 02:02 PM, Michael Meissner wrote:
>> +   (UNSPEC_FMA                       59)
>
> I think it would be better to create an FMA rtx code,
> since essentially every port will have to add this
> unspec code in order to implement this.

Yes, that would be useful.  I also agree with staying with (1), expand to a
libcall if the HW does not provide an IEEE compliant fma.

Richard.

>
> r~
>
Michael Meissner - Oct. 4, 2010, 10:13 p.m.
On Mon, Oct 04, 2010 at 02:13:11PM -0700, Richard Henderson wrote:
> On 10/04/2010 02:02 PM, Michael Meissner wrote:
> > +   (UNSPEC_FMA			59)
> 
> I think it would be better to create an FMA rtx code,
> since essentially every port will have to add this
> unspec code in order to implement this.

Ok, makes sense.  Though I suspect ports will still want to have the standard
combiner(s) to do the merging without using FMA.  I wonder if any code assumes
that RTX_TERNARY class is either IF_THEN_ELSE or VEC_MERGE.

Patch

Index: gcc/doc/md.texi
===================================================================
--- gcc/doc/md.texi	(revision 164946)
+++ gcc/doc/md.texi	(working copy)
@@ -3948,6 +3948,16 @@  means of constraints requiring operands 
 @itemx @samp{and@var{m}3}, @samp{ior@var{m}3}, @samp{xor@var{m}3}
 Similar, for other arithmetic operations.
 
+@cindex @code{fma@var{m}4} instruction pattern
+@item @samp{fma@var{m}4}
+Multiply operand 2 and operand 1, then add operand 3, storing the
+result in operand 0.  All operands must have mode @var{m}.  This
+pattern is used to implement the @code{fma}, @code{fmas}, and
+@code{fmal} builtin functions from the ISO C99 standard.  The
+@code{fma} operation may produce different results than doing the
+multiply followed by the add if the machine does not perform a
+rounding step between the operations.
+
 @cindex @code{min@var{m}3} instruction pattern
 @cindex @code{max@var{m}3} instruction pattern
 @item @samp{smin@var{m}3}, @samp{smax@var{m}3}
Index: gcc/optabs.h
===================================================================
--- gcc/optabs.h	(revision 164946)
+++ gcc/optabs.h	(working copy)
@@ -190,6 +190,8 @@  enum optab_index
   OTI_pow,
   /* Arc tangent of y/x */
   OTI_atan2,
+  /* Floating multiply/add */
+  OTI_fma,
 
   /* Move instruction.  */
   OTI_mov,
@@ -432,6 +434,7 @@  enum optab_index
 #define umax_optab (&optab_table[OTI_umax])
 #define pow_optab (&optab_table[OTI_pow])
 #define atan2_optab (&optab_table[OTI_atan2])
+#define fma_optab (&optab_table[OTI_fma])
 
 #define mov_optab (&optab_table[OTI_mov])
 #define movstrict_optab (&optab_table[OTI_movstrict])
Index: gcc/genopinit.c
===================================================================
--- gcc/genopinit.c	(revision 164946)
+++ gcc/genopinit.c	(working copy)
@@ -159,6 +159,7 @@  static const char * const optabs[] =
   "set_optab_handler (sqrt_optab, $A, CODE_FOR_$(sqrt$a2$))",
   "set_optab_handler (floor_optab, $A, CODE_FOR_$(floor$a2$))",
   "set_convert_optab_handler (lfloor_optab, $B, $A, CODE_FOR_$(lfloor$F$a$I$b2$))",
+  "set_optab_handler (fma_optab, $A, CODE_FOR_$(fma$a4$))",
   "set_optab_handler (ceil_optab, $A, CODE_FOR_$(ceil$a2$))",
   "set_convert_optab_handler (lceil_optab, $B, $A, CODE_FOR_$(lceil$F$a$I$b2$))",
   "set_optab_handler (round_optab, $A, CODE_FOR_$(round$a2$))",
Index: gcc/builtins.c
===================================================================
--- gcc/builtins.c	(revision 164946)
+++ gcc/builtins.c	(working copy)
@@ -106,6 +106,7 @@  static void expand_errno_check (tree, rt
 static rtx expand_builtin_mathfn (tree, rtx, rtx);
 static rtx expand_builtin_mathfn_2 (tree, rtx, rtx);
 static rtx expand_builtin_mathfn_3 (tree, rtx, rtx);
+static rtx expand_builtin_mathfn_ternary (tree, rtx, rtx);
 static rtx expand_builtin_interclass_mathfn (tree, rtx);
 static rtx expand_builtin_sincos (tree);
 static rtx expand_builtin_cexpi (tree, rtx);
@@ -2185,6 +2186,79 @@  expand_builtin_mathfn_2 (tree exp, rtx t
   return target;
 }
 
+/* Expand a call to the builtin trinary math functions (fma).
+   Return NULL_RTX if a normal call should be emitted rather than expanding the
+   function in-line.  EXP is the expression that is a call to the builtin
+   function; if convenient, the result should be placed in TARGET.
+   SUBTARGET may be used as the target for computing one of EXP's
+   operands.  */
+
+static rtx
+expand_builtin_mathfn_ternary (tree exp, rtx target, rtx subtarget)
+{
+  optab builtin_optab;
+  rtx op0, op1, op2, insns;
+  tree fndecl = get_callee_fndecl (exp);
+  tree arg0, arg1, arg2;
+  enum machine_mode mode;
+
+  if (!validate_arglist (exp, REAL_TYPE, REAL_TYPE, REAL_TYPE, VOID_TYPE))
+    return NULL_RTX;
+
+  arg0 = CALL_EXPR_ARG (exp, 0);
+  arg1 = CALL_EXPR_ARG (exp, 1);
+  arg2 = CALL_EXPR_ARG (exp, 2);
+
+  switch (DECL_FUNCTION_CODE (fndecl))
+    {
+    CASE_FLT_FN (BUILT_IN_FMA):
+      builtin_optab = fma_optab; break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Make a suitable register to place result in.  */
+  mode = TYPE_MODE (TREE_TYPE (exp));
+
+  /* Before working hard, check whether the instruction is available.  */
+  if (optab_handler (builtin_optab, mode) == CODE_FOR_nothing)
+    return NULL_RTX;
+
+  target = gen_reg_rtx (mode);
+
+  /* Always stabilize the argument list.  */
+  CALL_EXPR_ARG (exp, 0) = arg0 = builtin_save_expr (arg0);
+  CALL_EXPR_ARG (exp, 1) = arg1 = builtin_save_expr (arg1);
+  CALL_EXPR_ARG (exp, 2) = arg2 = builtin_save_expr (arg2);
+
+  op0 = expand_expr (arg0, subtarget, VOIDmode, EXPAND_NORMAL);
+  op1 = expand_normal (arg1);
+  op2 = expand_normal (arg2);
+
+  start_sequence ();
+
+  /* Compute into TARGET.
+     Set TARGET to wherever the result comes back.  */
+  target = expand_ternary_op (mode, builtin_optab, op0, op1, op2,
+			      target, 0);
+
+  /* If we were unable to expand via the builtin, stop the sequence
+     (without outputting the insns) and call to the library function
+     with the stabilized argument list.  */
+  if (target == 0)
+    {
+      end_sequence ();
+      return expand_call (exp, target, target == const0_rtx);
+    }
+
+  /* Output the entire sequence.  */
+  insns = get_insns ();
+  end_sequence ();
+  emit_insn (insns);
+
+  return target;
+}
+
 /* Expand a call to the builtin sin and cos math functions.
    Return NULL_RTX if a normal call should be emitted rather than expanding the
    function in-line.  EXP is the expression that is a call to the builtin
@@ -5829,6 +5903,12 @@  expand_builtin (tree exp, rtx target, rt
 	return target;
       break;
 
+    CASE_FLT_FN (BUILT_IN_FMA):
+      target = expand_builtin_mathfn_ternary (exp, target, subtarget);
+      if (target)
+	return target;
+      break;
+
     CASE_FLT_FN (BUILT_IN_ILOGB):
       if (! flag_unsafe_math_optimizations)
 	break;
Index: gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c	(revision 0)
@@ -0,0 +1,84 @@ 
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O3 -ftree-vectorize -mcpu=power7 -ffast-math -mno-fused-madd" } */
+/* { dg-final { scan-assembler-times "xvmadd" 2 } } */
+/* { dg-final { scan-assembler-times "xsmadd" 1 } } */
+/* { dg-final { scan-assembler-times "fmadds" 1 } } */
+
+/* Only the functions calling the bulitin should generate an appropriate (a *
+   b) + c instruction.  */
+
+double
+builtin_fma (double b, double c, double d)
+{
+  return __builtin_fma (b, c, d);
+}
+
+float
+builtin_fmaf (float b, float c, float d)
+{
+  return __builtin_fmaf (b, c, d);
+}
+
+double
+normal_fma (double b, double c, double d)
+{
+  return (b * c) + d;
+}
+
+float
+normal_fmaf (float b, float c, float d)
+{
+  return (b * c) + d;
+}
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+double vda[SIZE] __attribute__((__aligned__(32)));
+double vdb[SIZE] __attribute__((__aligned__(32)));
+double vdc[SIZE] __attribute__((__aligned__(32)));
+double vdd[SIZE] __attribute__((__aligned__(32)));
+
+float vfa[SIZE] __attribute__((__aligned__(32)));
+float vfb[SIZE] __attribute__((__aligned__(32)));
+float vfc[SIZE] __attribute__((__aligned__(32)));
+float vfd[SIZE] __attribute__((__aligned__(32)));
+
+void
+vector_fma (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    vda[i] = __builtin_fma (vdb[i], vdc[i], vdd[i]);
+}
+
+void
+vector_fmaf (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]);
+}
+
+void
+vnormal_fma (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    vda[i] = (vdb[i] * vdc[i]) + vdd[i];
+}
+
+void
+vnormal_fmaf (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    vfa[i] = (vfb[i] * vfc[i]) + vfd[i];
+}
Index: gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c	(revision 0)
@@ -0,0 +1,61 @@ 
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_altivec_ok } */
+/* { dg-options "-O3 -ftree-vectorize -mcpu=power6 -maltivec -ffast-math" } */
+/* { dg-final { scan-assembler-times "vmaddfp" 2 } } */
+/* { dg-final { scan-assembler-times "fmadd " 2 } } */
+/* { dg-final { scan-assembler-times "fmadds" 2 } } */
+
+/* All functions should generate an appropriate (a * b) + c instruction
+   since -mfused-madd is on by default.  */
+
+double
+builtin_fma (double b, double c, double d)
+{
+  return __builtin_fma (b, c, d);
+}
+
+float
+builtin_fmaf (float b, float c, float d)
+{
+  return __builtin_fmaf (b, c, d);
+}
+
+double
+normal_fma (double b, double c, double d)
+{
+  return (b * c) + d;
+}
+
+float
+normal_fmaf (float b, float c, float d)
+{
+  return (b * c) + d;
+}
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+float vfa[SIZE] __attribute__((__aligned__(32)));
+float vfb[SIZE] __attribute__((__aligned__(32)));
+float vfc[SIZE] __attribute__((__aligned__(32)));
+float vfd[SIZE] __attribute__((__aligned__(32)));
+
+void
+vector_fmaf (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]);
+}
+
+void
+vnormal_fmaf (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    vfa[i] = (vfb[i] * vfc[i]) + vfd[i];
+}
Index: gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c	(revision 0)
@@ -0,0 +1,61 @@ 
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_altivec_ok } */
+/* { dg-options "-O3 -ftree-vectorize -mcpu=power6 -maltivec -ffast-math -mno-fused-madd" } */
+/* { dg-final { scan-assembler-times "vmaddfp" 1 } } */
+/* { dg-final { scan-assembler-times "fmadd " 1 } } */
+/* { dg-final { scan-assembler-times "fmadds" 1 } } */
+
+/* Only the functions calling the builtin should generate an appropriate
+   (a * b) + c instruction.  */
+
+double
+builtin_fma (double b, double c, double d)
+{
+  return __builtin_fma (b, c, d);
+}
+
+float
+builtin_fmaf (float b, float c, float d)
+{
+  return __builtin_fmaf (b, c, d);
+}
+
+double
+normal_fma (double b, double c, double d)
+{
+  return (b * c) + d;
+}
+
+float
+normal_fmaf (float b, float c, float d)
+{
+  return (b * c) + d;
+}
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+float vfa[SIZE] __attribute__((__aligned__(32)));
+float vfb[SIZE] __attribute__((__aligned__(32)));
+float vfc[SIZE] __attribute__((__aligned__(32)));
+float vfd[SIZE] __attribute__((__aligned__(32)));
+
+void
+vector_fmaf (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]);
+}
+
+void
+vnormal_fmaf (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    vfa[i] = (vfb[i] * vfc[i]) + vfd[i];
+}
Index: gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c	(revision 0)
@@ -0,0 +1,84 @@ 
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O3 -ftree-vectorize -mcpu=power7 -ffast-math" } */
+/* { dg-final { scan-assembler-times "xvmadd" 4 } } */
+/* { dg-final { scan-assembler-times "xsmadd" 2 } } */
+/* { dg-final { scan-assembler-times "fmadds" 2 } } */
+
+/* All functions should generate an appropriate (a * b) + c instruction
+   since -mfused-madd is on by default.  */
+
+double
+builtin_fma (double b, double c, double d)
+{
+  return __builtin_fma (b, c, d);
+}
+
+float
+builtin_fmaf (float b, float c, float d)
+{
+  return __builtin_fmaf (b, c, d);
+}
+
+double
+normal_fma (double b, double c, double d)
+{
+  return (b * c) + d;
+}
+
+float
+normal_fmaf (float b, float c, float d)
+{
+  return (b * c) + d;
+}
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+double vda[SIZE] __attribute__((__aligned__(32)));
+double vdb[SIZE] __attribute__((__aligned__(32)));
+double vdc[SIZE] __attribute__((__aligned__(32)));
+double vdd[SIZE] __attribute__((__aligned__(32)));
+
+float vfa[SIZE] __attribute__((__aligned__(32)));
+float vfb[SIZE] __attribute__((__aligned__(32)));
+float vfc[SIZE] __attribute__((__aligned__(32)));
+float vfd[SIZE] __attribute__((__aligned__(32)));
+
+void
+vector_fma (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    vda[i] = __builtin_fma (vdb[i], vdc[i], vdd[i]);
+}
+
+void
+vector_fmaf (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]);
+}
+
+void
+vnormal_fma (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    vda[i] = (vdb[i] * vdc[i]) + vdd[i];
+}
+
+void
+vnormal_fmaf (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    vfa[i] = (vfb[i] * vfc[i]) + vfd[i];
+}
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	(revision 164946)
+++ gcc/tree-vect-stmts.c	(working copy)
@@ -1357,10 +1357,10 @@  vectorizable_call (gimple stmt, gimple_s
   vectype_in = NULL_TREE;
   nargs = gimple_call_num_args (stmt);
 
-  /* Bail out if the function has more than two arguments, we
-     do not have interesting builtin functions to vectorize with
-     more than two arguments.  No arguments is also not good.  */
-  if (nargs == 0 || nargs > 2)
+  /* Bail out if the function has more than three arguments, we do not have
+     interesting builtin functions to vectorize with more than two arguments
+     except for fma.  No arguments is also not good.  */
+  if (nargs == 0 || nargs > 3)
     return false;
 
   for (i = 0; i < nargs; i++)
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 164946)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -3895,6 +3895,22 @@  rs6000_builtin_vectorized_function (tree
 	  if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
 	    return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIM];
 	  break;
+	case BUILT_IN_FMA:
+	  if (VECTOR_UNIT_VSX_P (V2DFmode)
+	      && out_mode == DFmode && out_n == 2
+	      && in_mode == DFmode && in_n == 2)
+	    return rs6000_builtin_decls[VSX_BUILTIN_XVMADDDP];
+	  break;
+	case BUILT_IN_FMAF:
+	  if (VECTOR_UNIT_VSX_P (V4SFmode)
+	      && out_mode == SFmode && out_n == 4
+	      && in_mode == SFmode && in_n == 4)
+	    return rs6000_builtin_decls[VSX_BUILTIN_XVMADDSP];
+	  else if (VECTOR_UNIT_ALTIVEC_P (V4SFmode)
+	      && out_mode == SFmode && out_n == 4
+	      && in_mode == SFmode && in_n == 4)
+	    return rs6000_builtin_decls[ALTIVEC_BUILTIN_VMADDFP];
+	  break;
 	case BUILT_IN_TRUNC:
 	  if (VECTOR_UNIT_VSX_P (V2DFmode)
 	      && out_mode == DFmode && out_n == 2
Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md	(revision 164946)
+++ gcc/config/rs6000/rs6000.md	(working copy)
@@ -110,6 +110,7 @@  (define_constants
    (UNSPEC_LFIWAX		56)
    (UNSPEC_LFIWZX		57)
    (UNSPEC_FCTIWUZ		58)
+   (UNSPEC_FMA			59)
   ])
 
 ;;
@@ -5844,6 +5845,39 @@  (define_insn "fres"
   "fres %0,%1"
   [(set_attr "type" "fp")])
 
+(define_expand "fmasf4"
+  [(set (match_operand:SF 0 "gpc_reg_operand" "")
+	(plus:SF (mult:SF (match_operand:SF 1 "gpc_reg_operand" "")
+			  (match_operand:SF 2 "gpc_reg_operand" ""))
+		 (match_operand:SF 3 "gpc_reg_operand" "")))]
+  "TARGET_HARD_FLOAT && TARGET_FPRS
+   && ((TARGET_POWERPC && TARGET_SINGLE_FLOAT)
+       || ! TARGET_POWERPC)"
+{
+  if (!TARGET_FUSED_MADD)
+    {
+      if (TARGET_POWERPC)
+	emit_insn (gen_fmasf4_powerpc (operands[0], operands[1], operands[2],
+				       operands[3]));
+      else
+	emit_insn (gen_fmasf4_power (operands[0], operands[1], operands[2],
+				     operands[3]));
+      DONE;
+    }
+})
+
+; Fma builtin that will work even if -mno-fused-madd is used.
+(define_insn "fmasf4_powerpc"
+  [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
+	(unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")
+		    (match_operand:SF 2 "gpc_reg_operand" "f")
+		    (match_operand:SF 3 "gpc_reg_operand" "f")]
+		   UNSPEC_FMA))]
+  "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_POWERPC"
+  "fmadds %0,%1,%2,%3"
+  [(set_attr "type" "fp")
+   (set_attr "fp_type" "fp_maddsub_s")])
+  
 (define_insn "*fmaddsf4_powerpc"
   [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
 	(plus:SF (mult:SF (match_operand:SF 1 "gpc_reg_operand" "%f")
@@ -5855,6 +5889,16 @@  (define_insn "*fmaddsf4_powerpc"
   [(set_attr "type" "fp")
    (set_attr "fp_type" "fp_maddsub_s")])
 
+(define_insn "fmasf4_power"
+  [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
+	(unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")
+		    (match_operand:SF 2 "gpc_reg_operand" "f")
+		    (match_operand:SF 3 "gpc_reg_operand" "f")]
+		   UNSPEC_FMA))]
+  "TARGET_HARD_FLOAT && TARGET_FPRS && !TARGET_POWERPC"
+  "{fma|fmadd} %0,%1,%2,%3"
+  [(set_attr "type" "dmul")])
+  
 (define_insn "*fmaddsf4_power"
   [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
 	(plus:SF (mult:SF (match_operand:SF 1 "gpc_reg_operand" "%f")
@@ -6280,6 +6324,40 @@  (define_insn "*rsqrtdf_internal1"
   "frsqrte %0,%1"
   [(set_attr "type" "fp")])
 
+(define_expand "fmadf4"
+  [(set (match_operand:DF 0 "gpc_reg_operand" "")
+	(plus:DF (mult:DF (match_operand:DF 1 "gpc_reg_operand" "")
+			  (match_operand:DF 2 "gpc_reg_operand" ""))
+		 (match_operand:DF 3 "gpc_reg_operand" "")))]
+  "TARGET_HARD_FLOAT && TARGET_FPRS
+   && ((TARGET_POWERPC && TARGET_SINGLE_FLOAT)
+       || ! TARGET_POWERPC)"
+{
+  if (!TARGET_FUSED_MADD)
+    {
+      if (VECTOR_UNIT_VSX_P (DFmode))
+	emit_insn (gen_vsx_fmadddf4_2 (operands[0], operands[1], operands[2],
+				       operands[3]));
+      else
+	emit_insn (gen_fmadf4_fpr (operands[0], operands[1], operands[2],
+				   operands[3]));
+      DONE;
+    }
+})
+
+; Fma builtin that will work even if -mno-fused-madd is used.
+(define_insn "fmadf4_fpr"
+  [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
+	(unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d")
+		    (match_operand:DF 2 "gpc_reg_operand" "d")
+		    (match_operand:DF 3 "gpc_reg_operand" "d")]
+		   UNSPEC_FMA))]
+  "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT
+   && VECTOR_UNIT_NONE_P (DFmode)"
+  "{fma|fmadd} %0,%1,%2,%3"
+  [(set_attr "type" "dmul")
+   (set_attr "fp_type" "fp_maddsub_d")])
+
 (define_insn "*fmadddf4_fpr"
   [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
 	(plus:DF (mult:DF (match_operand:DF 1 "gpc_reg_operand" "%d")