Patchwork [i386,avx] add fma3 patterns

login
register
mail settings
Submitter Richard Henderson
Date Oct. 19, 2010, 7:56 p.m.
Message ID <4CBDF7F6.1070005@redhat.com>
Download mbox | patch
Permalink /patch/68376/
State New
Headers show

Comments

Richard Henderson - Oct. 19, 2010, 7:56 p.m.
FMA3 aka AVX three-operand FMA.

Tested on x86_64-linux with the included asm scan tests.

HJ, I expect you'll be able to test on real hardware, 
though I expect any Real Tests will have to wait until
all the bits for auto-vectorization get added.  It seems
fairly hard to force all 3 insn alternatives to be 
generated...

Committed.


r~




        * config/i386/i386.c (bdesc_multi_arg): Use fma4i_fmadd_<mode>.
        * config/i386/sse.md (fma<mode>4): Enable for FMA & SSE_MATH.
        (fma4i_fmadd_<mode>): New.
        (*split_fma, *split_fms, *split_fnma, *split_fnms): Rename from
        fma4_fm*_<mode> and adjust to be pre-reload splitters to the 
        standard fma patterns.
        (fmaddsub_<mode>): Rename from fma4i_fmaddsub_<mode> and
        enable for FMA.
        (*fma_fmadd_<mode>, *fma_fmsub_<mode>): New.
        (*fma_fmadd_<mode>, *fma_fmsub_<mode>): New.
        (*fma_fmaddsub_<mode>, *fma_fmsubadd_<mode>): New.

testsuite/
        * gcc.target/i386/fma3-fma.c: New.
        * gcc.target/i386/fma3-builtin.c: New.
        * gcc.target/i386/fma4-builtin.c: New.
H.J. Lu - Oct. 19, 2010, 8 p.m.
On Tue, Oct 19, 2010 at 12:56 PM, Richard Henderson <rth@redhat.com> wrote:
> FMA3 aka AVX three-operand FMA.
>
> Tested on x86_64-linux with the included asm scan tests.
>
> HJ, I expect you'll be able to test on real hardware,
> though I expect any Real Tests will have to wait until
> all the bits for auto-vectorization get added.  It seems
> fairly hard to force all 3 insn alternatives to be
> generated...
>

FMA isn't supported before Haswell.
Richard Guenther - Oct. 20, 2010, 8:51 a.m.
On Tue, Oct 19, 2010 at 9:56 PM, Richard Henderson <rth@redhat.com> wrote:
> FMA3 aka AVX three-operand FMA.
>
> Tested on x86_64-linux with the included asm scan tests.
>
> HJ, I expect you'll be able to test on real hardware,
> though I expect any Real Tests will have to wait until
> all the bits for auto-vectorization get added.  It seems
> fairly hard to force all 3 insn alternatives to be
> generated...

Note that I do not plan to handle FMA in autovectorization (it is
fine to vectorize it piecewise), but instead to rely on combine and only
force un-CSE before expansion.  The only advantage for teaching
the vectorizer about FMA would be better cost estimates.

Richard.

> Committed.
>
>
> r~
>
>
>
>
>        * config/i386/i386.c (bdesc_multi_arg): Use fma4i_fmadd_<mode>.
>        * config/i386/sse.md (fma<mode>4): Enable for FMA & SSE_MATH.
>        (fma4i_fmadd_<mode>): New.
>        (*split_fma, *split_fms, *split_fnma, *split_fnms): Rename from
>        fma4_fm*_<mode> and adjust to be pre-reload splitters to the
>        standard fma patterns.
>        (fmaddsub_<mode>): Rename from fma4i_fmaddsub_<mode> and
>        enable for FMA.
>        (*fma_fmadd_<mode>, *fma_fmsub_<mode>): New.
>        (*fma_fmadd_<mode>, *fma_fmsub_<mode>): New.
>        (*fma_fmaddsub_<mode>, *fma_fmsubadd_<mode>): New.
>
> testsuite/
>        * gcc.target/i386/fma3-fma.c: New.
>        * gcc.target/i386/fma3-builtin.c: New.
>        * gcc.target/i386/fma4-builtin.c: New.
>

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 2f92497..5831956 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,17 @@ 
+2010-10-19  Richard Henderson  <rth@redhat.com>
+
+	* config/i386/i386.c (bdesc_multi_arg): Use fma4i_fmadd_<mode>.
+	* config/i386/sse.md (fma<mode>4): Enable for FMA & SSE_MATH.
+	(fma4i_fmadd_<mode>): New.
+	(*split_fma, *split_fms, *split_fnma, *split_fnms): Rename from
+	fma4_fm*_<mode> and adjust to be pre-reload splitters to the 
+	standard fma patterns.
+	(fmaddsub_<mode>): Rename from fma4i_fmaddsub_<mode> and
+	enable for FMA.
+	(*fma_fmadd_<mode>, *fma_fmsub_<mode>): New.
+	(*fma_fmadd_<mode>, *fma_fmsub_<mode>): New.
+	(*fma_fmaddsub_<mode>, *fma_fmsubadd_<mode>): New.
+
 2010-10-19  Paul Koning  <pkoning@equallogic.com>
 
 	* lower-subreg.c (resolve_shift_zext): Delete conditional code for
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7da2cfb..6668a62 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -23931,18 +23931,38 @@  static const struct builtin_description bdesc_args[] =
 
 static const struct builtin_description bdesc_multi_arg[] =
 {
-  { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,     "__builtin_ia32_vfmaddss",    IX86_BUILTIN_VFMADDSS,    UNKNOWN,      (int)MULTI_ARG_3_SF },
-  { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,     "__builtin_ia32_vfmaddsd",    IX86_BUILTIN_VFMADDSD,    UNKNOWN,      (int)MULTI_ARG_3_DF },
-
-  { OPTION_MASK_ISA_FMA4, CODE_FOR_fmav4sf4,               "__builtin_ia32_vfmaddps",    IX86_BUILTIN_VFMADDPS,    UNKNOWN,   (int)MULTI_ARG_3_SF },
-  { OPTION_MASK_ISA_FMA4, CODE_FOR_fmav2df4,               "__builtin_ia32_vfmaddpd",    IX86_BUILTIN_VFMADDPD,    UNKNOWN,   (int)MULTI_ARG_3_DF },
-  { OPTION_MASK_ISA_FMA4, CODE_FOR_fmav8sf4,               "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256, UNKNOWN,   (int)MULTI_ARG_3_SF2 },
-  { OPTION_MASK_ISA_FMA4, CODE_FOR_fmav4df4,               "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256, UNKNOWN,   (int)MULTI_ARG_3_DF2 },
-
-  { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsub_v4sf,	   "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,        UNKNOWN,  (int)MULTI_ARG_3_SF },
-  { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsub_v2df,	   "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,        UNKNOWN,  (int)MULTI_ARG_3_DF },
-  { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsub_v8sf,	   "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,  UNKNOWN,  (int)MULTI_ARG_3_SF2 },
-  { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsub_v4df,	   "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,  UNKNOWN,  (int)MULTI_ARG_3_DF2 },
+  { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
+    "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
+    UNKNOWN, (int)MULTI_ARG_3_SF },
+  { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
+    "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
+    UNKNOWN, (int)MULTI_ARG_3_DF },
+
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
+    "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
+    UNKNOWN, (int)MULTI_ARG_3_SF },
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
+    "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
+    UNKNOWN, (int)MULTI_ARG_3_DF },
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
+    "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
+    UNKNOWN, (int)MULTI_ARG_3_SF2 },
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
+    "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
+    UNKNOWN, (int)MULTI_ARG_3_DF2 },
+
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
+    "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
+    UNKNOWN, (int)MULTI_ARG_3_SF },
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
+    "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
+    UNKNOWN, (int)MULTI_ARG_3_DF },
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
+    "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
+    UNKNOWN, (int)MULTI_ARG_3_SF2 },
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
+    "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
+    UNKNOWN, (int)MULTI_ARG_3_DF2 },
 
   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di,        "__builtin_ia32_vpcmov",      IX86_BUILTIN_VPCMOV,	 UNKNOWN,      (int)MULTI_ARG_3_DI },
   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di,        "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN,      (int)MULTI_ARG_3_DI },
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d6e1f12..2402c70 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1770,13 +1770,24 @@ 
 
 ;; Intrinsic FMA operations.
 
+;; The standard name for fma is only available with SSE math enabled.
 (define_expand "fma<mode>4"
   [(set (match_operand:FMAMODE 0 "register_operand")
 	(fma:FMAMODE
 	  (match_operand:FMAMODE 1 "nonimmediate_operand")
 	  (match_operand:FMAMODE 2 "nonimmediate_operand")
 	  (match_operand:FMAMODE 3 "nonimmediate_operand")))]
-  "TARGET_FMA4"
+  "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH"
+  "")
+
+;; The builtin for fma4intrin.h is not constrained by SSE math enabled.
+(define_expand "fma4i_fmadd_<mode>"
+  [(set (match_operand:FMAMODE 0 "register_operand")
+	(fma:FMAMODE
+	  (match_operand:FMAMODE 1 "nonimmediate_operand")
+	  (match_operand:FMAMODE 2 "nonimmediate_operand")
+	  (match_operand:FMAMODE 3 "nonimmediate_operand")))]
+  "TARGET_FMA || TARGET_FMA4"
   "")
 
 (define_insn "*fma4i_fmadd_<mode>"
@@ -1904,61 +1915,6 @@ 
   [(set_attr "type" "ssemuladd")
    (set_attr "mode" "<MODE>")])
 
-;; Non-intrinsic versions, matched when fused-multiply-add is allowed.
-
-(define_insn "*fma4_fmadd_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x")
-	(plus:FMAMODE
-	 (mult:FMAMODE
-	  (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x")
-	  (match_operand:FMAMODE 2 "nonimmediate_operand" " x,m"))
-	 (match_operand:FMAMODE 3 "nonimmediate_operand"  "xm,x")))]
-  "TARGET_FMA4 && TARGET_FUSED_MADD"
-  "vfmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-;; Floating multiply and subtract.
-(define_insn "*fma4_fmsub_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x")
-	(minus:FMAMODE
-	 (mult:FMAMODE
-	  (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x")
-	  (match_operand:FMAMODE 2 "nonimmediate_operand" " x,m"))
-	 (match_operand:FMAMODE 3 "nonimmediate_operand"  "xm,x")))]
-  "TARGET_FMA4 && TARGET_FUSED_MADD"
-  "vfmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-;; Floating point negative multiply and add.
-;; Rewrite (- (a * b) + c) into the canonical form: c - (a * b).
-(define_insn "*fma4_fnmadd_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x")
-	(minus:FMAMODE
-	 (match_operand:FMAMODE 3 "nonimmediate_operand"  "xm,x")
-	 (mult:FMAMODE
-	  (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x")
-	  (match_operand:FMAMODE 2 "nonimmediate_operand" " x,m"))))]
-  "TARGET_FMA4 && TARGET_FUSED_MADD"
-  "vfnmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-;; Floating point negative multiply and subtract.
-(define_insn "*fma4_fnmsub_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x")
-	(minus:FMAMODE
-	 (mult:FMAMODE
-	  (neg:FMAMODE
-	   (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x"))
-	  (match_operand:FMAMODE 2 "nonimmediate_operand"  " x,m"))
-	 (match_operand:FMAMODE 3 "nonimmediate_operand"   "xm,x")))]
-  "TARGET_FMA4 && TARGET_FUSED_MADD"
-  "vfnmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; FMA4 Parallel floating point multiply addsub and subadd operations.
@@ -1974,7 +1930,17 @@ 
 ;;
 ;; But this doesn't seem useful in practice.
 
-(define_insn "fma4i_fmaddsub_<mode>"
+(define_expand "fmaddsub_<mode>"
+  [(set (match_operand:AVXMODEF2P 0 "register_operand")
+	(unspec:AVXMODEF2P
+	  [(match_operand:AVXMODEF2P 1 "nonimmediate_operand")
+	   (match_operand:AVXMODEF2P 2 "nonimmediate_operand")
+	   (match_operand:AVXMODEF2P 3 "nonimmediate_operand")]
+	  UNSPEC_FMADDSUB))]
+  "TARGET_FMA || TARGET_FMA4"
+  "")
+
+(define_insn "*fma4_fmaddsub_<mode>"
   [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x,x")
 	(unspec:AVXMODEF2P
 	  [(match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%x,x")
@@ -1984,9 +1950,9 @@ 
   "TARGET_FMA4"
   "vfmaddsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "V8SF")])
+   (set_attr "mode" "<MODE>")])
 
-(define_insn "*fma4i_fmsubadd_<mode>"
+(define_insn "*fma4_fmsubadd_<mode>"
   [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x,x")
 	(unspec:AVXMODEF2P
 	  [(match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%x,x")
@@ -1997,7 +1963,198 @@ 
   "TARGET_FMA4"
   "vfmsubaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "V8SF")])
+   (set_attr "mode" "<MODE>")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; FMA3 floating point multiply/accumulate instructions.
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "*fma_fmadd_<mode>"	;; op0 = (op1 * op2) + op3, fused
+  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x")
+	(fma:FMAMODE
+	  (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x")
+	  (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm")
+	  (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0")))]
+  "TARGET_FMA"	;; alternatives: dest tied to op1 (132, 213) or to op3 (231)
+  "@
+   vfmadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
+   vfmadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vfmadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fma_fmsub_<mode>"	;; op0 = (op1 * op2) - op3, fused
+  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x")
+	(fma:FMAMODE
+	  (match_operand:FMAMODE   1 "nonimmediate_operand" "%0, 0,x")
+	  (match_operand:FMAMODE   2 "nonimmediate_operand" "xm, x,xm")
+	  (neg:FMAMODE
+	    (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0"))))]
+  "TARGET_FMA"	;; alternatives: dest tied to op1 (132, 213) or to op3 (231)
+  "@
+   vfmsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
+   vfmsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vfmsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fma_fnmadd_<mode>"	;; op0 = (-op1 * op2) + op3, fused
+  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x")
+	(fma:FMAMODE
+	  (neg:FMAMODE
+	    (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x"))
+	  (match_operand:FMAMODE   2 "nonimmediate_operand" "xm, x,xm")
+	  (match_operand:FMAMODE   3 "nonimmediate_operand" " x,xm,0")))]
+  "TARGET_FMA"	;; alternatives: dest tied to op1 (132, 213) or to op3 (231)
+  "@
+   vfnmadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
+   vfnmadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vfnmadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fma_fnmsub_<mode>"	;; op0 = (-op1 * op2) - op3, fused
+  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x")
+	(fma:FMAMODE
+	  (neg:FMAMODE
+	    (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x"))
+	  (match_operand:FMAMODE   2 "nonimmediate_operand" "xm, x,xm")
+	  (neg:FMAMODE
+	    (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0"))))]
+  "TARGET_FMA"	;; alternatives: dest tied to op1 (132, 213) or to op3 (231)
+  "@
+   vfnmsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
+   vfnmsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vfnmsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fma_fmaddsub_<mode>"
+  [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x,x,x")
+	(unspec:AVXMODEF2P
+	  [(match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%0, 0,x")
+	   (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm, x,xm")
+	   (match_operand:AVXMODEF2P 3 "nonimmediate_operand" " x,xm,0")]
+	  UNSPEC_FMADDSUB))]
+  "TARGET_FMA"
+  "@
+   vfmaddsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
+   vfmaddsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vfmaddsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fma_fmsubadd_<mode>"
+  [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x,x,x")
+	(unspec:AVXMODEF2P
+	  [(match_operand:AVXMODEF2P   1 "nonimmediate_operand" "%0, 0,x")
+	   (match_operand:AVXMODEF2P   2 "nonimmediate_operand" "xm, x,xm")
+	   (neg:AVXMODEF2P
+	     (match_operand:AVXMODEF2P 3 "nonimmediate_operand" " x,xm,0"))]
+	  UNSPEC_FMADDSUB))]
+  "TARGET_FMA"
+  "@
+   vfmsubadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
+   vfmsubadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vfmsubadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Non-intrinsic versions, matched when fused-multiply-add is allowed.
+;;
+;; ??? If fused-madd were a generic flag, combine could do this without
+;; needing splitters here in the backend.  Irritatingly, combine won't
+;; recognize many of these with mere splits, since only 3 or more insns
+;; are allowed to split during combine.  Thankfully, there's always a
+;; split_all_insns pass that runs before reload.
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_and_split "*split_fma"
+  [(set (match_operand:FMAMODE 0 "register_operand")
+	(plus:FMAMODE
+	  (mult:FMAMODE
+	    (match_operand:FMAMODE 1 "nonimmediate_operand")
+	    (match_operand:FMAMODE 2 "nonimmediate_operand"))
+	  (match_operand:FMAMODE 3 "nonimmediate_operand")))]
+  "TARGET_SSE_MATH && TARGET_FUSED_MADD
+   && (TARGET_FMA || TARGET_FMA4)
+   && !(reload_in_progress || reload_completed)"
+  { gcc_unreachable (); }
+  "&& 1"
+  [(set (match_dup 0)
+	(fma:FMAMODE
+	  (match_dup 1)
+	  (match_dup 2)
+	  (match_dup 3)))]
+  "")
+
+;; Floating multiply and subtract.
+(define_insn_and_split "*split_fms"
+  [(set (match_operand:FMAMODE 0 "register_operand")
+	(minus:FMAMODE
+	  (mult:FMAMODE
+	    (match_operand:FMAMODE 1 "nonimmediate_operand")
+	    (match_operand:FMAMODE 2 "nonimmediate_operand"))
+	  (match_operand:FMAMODE 3 "nonimmediate_operand")))]
+  "TARGET_SSE_MATH && TARGET_FUSED_MADD
+   && (TARGET_FMA || TARGET_FMA4)
+   && !(reload_in_progress || reload_completed)"
+  { gcc_unreachable (); }
+  "&& 1"
+  [(set (match_dup 0)
+	(fma:FMAMODE
+	  (match_dup 1)
+	  (match_dup 2)
+	  (neg:FMAMODE (match_dup 3))))]
+  "")
+
+;; Floating point negative multiply and add.
+;; Recognize (-a * b + c) via the canonical form: c - (a * b).
+(define_insn_and_split "*split_fnma"
+  [(set (match_operand:FMAMODE 0 "register_operand")
+	(minus:FMAMODE
+	 (match_operand:FMAMODE 3 "nonimmediate_operand")
+	 (mult:FMAMODE
+	  (match_operand:FMAMODE 1 "nonimmediate_operand")
+	  (match_operand:FMAMODE 2 "nonimmediate_operand"))))]
+  "TARGET_SSE_MATH && TARGET_FUSED_MADD
+   && (TARGET_FMA || TARGET_FMA4)
+   && !(reload_in_progress || reload_completed)"
+  { gcc_unreachable (); }
+  "&& 1"
+  [(set (match_dup 0)
+	(fma:FMAMODE
+	  (neg:FMAMODE (match_dup 1))
+	  (match_dup 2)
+	  (match_dup 3)))]
+  "")
+
+;; Floating point negative multiply and subtract.
+;; Recognize (-(a * b) - c) via the canonical form: (-a * b) - c.
+(define_insn_and_split "*split_fnms"
+  [(set (match_operand:FMAMODE 0 "register_operand")
+	(minus:FMAMODE
+	  (mult:FMAMODE
+	    (neg:FMAMODE
+	      (match_operand:FMAMODE 1 "nonimmediate_operand"))
+	    (match_operand:FMAMODE 2 "nonimmediate_operand"))
+	 (match_operand:FMAMODE 3 "nonimmediate_operand")))]
+  "TARGET_SSE_MATH && TARGET_FUSED_MADD
+   && (TARGET_FMA || TARGET_FMA4)
+   && !(reload_in_progress || reload_completed)"
+  { gcc_unreachable (); }
+  "&& 1"
+  [(set (match_dup 0)
+	(fma:FMAMODE
+	  (neg:FMAMODE (match_dup 1))
+	  (match_dup 2)
+	  (neg:FMAMODE (match_dup 3))))]
+  "")
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 3c6a106..949a391 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@ 
+2010-10-19  Richard Henderson  <rth@redhat.com>
+
+	* gcc.target/i386/fma3-fma.c: New.
+	* gcc.target/i386/fma3-builtin.c: New.
+	* gcc.target/i386/fma4-builtin.c: New.
+
 2010-10-19  Richard Guenther  <rguenther@suse.de>
 
 	PR testsuite/46081
diff --git a/gcc/testsuite/gcc.target/i386/fma3-builtin.c b/gcc/testsuite/gcc.target/i386/fma3-builtin.c
new file mode 100644
index 0000000..ba8af55
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma3-builtin.c
@@ -0,0 +1,82 @@ 
+/* Test that the compiler properly generates floating point multiply
+   and add instructions on FMA3 systems.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -mfma -mno-fma4" } */
+
+#ifndef __FP_FAST_FMAF
+# error "__FP_FAST_FMAF should be defined"
+#endif
+#ifndef __FP_FAST_FMA
+# error "__FP_FAST_FMA should be defined"
+#endif
+
+float
+flt_mul_add (float a, float b, float c)
+{
+  return __builtin_fmaf (a, b, c);
+}
+
+double
+dbl_mul_add (double a, double b, double c)
+{
+  return __builtin_fma (a, b, c);
+}
+
+float
+flt_mul_sub (float a, float b, float c)
+{
+  return __builtin_fmaf (a, b, -c);
+}
+
+double
+dbl_mul_sub (double a, double b, double c)
+{
+  return __builtin_fma (a, b, -c);
+}
+
+float
+flt_neg_mul_add_1 (float a, float b, float c)
+{
+  return __builtin_fmaf (-a, b, c);
+}
+
+double
+dbl_neg_mul_add_1 (double a, double b, double c)
+{
+  return __builtin_fma (-a, b, c);
+}
+
+float
+flt_neg_mul_add_2 (float a, float b, float c)
+{
+  return __builtin_fmaf (a, -b, c);
+}
+
+double
+dbl_neg_mul_add_2 (double a, double b, double c)
+{
+  return __builtin_fma (a, -b, c);
+}
+
+float
+flt_neg_mul_sub (float a, float b, float c)
+{
+  return __builtin_fmaf (-a, b, -c);
+}
+
+double
+dbl_neg_mul_sub (double a, double b, double c)
+{
+  return __builtin_fma (-a, b, -c);
+}
+
+/* { dg-final { scan-assembler-times "vfmadd...ss" 1 } } */
+/* { dg-final { scan-assembler-times "vfmadd...sd" 1 } } */
+/* { dg-final { scan-assembler-times "vfmsub...ss" 1 } } */
+/* { dg-final { scan-assembler-times "vfmsub...sd" 1 } } */
+/* { dg-final { scan-assembler-times "vfnmadd...ss" 2 } } */
+/* { dg-final { scan-assembler-times "vfnmadd...sd" 2 } } */
+/* { dg-final { scan-assembler-times "vfnmsub...ss" 1 } } */
+/* { dg-final { scan-assembler-times "vfnmsub...sd" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/fma3-fma.c b/gcc/testsuite/gcc.target/i386/fma3-fma.c
new file mode 100644
index 0000000..1cedba8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma3-fma.c
@@ -0,0 +1,82 @@ 
+/* Test that the compiler properly optimizes floating point multiply
+   and add instructions on FMA3 systems.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -mfma -mno-fma4" } */
+
+extern void exit (int);
+
+float
+flt_mul_add (float a, float b, float c)
+{
+  return (a * b) + c;
+}
+
+double
+dbl_mul_add (double a, double b, double c)
+{
+  return (a * b) + c;
+}
+
+float
+flt_mul_sub (float a, float b, float c)
+{
+  return (a * b) - c;
+}
+
+double
+dbl_mul_sub (double a, double b, double c)
+{
+  return (a * b) - c;
+}
+
+float
+flt_neg_mul_add (float a, float b, float c)
+{
+  return (-(a * b)) + c;
+}
+
+double
+dbl_neg_mul_add (double a, double b, double c)
+{
+  return (-(a * b)) + c;
+}
+
+float
+flt_neg_mul_sub (float a, float b, float c)
+{
+  return (-(a * b)) - c;
+}
+
+double
+dbl_neg_mul_sub (double a, double b, double c)
+{
+  return (-(a * b)) - c;
+}
+
+float  f[10] = { 2, 3, 4 };
+double d[10] = { 2, 3, 4 };
+
+int main ()
+{
+  f[3] = flt_mul_add (f[0], f[1], f[2]);
+  f[4] = flt_mul_sub (f[0], f[1], f[2]);
+  f[5] = flt_neg_mul_add (f[0], f[1], f[2]);
+  f[6] = flt_neg_mul_sub (f[0], f[1], f[2]);
+
+  d[3] = dbl_mul_add (d[0], d[1], d[2]);
+  d[4] = dbl_mul_sub (d[0], d[1], d[2]);
+  d[5] = dbl_neg_mul_add (d[0], d[1], d[2]);
+  d[6] = dbl_neg_mul_sub (d[0], d[1], d[2]);
+  exit (0);
+}
+
+/* { dg-final { scan-assembler "vfmadd...ss" } } */
+/* { dg-final { scan-assembler "vfmadd...sd" } } */
+/* { dg-final { scan-assembler "vfmsub...ss" } } */
+/* { dg-final { scan-assembler "vfmsub...sd" } } */
+/* { dg-final { scan-assembler "vfnmadd...ss" } } */
+/* { dg-final { scan-assembler "vfnmadd...sd" } } */
+/* { dg-final { scan-assembler "vfnmsub...ss" } } */
+/* { dg-final { scan-assembler "vfnmsub...sd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/fma4-builtin.c b/gcc/testsuite/gcc.target/i386/fma4-builtin.c
new file mode 100644
index 0000000..5659cf4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma4-builtin.c
@@ -0,0 +1,82 @@ 
+/* Test that the compiler properly generates floating point multiply
+   and add instructions on FMA4 systems.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -mfma4" } */
+
+#ifndef __FP_FAST_FMAF
+# error "__FP_FAST_FMAF should be defined"
+#endif
+#ifndef __FP_FAST_FMA
+# error "__FP_FAST_FMA should be defined"
+#endif
+
+float
+flt_mul_add (float a, float b, float c)
+{
+  return __builtin_fmaf (a, b, c);
+}
+
+double
+dbl_mul_add (double a, double b, double c)
+{
+  return __builtin_fma (a, b, c);
+}
+
+float
+flt_mul_sub (float a, float b, float c)
+{
+  return __builtin_fmaf (a, b, -c);
+}
+
+double
+dbl_mul_sub (double a, double b, double c)
+{
+  return __builtin_fma (a, b, -c);
+}
+
+float
+flt_neg_mul_add_1 (float a, float b, float c)
+{
+  return __builtin_fmaf (-a, b, c);
+}
+
+double
+dbl_neg_mul_add_1 (double a, double b, double c)
+{
+  return __builtin_fma (-a, b, c);
+}
+
+float
+flt_neg_mul_add_2 (float a, float b, float c)
+{
+  return __builtin_fmaf (a, -b, c);
+}
+
+double
+dbl_neg_mul_add_2 (double a, double b, double c)
+{
+  return __builtin_fma (a, -b, c);
+}
+
+float
+flt_neg_mul_sub (float a, float b, float c)
+{
+  return __builtin_fmaf (-a, b, -c);
+}
+
+double
+dbl_neg_mul_sub (double a, double b, double c)
+{
+  return __builtin_fma (-a, b, -c);
+}
+
+/* { dg-final { scan-assembler-times "vfmaddss" 1 } } */
+/* { dg-final { scan-assembler-times "vfmaddsd" 1 } } */
+/* { dg-final { scan-assembler-times "vfmsubss" 1 } } */
+/* { dg-final { scan-assembler-times "vfmsubsd" 1 } } */
+/* { dg-final { scan-assembler-times "vfnmaddss" 2 } } */
+/* { dg-final { scan-assembler-times "vfnmaddsd" 2 } } */
+/* { dg-final { scan-assembler-times "vfnmsubss" 1 } } */
+/* { dg-final { scan-assembler-times "vfnmsubsd" 1 } } */