Patchwork [RFA/ARM,3/3] Add support for vfma* and vfms* Neon intrinsics

login
register
mail settings
Submitter Matthew Gretton-Dann
Date June 25, 2012, 2:59 p.m.
Message ID <4FE87CBD.7090706@arm.com>
Download mbox | patch
Permalink /patch/167135/
State New
Headers show

Comments

Matthew Gretton-Dann - June 25, 2012, 2:59 p.m.
All,

This commit adds support for the vmfa* and vfms* Neon intrinsics.

This updates neon.ml, and the various generation tools which use it,
arm_neon.h, the testsuite and documentation.

The documentation has not been regenerated for a while and so the
changes are larger than expected.

OK?

gcc/ChangeLog:

2012-06-25  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>

	* config/arm/arm.c (neon_builtin_data): Add vfma and vfms
	builtins.
	* config/arm/neon-docgen.ml (intrinsic_groups): Add
	fused-multiply-* groups.
	* config/neon-gen.ml (print_feature_test_start): New function.
	(print_feature_test_end): Likewise.
	(print_variant): Print feature test macros.
	* config/arm/neon-testgen.ml (emit_prologue): Allow different
	tests to require different effective targets.
	(effective_target): New function.
	(test_intrinsic): Specify correct effective targets.
	* gcc/config/arm/neon.md (*fmsub<mode>4): Rename...
	(fmsub<mode>4): ...to this.
	(neon_vfma<mode>): New expand.
	(neon_vfms<mode>): Likewise.
	* config/neon.ml (opcode): Add Vfma and Vfms.
	(features): Add Requires_feature.
	(ops): Add VFMA and VFMS intrinsics.
	* config/arm/arm_neon.h: Regenerate.
	* doc/arm-neon-intrinsics.texi: Likewise.

gcc/testsuite/ChangeLog:

2012-06-25  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>

	* gcc.target/arm/neon/vfmaQf32.c: New testcase.
	* gcc.target/arm/neon/vfmaf32.c: Likewise.
	* gcc.target/arm/neon/vfmsQf32.c: Likewise.
	* gcc.target/arm/neon/vfmsf32.c: Likewise.

Thanks,

Matt

Patch

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index cba98f9..0b8b41e 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -18919,6 +18919,8 @@  static neon_builtin_datum neon_builtin_data[] =
   VAR8 (BINOP, vmul, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
   VAR8 (TERNOP, vmla, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
   VAR3 (TERNOP, vmlal, v8qi, v4hi, v2si),
+  VAR2 (TERNOP, vfma, v2sf, v4sf),
+  VAR2 (TERNOP, vfms, v2sf, v4sf),
   VAR8 (TERNOP, vmls, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
   VAR3 (TERNOP, vmlsl, v8qi, v4hi, v2si),
   VAR4 (BINOP, vqdmulh, v4hi, v2si, v8hi, v4si),
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 0567895..3afe3b0 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -1350,6 +1350,38 @@  vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
   return (int64x2_t)__builtin_neon_vqdmlslv2si (__a, __b, __c, 1);
 }
 
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+  return (float32x2_t)__builtin_neon_vfmav2sf (__a, __b, __c, 3);
+}
+
+#endif
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+  return (float32x4_t)__builtin_neon_vfmav4sf (__a, __b, __c, 3);
+}
+
+#endif
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+  return (float32x2_t)__builtin_neon_vfmsv2sf (__a, __b, __c, 3);
+}
+
+#endif
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+  return (float32x4_t)__builtin_neon_vfmsv4sf (__a, __b, __c, 3);
+}
+
+#endif
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vsub_s8 (int8x8_t __a, int8x8_t __b)
 {
diff --git a/gcc/config/arm/neon-docgen.ml b/gcc/config/arm/neon-docgen.ml
index 23e37b4..043b1e0 100644
--- a/gcc/config/arm/neon-docgen.ml
+++ b/gcc/config/arm/neon-docgen.ml
@@ -103,6 +103,8 @@  let intrinsic_groups =
     "Multiplication", single_opcode Vmul;
     "Multiply-accumulate", single_opcode Vmla;
     "Multiply-subtract", single_opcode Vmls;
+    "Fused-multiply-accumulate", single_opcode Vfma;
+    "Fused-multiply-subtract", single_opcode Vfms;
     "Subtraction", single_opcode Vsub;
     "Comparison (equal-to)", single_opcode Vceq;
     "Comparison (greater-than-or-equal-to)", single_opcode Vcge;
diff --git a/gcc/config/arm/neon-gen.ml b/gcc/config/arm/neon-gen.ml
index 112c8be..0297597 100644
--- a/gcc/config/arm/neon-gen.ml
+++ b/gcc/config/arm/neon-gen.ml
@@ -239,6 +239,24 @@  let rec mode_suffix elttype shape =
     and srcmode = mode_of_elt src shape in
     string_of_mode dstmode ^ string_of_mode srcmode
 
+let print_feature_test_start features =
+  try
+    match List.find (fun feature ->
+                       match feature with Requires_feature _ -> true
+                                        | _ -> false)
+                     features with
+      Requires_feature feature -> 
+        Format.printf "#ifdef __ARM_FEATURE_%s@\n" feature
+    | _ -> assert false
+  with Not_found -> assert true
+
+let print_feature_test_end features =
+  let feature =
+    List.exists (function Requires_feature x -> true
+                                        |  _ -> false) features in
+  if feature then Format.printf "#endif@\n"
+
+
 let print_variant opcode features shape name (ctype, asmtype, elttype) =
   let bits = infoword_value elttype features in
   let modesuf = mode_suffix elttype shape in
@@ -252,7 +270,11 @@  let print_variant opcode features shape name (ctype, asmtype, elttype) =
   let rdecls, stmts = return ctype return_by_ptr builtin in
   let body = pdecls @ rdecls @ stmts
   and fnname = (intrinsic_name name) ^ "_" ^ (string_of_elt elttype) in
-  print_function ctype fnname body
+  begin
+    print_feature_test_start features;
+    print_function ctype fnname body;
+    print_feature_test_end features;
+  end
 
 (* When this function processes the element types in the ops table, it rewrites
    them in a list of tuples (a,b,c):
diff --git a/gcc/config/arm/neon-testgen.ml b/gcc/config/arm/neon-testgen.ml
index a69a539..4645f39 100644
--- a/gcc/config/arm/neon-testgen.ml
+++ b/gcc/config/arm/neon-testgen.ml
@@ -46,13 +46,14 @@  let open_test_file dir name =
     failwith ("Could not create test source file " ^ name ^ ": " ^ str)
 
 (* Emit prologue code to a test source file.  *)
-let emit_prologue chan test_name =
+let emit_prologue chan test_name effective_target =
   Printf.fprintf chan "/* Test the `%s' ARM Neon intrinsic.  */\n" test_name;
   Printf.fprintf chan "/* This file was autogenerated by neon-testgen.  */\n\n";
   Printf.fprintf chan "/* { dg-do assemble } */\n";
-  Printf.fprintf chan "/* { dg-require-effective-target arm_neon_ok } */\n";
+  Printf.fprintf chan "/* { dg-require-effective-target %s_ok } */\n"
+                 effective_target;
   Printf.fprintf chan "/* { dg-options \"-save-temps -O0\" } */\n";
-  Printf.fprintf chan "/* { dg-add-options arm_neon } */\n";
+  Printf.fprintf chan "/* { dg-add-options %s } */\n" effective_target;
   Printf.fprintf chan "\n#include \"arm_neon.h\"\n\n";
   Printf.fprintf chan "void test_%s (void)\n{\n" test_name
 
@@ -156,6 +157,17 @@  let check_types tys =
                 then (Const :: flags, String.sub ty 6 ((String.length ty) - 6))
                 else (flags, ty)) tys'
 
+(* Work out what the effective target should be.  *)
+let effective_target features =
+  try
+    match List.find (fun feature ->
+                       match feature with Requires_feature _ -> true
+                                        | _ -> false)
+                     features with
+      Requires_feature "FMA" -> "arm_neonv2"
+    | _ -> assert false
+  with Not_found -> "arm_neon"
+
 (* Given an intrinsic shape, produce a regexp that will match
    the right-hand sides of instructions generated by an intrinsic of
    that shape.  *)
@@ -263,8 +275,10 @@  let test_intrinsic dir opcode features shape name munge elt_ty =
 			  "!?\\(\\[ \t\\]+@\\[a-zA-Z0-9 \\]+\\)?\\n")
                          (analyze_all_shapes features shape analyze_shape)
   in
+  let effective_target = effective_target features
+  in
     (* Emit file and function prologues.  *)
-    emit_prologue chan test_name;
+    emit_prologue chan test_name effective_target;
     (* Emit local variable declarations.  *)
     emit_automatics chan c_types features;
     Printf.fprintf chan "\n";
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 4d12fb3..64785da 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -725,7 +725,7 @@ 
 		      (const_string "neon_fp_vmla_qqq")))]
 )
 
-(define_insn "*fmsub<mode>4"
+(define_insn "fmsub<mode>4"
   [(set (match_operand:VCVTF 0 "register_operand" "=w")
         (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w"))
 		   (match_operand:VCVTF 2 "register_operand" "w")
@@ -1915,6 +1915,32 @@ 
   DONE;
 })
 
+(define_expand "neon_vfma<mode>"
+  [(match_operand:VCVTF 0 "s_register_operand")
+   (match_operand:VCVTF 1 "s_register_operand")
+   (match_operand:VCVTF 2 "s_register_operand")
+   (match_operand:VCVTF 3 "s_register_operand")
+   (match_operand:SI 4 "immediate_operand")]
+  "TARGET_NEON && TARGET_FMA"
+{
+  emit_insn (gen_fma<mode>4 (operands[0], operands[2], operands[3], 
+					 operands[1]));
+  DONE;
+})
+
+(define_expand "neon_vfms<mode>"
+  [(match_operand:VCVTF 0 "s_register_operand")
+   (match_operand:VCVTF 1 "s_register_operand")
+   (match_operand:VCVTF 2 "s_register_operand")
+   (match_operand:VCVTF 3 "s_register_operand")
+   (match_operand:SI 4 "immediate_operand")]
+  "TARGET_NEON && TARGET_FMA"
+{
+  emit_insn (gen_fmsub<mode>4 (operands[0], operands[2], operands[3], 
+					 operands[1]));
+  DONE;
+})
+
 ; Used for intrinsics when flag_unsafe_math_optimizations is false.
 
 (define_insn "neon_vmla<mode>_unspec"
diff --git a/gcc/config/arm/neon.ml b/gcc/config/arm/neon.ml
index 6774688..cc0e8a6 100644
--- a/gcc/config/arm/neon.ml
+++ b/gcc/config/arm/neon.ml
@@ -102,6 +102,8 @@  type opcode =
   | Vmul
   | Vmla
   | Vmls
+  | Vfma
+  | Vfms
   | Vsub
   | Vceq
   | Vcge
@@ -236,6 +238,8 @@  type features =
   | Const_valuator of (int -> int)
   | Fixed_vector_reg
   | Fixed_core_reg
+    (* Mark that the intrinsic requires __ARM_FEATURE_string to be defined.  *)
+  | Requires_feature of string
 
 exception MixedMode of elts * elts
 
@@ -761,6 +765,12 @@  let ops =
     Vmls, [], Long, "vmlsl", elts_same_io, su_8_32;
     Vmls, [Saturating; Doubling], Long, "vqdmlsl", elts_same_io, [S16; S32];
 
+    (* Fused-multiply-accumulate. *)
+    Vfma, [Requires_feature "FMA"], All (3, Dreg), "vfma", elts_same_io, [F32];
+    Vfma, [Requires_feature "FMA"], All (3, Qreg), "vfmaQ", elts_same_io, [F32];
+    Vfms, [Requires_feature "FMA"], All (3, Dreg), "vfms", elts_same_io, [F32];
+    Vfms, [Requires_feature "FMA"], All (3, Qreg), "vfmsQ", elts_same_io, [F32];
+
     (* Subtraction.  *)
     Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_32;
     Vsub, [No_op], All (3, Dreg), "vsub", sign_invar_2,  [S64; U64];
diff --git a/gcc/doc/arm-neon-intrinsics.texi b/gcc/doc/arm-neon-intrinsics.texi
index a75e582..14e6264 100644
--- a/gcc/doc/arm-neon-intrinsics.texi
+++ b/gcc/doc/arm-neon-intrinsics.texi
@@ -972,6 +972,38 @@ 
 
 
 
+@subsubsection Fused-multiply-accumulate
+
+@itemize @bullet
+@item float32x2_t vfma_f32 (float32x2_t, float32x2_t, float32x2_t)
+@*@emph{Form of expected instruction(s):} @code{vfma.f32 @var{d0}, @var{d0}, @var{d0}}
+@end itemize
+
+
+@itemize @bullet
+@item float32x4_t vfmaq_f32 (float32x4_t, float32x4_t, float32x4_t)
+@*@emph{Form of expected instruction(s):} @code{vfma.f32 @var{q0}, @var{q0}, @var{q0}}
+@end itemize
+
+
+
+
+@subsubsection Fused-multiply-subtract
+
+@itemize @bullet
+@item float32x2_t vfms_f32 (float32x2_t, float32x2_t, float32x2_t)
+@*@emph{Form of expected instruction(s):} @code{vfms.f32 @var{d0}, @var{d0}, @var{d0}}
+@end itemize
+
+
+@itemize @bullet
+@item float32x4_t vfmsq_f32 (float32x4_t, float32x4_t, float32x4_t)
+@*@emph{Form of expected instruction(s):} @code{vfms.f32 @var{q0}, @var{q0}, @var{q0}}
+@end itemize
+
+
+
+
 @subsubsection Subtraction
 
 @itemize @bullet
@@ -1497,24 +1529,6 @@ 
 @subsubsection Comparison (greater-than-or-equal-to)
 
 @itemize @bullet
-@item uint32x2_t vcge_u32 (uint32x2_t, uint32x2_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{d0}, @var{d0}, @var{d0}}
-@end itemize
-
-
-@itemize @bullet
-@item uint16x4_t vcge_u16 (uint16x4_t, uint16x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{d0}, @var{d0}, @var{d0}}
-@end itemize
-
-
-@itemize @bullet
-@item uint8x8_t vcge_u8 (uint8x8_t, uint8x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{d0}, @var{d0}, @var{d0}}
-@end itemize
-
-
-@itemize @bullet
 @item uint32x2_t vcge_s32 (int32x2_t, int32x2_t)
 @*@emph{Form of expected instruction(s):} @code{vcge.s32 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
@@ -1539,20 +1553,20 @@ 
 
 
 @itemize @bullet
-@item uint32x4_t vcgeq_u32 (uint32x4_t, uint32x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{q0}, @var{q0}, @var{q0}}
+@item uint32x2_t vcge_u32 (uint32x2_t, uint32x2_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
 
 
 @itemize @bullet
-@item uint16x8_t vcgeq_u16 (uint16x8_t, uint16x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{q0}, @var{q0}, @var{q0}}
+@item uint16x4_t vcge_u16 (uint16x4_t, uint16x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
 
 
 @itemize @bullet
-@item uint8x16_t vcgeq_u8 (uint8x16_t, uint8x16_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{q0}, @var{q0}, @var{q0}}
+@item uint8x8_t vcge_u8 (uint8x8_t, uint8x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
 
 
@@ -1580,28 +1594,28 @@ 
 @end itemize
 
 
-
-
-@subsubsection Comparison (less-than-or-equal-to)
-
 @itemize @bullet
-@item uint32x2_t vcle_u32 (uint32x2_t, uint32x2_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{d0}, @var{d0}, @var{d0}}
+@item uint32x4_t vcgeq_u32 (uint32x4_t, uint32x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{q0}, @var{q0}, @var{q0}}
 @end itemize
 
 
 @itemize @bullet
-@item uint16x4_t vcle_u16 (uint16x4_t, uint16x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{d0}, @var{d0}, @var{d0}}
+@item uint16x8_t vcgeq_u16 (uint16x8_t, uint16x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{q0}, @var{q0}, @var{q0}}
 @end itemize
 
 
 @itemize @bullet
-@item uint8x8_t vcle_u8 (uint8x8_t, uint8x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{d0}, @var{d0}, @var{d0}}
+@item uint8x16_t vcgeq_u8 (uint8x16_t, uint8x16_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{q0}, @var{q0}, @var{q0}}
 @end itemize
 
 
+
+
+@subsubsection Comparison (less-than-or-equal-to)
+
 @itemize @bullet
 @item uint32x2_t vcle_s32 (int32x2_t, int32x2_t)
 @*@emph{Form of expected instruction(s):} @code{vcge.s32 @var{d0}, @var{d0}, @var{d0}}
@@ -1627,20 +1641,20 @@ 
 
 
 @itemize @bullet
-@item uint32x4_t vcleq_u32 (uint32x4_t, uint32x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{q0}, @var{q0}, @var{q0}}
+@item uint32x2_t vcle_u32 (uint32x2_t, uint32x2_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
 
 
 @itemize @bullet
-@item uint16x8_t vcleq_u16 (uint16x8_t, uint16x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{q0}, @var{q0}, @var{q0}}
+@item uint16x4_t vcle_u16 (uint16x4_t, uint16x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
 
 
 @itemize @bullet
-@item uint8x16_t vcleq_u8 (uint8x16_t, uint8x16_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{q0}, @var{q0}, @var{q0}}
+@item uint8x8_t vcle_u8 (uint8x8_t, uint8x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
 
 
@@ -1668,28 +1682,28 @@ 
 @end itemize
 
 
-
-
-@subsubsection Comparison (greater-than)
-
 @itemize @bullet
-@item uint32x2_t vcgt_u32 (uint32x2_t, uint32x2_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{d0}, @var{d0}, @var{d0}}
+@item uint32x4_t vcleq_u32 (uint32x4_t, uint32x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{q0}, @var{q0}, @var{q0}}
 @end itemize
 
 
 @itemize @bullet
-@item uint16x4_t vcgt_u16 (uint16x4_t, uint16x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{d0}, @var{d0}, @var{d0}}
+@item uint16x8_t vcleq_u16 (uint16x8_t, uint16x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{q0}, @var{q0}, @var{q0}}
 @end itemize
 
 
 @itemize @bullet
-@item uint8x8_t vcgt_u8 (uint8x8_t, uint8x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{d0}, @var{d0}, @var{d0}}
+@item uint8x16_t vcleq_u8 (uint8x16_t, uint8x16_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{q0}, @var{q0}, @var{q0}}
 @end itemize
 
 
+
+
+@subsubsection Comparison (greater-than)
+
 @itemize @bullet
 @item uint32x2_t vcgt_s32 (int32x2_t, int32x2_t)
 @*@emph{Form of expected instruction(s):} @code{vcgt.s32 @var{d0}, @var{d0}, @var{d0}}
@@ -1715,20 +1729,20 @@ 
 
 
 @itemize @bullet
-@item uint32x4_t vcgtq_u32 (uint32x4_t, uint32x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{q0}, @var{q0}, @var{q0}}
+@item uint32x2_t vcgt_u32 (uint32x2_t, uint32x2_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
 
 
 @itemize @bullet
-@item uint16x8_t vcgtq_u16 (uint16x8_t, uint16x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{q0}, @var{q0}, @var{q0}}
+@item uint16x4_t vcgt_u16 (uint16x4_t, uint16x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
 
 
 @itemize @bullet
-@item uint8x16_t vcgtq_u8 (uint8x16_t, uint8x16_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{q0}, @var{q0}, @var{q0}}
+@item uint8x8_t vcgt_u8 (uint8x8_t, uint8x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
 
 
@@ -1756,28 +1770,28 @@ 
 @end itemize
 
 
-
-
-@subsubsection Comparison (less-than)
-
 @itemize @bullet
-@item uint32x2_t vclt_u32 (uint32x2_t, uint32x2_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{d0}, @var{d0}, @var{d0}}
+@item uint32x4_t vcgtq_u32 (uint32x4_t, uint32x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{q0}, @var{q0}, @var{q0}}
 @end itemize
 
 
 @itemize @bullet
-@item uint16x4_t vclt_u16 (uint16x4_t, uint16x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{d0}, @var{d0}, @var{d0}}
+@item uint16x8_t vcgtq_u16 (uint16x8_t, uint16x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{q0}, @var{q0}, @var{q0}}
 @end itemize
 
 
 @itemize @bullet
-@item uint8x8_t vclt_u8 (uint8x8_t, uint8x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{d0}, @var{d0}, @var{d0}}
+@item uint8x16_t vcgtq_u8 (uint8x16_t, uint8x16_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{q0}, @var{q0}, @var{q0}}
 @end itemize
 
 
+
+
+@subsubsection Comparison (less-than)
+
 @itemize @bullet
 @item uint32x2_t vclt_s32 (int32x2_t, int32x2_t)
 @*@emph{Form of expected instruction(s):} @code{vcgt.s32 @var{d0}, @var{d0}, @var{d0}}
@@ -1803,20 +1817,20 @@ 
 
 
 @itemize @bullet
-@item uint32x4_t vcltq_u32 (uint32x4_t, uint32x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{q0}, @var{q0}, @var{q0}}
+@item uint32x2_t vclt_u32 (uint32x2_t, uint32x2_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
 
 
 @itemize @bullet
-@item uint16x8_t vcltq_u16 (uint16x8_t, uint16x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{q0}, @var{q0}, @var{q0}}
+@item uint16x4_t vclt_u16 (uint16x4_t, uint16x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
 
 
 @itemize @bullet
-@item uint8x16_t vcltq_u8 (uint8x16_t, uint8x16_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{q0}, @var{q0}, @var{q0}}
+@item uint8x8_t vclt_u8 (uint8x8_t, uint8x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{d0}, @var{d0}, @var{d0}}
 @end itemize
 
 
@@ -1844,6 +1858,24 @@ 
 @end itemize
 
 
+@itemize @bullet
+@item uint32x4_t vcltq_u32 (uint32x4_t, uint32x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{q0}, @var{q0}, @var{q0}}
+@end itemize
+
+
+@itemize @bullet
+@item uint16x8_t vcltq_u16 (uint16x8_t, uint16x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{q0}, @var{q0}, @var{q0}}
+@end itemize
+
+
+@itemize @bullet
+@item uint8x16_t vcltq_u8 (uint8x16_t, uint8x16_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{q0}, @var{q0}, @var{q0}}
+@end itemize
+
+
 
 
 @subsubsection Comparison (absolute greater-than-or-equal-to)
@@ -4810,13 +4842,13 @@ 
 
 @itemize @bullet
 @item uint64_t vgetq_lane_u64 (uint64x2_t, const int)
-@*@emph{Form of expected instruction(s):} @code{vmov @var{r0}, @var{r0}, @var{d0}}
+@*@emph{Form of expected instruction(s):} @code{vmov @var{r0}, @var{r0}, @var{d0}} @emph{or} @code{fmrrd @var{r0}, @var{r0}, @var{d0}}
 @end itemize
 
 
 @itemize @bullet
 @item int64_t vgetq_lane_s64 (int64x2_t, const int)
-@*@emph{Form of expected instruction(s):} @code{vmov @var{r0}, @var{r0}, @var{d0}}
+@*@emph{Form of expected instruction(s):} @code{vmov @var{r0}, @var{r0}, @var{d0}} @emph{or} @code{fmrrd @var{r0}, @var{r0}, @var{d0}}
 @end itemize
 
 
diff --git a/gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c b/gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c
new file mode 100644
index 0000000..d400163
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c
@@ -0,0 +1,22 @@ 
+/* Test the `vfmaQf32' ARM Neon intrinsic.  */
+/* This file was autogenerated by neon-testgen.  */
+
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+
+#include "arm_neon.h"
+
+void test_vfmaQf32 (void)
+{
+  float32x4_t out_float32x4_t;
+  float32x4_t arg0_float32x4_t;
+  float32x4_t arg1_float32x4_t;
+  float32x4_t arg2_float32x4_t;
+
+  out_float32x4_t = vfmaq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t);
+}
+
+/* { dg-final { scan-assembler "vfma\.f32\[ 	\]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon/vfmaf32.c b/gcc/testsuite/gcc.target/arm/neon/vfmaf32.c
new file mode 100644
index 0000000..988328d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon/vfmaf32.c
@@ -0,0 +1,22 @@ 
+/* Test the `vfmaf32' ARM Neon intrinsic.  */
+/* This file was autogenerated by neon-testgen.  */
+
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+
+#include "arm_neon.h"
+
+void test_vfmaf32 (void)
+{
+  float32x2_t out_float32x2_t;
+  float32x2_t arg0_float32x2_t;
+  float32x2_t arg1_float32x2_t;
+  float32x2_t arg2_float32x2_t;
+
+  out_float32x2_t = vfma_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t);
+}
+
+/* { dg-final { scan-assembler "vfma\.f32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c b/gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c
new file mode 100644
index 0000000..247a8ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c
@@ -0,0 +1,22 @@ 
+/* Test the `vfmsQf32' ARM Neon intrinsic.  */
+/* This file was autogenerated by neon-testgen.  */
+
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+
+#include "arm_neon.h"
+
+void test_vfmsQf32 (void)
+{
+  float32x4_t out_float32x4_t;
+  float32x4_t arg0_float32x4_t;
+  float32x4_t arg1_float32x4_t;
+  float32x4_t arg2_float32x4_t;
+
+  out_float32x4_t = vfmsq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t);
+}
+
+/* { dg-final { scan-assembler "vfms\.f32\[ 	\]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon/vfmsf32.c b/gcc/testsuite/gcc.target/arm/neon/vfmsf32.c
new file mode 100644
index 0000000..7f9e857
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon/vfmsf32.c
@@ -0,0 +1,22 @@ 
+/* Test the `vfmsf32' ARM Neon intrinsic.  */
+/* This file was autogenerated by neon-testgen.  */
+
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+
+#include "arm_neon.h"
+
+void test_vfmsf32 (void)
+{
+  float32x2_t out_float32x2_t;
+  float32x2_t arg0_float32x2_t;
+  float32x2_t arg1_float32x2_t;
+  float32x2_t arg2_float32x2_t;
+
+  out_float32x2_t = vfms_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t);
+}
+
+/* { dg-final { scan-assembler "vfms\.f32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */