[AArch64,NEON] More NEON intrinsics improvement

Message ID DA41BE1DDCA941489001C7FBD7A8820E555588AB@szxema507-mbx.china.huawei.com

Commit Message

Yangfei (Felix) Nov. 18, 2014, 8:44 a.m. UTC
Hi,
     This patch converts more intrinsics to use builtin functions instead of the previous inline assembly syntax.
     Passed the advsimd-intrinsics testsuite from Christophe Lyon.

     Three testcases are added for intrinsics that the testsuite does not yet cover:
     gcc.target/aarch64/advsimd-intrinsics/vfma.c
     gcc.target/aarch64/advsimd-intrinsics/vfma_n.c
     gcc.target/aarch64/advsimd-intrinsics/vfms.c

     Regtested with aarch64-linux-gnu on QEMU.  OK for the trunk?
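For reference, the conversion is mechanical: each inline-asm body is replaced by the
matching builtin, e.g. vfms_f32 (__a, __b, __c) now expands to
__builtin_aarch64_fmav2sf (-__b, __c, __a), i.e. a fused a - b * c per lane.  Below is
a minimal standalone sketch of the semantics the new tests check (not part of the
patch; assumes an AArch64 toolchain with <arm_neon.h>):

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  float32x2_t a = vdup_n_f32 (10.0f);
  float32x2_t b = vdup_n_f32 (3.0f);
  float32x2_t c = vdup_n_f32 (2.0f);

  /* vfma_f32 computes a + b * c per lane; vfms_f32 computes a - b * c.  */
  float32x2_t r_fma = vfma_f32 (a, b, c);
  float32x2_t r_fms = vfms_f32 (a, b, c);

  /* Expect 16 in both lanes of r_fma and 4 in both lanes of r_fms.  */
  printf ("fma: %f %f\n", vget_lane_f32 (r_fma, 0), vget_lane_f32 (r_fma, 1));
  printf ("fms: %f %f\n", vget_lane_f32 (r_fms, 0), vget_lane_f32 (r_fms, 1));
  return 0;
}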

Patch

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 217394)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,26 @@ 
+2014-11-18  Felix Yang  <felix.yang@huawei.com>
+	    Haijian Zhang  <z.zhanghaijian@huawei.com>
+	    Jiji Jiang  <jiangjiji@huawei.com>
+	    Pengfei Sui  <suipengfei@huawei.com>
+
+	* config/aarch64/arm_neon.h (vrecpe_u32, vrecpeq_u32): Rewrite using
+	builtin functions.
+	(vfma_f32, vfmaq_f32, vfmaq_f64, vfma_n_f32, vfmaq_n_f32, vfmaq_n_f64,
+	vfms_f32, vfmsq_f32, vfmsq_f64): Likewise.
+	(vhsub_s8, vhsub_u8, vhsub_s16, vhsub_u16, vhsub_s32, vhsub_u32,
+	vhsubq_s8, vhsubq_u8, vhsubq_s16, vhsubq_u16, vhsubq_s32, vhsubq_u32,
+	vsubhn_s16, vsubhn_u16, vsubhn_s32, vsubhn_u32, vsubhn_s64, vsubhn_u64,
+	vrsubhn_s16, vrsubhn_u16, vrsubhn_s32, vrsubhn_u32, vrsubhn_s64,
+	vrsubhn_u64, vsubhn_high_s16, vsubhn_high_u16, vsubhn_high_s32,
+	vsubhn_high_u32, vsubhn_high_s64, vsubhn_high_u64, vrsubhn_high_s16,
+	vrsubhn_high_u16, vrsubhn_high_s32, vrsubhn_high_u32, vrsubhn_high_s64,
+	vrsubhn_high_u64): Likewise.
+	* config/aarch64/iterators.md (VDQ_SI): New mode iterator.
+	* config/aarch64/aarch64.md (define_c_enum "unspec"): Add UNSPEC_URECPE.
+	* config/aarch64/aarch64-simd.md (aarch64_urecpe<mode>): New pattern.
+	* config/aarch64/aarch64-simd-builtins.def (shsub, uhsub, subhn, rsubhn,
+	subhn2, rsubhn2, urecpe): New builtins.
+
 2014-11-11  Andrew Pinski  <apinski@cavium.com>
 
 	Bug target/61997
Index: gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c	(revision 217394)
+++ gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c	(working copy)
@@ -107,9 +107,9 @@  ONE (vmovn_high, uint16x8_t, uint16x4_t, uint32x4_
 ONE (vmovn_high, uint32x4_t, uint32x2_t, uint64x2_t, u64)
 
 
-/* { dg-final { scan-assembler-times "\\tsubhn2 v" 6} }  */
+/* { dg-final { scan-assembler-times "\\tsubhn2\\tv" 6} }  */
 /* { dg-final { scan-assembler-times "\\taddhn2\\tv" 6} }  */
-/* { dg-final { scan-assembler-times "rsubhn2 v" 6} }  */
+/* { dg-final { scan-assembler-times "rsubhn2\\tv" 6} }  */
 /* { dg-final { scan-assembler-times "raddhn2\\tv" 6} }  */
 /* { dg-final { scan-assembler-times "\\trshrn2 v" 6} }  */
 /* { dg-final { scan-assembler-times "\\tshrn2 v" 6} }  */
Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma_n.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma_n.c	(revision 0)
+++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma_n.c	(revision 0)
@@ -0,0 +1,69 @@ 
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results.  */
+VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x4438ca3d, 0x44390a3d };
+VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x44869eb8, 0x4486beb8, 0x4486deb8, 0x4486feb8 };
+VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0x408906e1532b8520, 0x40890ee1532b8520 };
+
+#define VECT_VAR_ASSIGN(S,Q,T1,W) S##Q##_##T1##W
+#define ASSIGN(S, Q, T, W, V) T##W##_t S##Q##_##T##W = V
+#define TEST_MSG "VFMA_N/VFMAQ_N"
+void exec_vfma_n (void)
+{
+  /* Basic test: v4=vfma_n(v1,v2,scalar), then store the result.  */
+#define TEST_VFMA_N(Q, T1, T2, W, N)					\
+  VECT_VAR(vector_res, T1, W, N) =					\
+    vfma##Q##_n_##T2##W(VECT_VAR(vector1, T1, W, N),			\
+		      VECT_VAR(vector2, T1, W, N),			\
+		      VECT_VAR_ASSIGN(Scalar, Q, T1, W));		\
+  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
+
+#define CHECK_VFMA_RESULTS(test_name,comment)				\
+  {									\
+    CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment);	\
+    CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment);	\
+    CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment);	\
+  }
+
+#define DECL_VFMA_VAR(VAR)			\
+  DECL_VARIABLE(VAR, float, 32, 2);		\
+  DECL_VARIABLE(VAR, float, 32, 4);		\
+  DECL_VARIABLE(VAR, float, 64, 2);
+
+  DECL_VFMA_VAR(vector1);
+  DECL_VFMA_VAR(vector2);
+  DECL_VFMA_VAR(vector3);
+  DECL_VFMA_VAR(vector_res);
+
+  clean_results ();
+
+  /* Initialize input "vector1" from "buffer".  */
+  VLOAD(vector1, buffer, , float, f, 32, 2);
+  VLOAD(vector1, buffer, q, float, f, 32, 4);
+  VLOAD(vector1, buffer, q, float, f, 64, 2);
+
+  /* Choose init value arbitrarily.  */
+  VDUP(vector2, , float, f, 32, 2, 9.3f);
+  VDUP(vector2, q, float, f, 32, 4, 29.7f);
+  VDUP(vector2, q, float, f, 64, 2, 15.8f);
+
+  /* Choose init value arbitrarily.  */
+  ASSIGN(Scalar, , float, 32, 81.2f);
+  ASSIGN(Scalar, q, float, 32, 36.8f);
+  ASSIGN(Scalar, q, float, 64, 51.7f);
+
+  /* Execute the tests.  */
+  TEST_VFMA_N(, float, f, 32, 2);
+  TEST_VFMA_N(q, float, f, 32, 4);
+  TEST_VFMA_N(q, float, f, 64, 2);
+
+  CHECK_VFMA_RESULTS (TEST_MSG, "");
+}
+
+int main (void)
+{
+  exec_vfma_n ();
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c	(revision 0)
+++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c	(revision 0)
@@ -0,0 +1,67 @@ 
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results.  */
+VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x4438ca3d, 0x44390a3d };
+VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x44869eb8, 0x4486beb8, 0x4486deb8, 0x4486feb8 };
+VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0x408906e1532b8520, 0x40890ee1532b8520 };
+
+#define TEST_MSG "VFMA/VFMAQ"
+void exec_vfma (void)
+{
+  /* Basic test: v4=vfma(v1,v2,v3), then store the result.  */
+#define TEST_VFMA(Q, T1, T2, W, N)					\
+  VECT_VAR(vector_res, T1, W, N) =					\
+    vfma##Q##_##T2##W(VECT_VAR(vector1, T1, W, N),			\
+		      VECT_VAR(vector2, T1, W, N),			\
+		      VECT_VAR(vector3, T1, W, N));			\
+  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
+
+#define CHECK_VFMA_RESULTS(test_name,comment)				\
+  {									\
+    CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment);	\
+    CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment);	\
+    CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment);	\
+  }
+
+#define DECL_VFMA_VAR(VAR)			\
+  DECL_VARIABLE(VAR, float, 32, 2);		\
+  DECL_VARIABLE(VAR, float, 32, 4);		\
+  DECL_VARIABLE(VAR, float, 64, 2);
+
+  DECL_VFMA_VAR(vector1);
+  DECL_VFMA_VAR(vector2);
+  DECL_VFMA_VAR(vector3);
+  DECL_VFMA_VAR(vector_res);
+
+  clean_results ();
+
+  /* Initialize input "vector1" from "buffer".  */
+  VLOAD(vector1, buffer, , float, f, 32, 2);
+  VLOAD(vector1, buffer, q, float, f, 32, 4);
+  VLOAD(vector1, buffer, q, float, f, 64, 2);
+
+  /* Choose init value arbitrarily.  */
+  VDUP(vector2, , float, f, 32, 2, 9.3f);
+  VDUP(vector2, q, float, f, 32, 4, 29.7f);
+  VDUP(vector2, q, float, f, 64, 2, 15.8f);
+
+  /* Choose init value arbitrarily.  */
+  VDUP(vector3, , float, f, 32, 2, 81.2f);
+  VDUP(vector3, q, float, f, 32, 4, 36.8f);
+  VDUP(vector3, q, float, f, 64, 2, 51.7f);
+
+  /* Execute the tests.  */
+  TEST_VFMA(, float, f, 32, 2);
+  TEST_VFMA(q, float, f, 32, 4);
+  TEST_VFMA(q, float, f, 64, 2);
+
+  CHECK_VFMA_RESULTS (TEST_MSG, "");
+}
+
+int main (void)
+{
+  exec_vfma ();
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c	(revision 0)
+++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c	(revision 0)
@@ -0,0 +1,67 @@ 
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results.  */
+VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc440ca3d, 0xc4408a3d };
+VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc48a9eb8, 0xc48a7eb8, 0xc48a5eb8, 0xc48a3eb8 };
+VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0xc08a06e1532b8520, 0xc089fee1532b8520 };
+
+#define TEST_MSG "VFMS/VFMSQ"
+void exec_vfms (void)
+{
+  /* Basic test: v4=vfms(v1,v2,v3), then store the result.  */
+#define TEST_VFMS(Q, T1, T2, W, N)					\
+  VECT_VAR(vector_res, T1, W, N) =					\
+    vfms##Q##_##T2##W(VECT_VAR(vector1, T1, W, N),			\
+		      VECT_VAR(vector2, T1, W, N),			\
+		      VECT_VAR(vector3, T1, W, N));			\
+  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
+
+#define CHECK_VFMS_RESULTS(test_name,comment)				\
+  {									\
+    CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment);	\
+    CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment);	\
+    CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment);	\
+  }
+
+#define DECL_VFMS_VAR(VAR)			\
+  DECL_VARIABLE(VAR, float, 32, 2);		\
+  DECL_VARIABLE(VAR, float, 32, 4);		\
+  DECL_VARIABLE(VAR, float, 64, 2);
+
+  DECL_VFMS_VAR(vector1);
+  DECL_VFMS_VAR(vector2);
+  DECL_VFMS_VAR(vector3);
+  DECL_VFMS_VAR(vector_res);
+
+  clean_results ();
+
+  /* Initialize input "vector1" from "buffer".  */
+  VLOAD(vector1, buffer, , float, f, 32, 2);
+  VLOAD(vector1, buffer, q, float, f, 32, 4);
+  VLOAD(vector1, buffer, q, float, f, 64, 2);
+
+  /* Choose init value arbitrarily.  */
+  VDUP(vector2, , float, f, 32, 2, 9.3f);
+  VDUP(vector2, q, float, f, 32, 4, 29.7f);
+  VDUP(vector2, q, float, f, 64, 2, 15.8f);
+
+  /* Choose init value arbitrarily.  */
+  VDUP(vector3, , float, f, 32, 2, 81.2f);
+  VDUP(vector3, q, float, f, 32, 4, 36.8f);
+  VDUP(vector3, q, float, f, 64, 2, 51.7f);
+
+  /* Execute the tests.  */
+  TEST_VFMS(, float, f, 32, 2);
+  TEST_VFMS(q, float, f, 32, 4);
+  TEST_VFMS(q, float, f, 64, 2);
+
+  CHECK_VFMS_RESULTS (TEST_MSG, "");
+}
+
+int main (void)
+{
+  exec_vfms ();
+  return 0;
+}
Index: gcc/testsuite/ChangeLog
===================================================================
--- gcc/testsuite/ChangeLog	(revision 217394)
+++ gcc/testsuite/ChangeLog	(working copy)
@@ -1,3 +1,14 @@ 
+2014-11-18  Felix Yang  <felix.yang@huawei.com>
+	    Haijian Zhang  <z.zhanghaijian@huawei.com>
+	    Jiji Jiang  <jiangjiji@huawei.com>
+	    Pengfei Sui  <suipengfei@huawei.com>
+
+	* gcc.target/aarch64/advsimd-intrinsics/vfma.c: New test.
+	* gcc.target/aarch64/advsimd-intrinsics/vfma_n.c: New test.
+	* gcc.target/aarch64/advsimd-intrinsics/vfms.c: New test.
+	* gcc.target/aarch64/narrow_high-intrinsics.c: Fix expected assembler
+	for subhn2 and rsubhn2.
+
 2014-11-11  Anthony Brandon  <anthony.brandon@gmail.com>
	    Manuel López-Ibáñez  <manu@gcc.gnu.org>
 
Index: gcc/config/aarch64/arm_neon.h
===================================================================
--- gcc/config/aarch64/arm_neon.h	(revision 217394)
+++ gcc/config/aarch64/arm_neon.h	(working copy)
@@ -2287,7 +2287,247 @@  vqadd_u8 (uint8x8_t __a, uint8x8_t __b)
   return __builtin_aarch64_uqaddv8qi_uuu (__a, __b);
 }
 
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vhsub_s8 (int8x8_t __a, int8x8_t __b)
+{
+  return (int8x8_t) __builtin_aarch64_shsubv8qi (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vhsub_s16 (int16x4_t __a, int16x4_t __b)
+{
+  return (int16x4_t) __builtin_aarch64_shsubv4hi (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vhsub_s32 (int32x2_t __a, int32x2_t __b)
+{
+  return (int32x2_t) __builtin_aarch64_shsubv2si (__a, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vhsub_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+  return (uint8x8_t) __builtin_aarch64_uhsubv8qi ((int8x8_t) __a,
+						  (int8x8_t) __b);
+}
+
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vhsub_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+  return (uint16x4_t) __builtin_aarch64_uhsubv4hi ((int16x4_t) __a,
+						   (int16x4_t) __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vhsub_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_uhsubv2si ((int32x2_t) __a,
+						   (int32x2_t) __b);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vhsubq_s8 (int8x16_t __a, int8x16_t __b)
+{
+  return (int8x16_t) __builtin_aarch64_shsubv16qi (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vhsubq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int16x8_t) __builtin_aarch64_shsubv8hi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vhsubq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int32x4_t) __builtin_aarch64_shsubv4si (__a, __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vhsubq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+  return (uint8x16_t) __builtin_aarch64_uhsubv16qi ((int8x16_t) __a,
+						    (int8x16_t) __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vhsubq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint16x8_t) __builtin_aarch64_uhsubv8hi ((int16x8_t) __a,
+						   (int16x8_t) __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vhsubq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_uhsubv4si ((int32x4_t) __a,
+						   (int32x4_t) __b);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vsubhn_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int8x8_t) __builtin_aarch64_subhnv8hi (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vsubhn_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int16x4_t) __builtin_aarch64_subhnv4si (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vsubhn_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int32x2_t) __builtin_aarch64_subhnv2di (__a, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint8x8_t) __builtin_aarch64_subhnv8hi ((int16x8_t) __a,
+						  (int16x8_t) __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint16x4_t) __builtin_aarch64_subhnv4si ((int32x4_t) __a,
+						   (int32x4_t) __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_subhnv2di ((int64x2_t) __a,
+						   (int64x2_t) __b);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrsubhn_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (int8x8_t) __builtin_aarch64_rsubhnv8hi (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrsubhn_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (int16x4_t) __builtin_aarch64_rsubhnv4si (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrsubhn_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (int32x2_t) __builtin_aarch64_rsubhnv2di (__a, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+  return (uint8x8_t) __builtin_aarch64_rsubhnv8hi ((int16x8_t) __a,
+						   (int16x8_t) __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+  return (uint16x4_t) __builtin_aarch64_rsubhnv4si ((int32x4_t) __a,
+						    (int32x4_t) __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_rsubhnv2di ((int64x2_t) __a,
+						    (int64x2_t) __b);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrsubhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+  return (int8x16_t) __builtin_aarch64_rsubhn2v8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrsubhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+  return (int16x8_t) __builtin_aarch64_rsubhn2v4si (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vrsubhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c)
+{
+  return (int32x4_t) __builtin_aarch64_rsubhn2v2di (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
+{
+  return (uint8x16_t) __builtin_aarch64_rsubhn2v8hi ((int8x8_t) __a,
+						    (int16x8_t) __b,
+						    (int16x8_t) __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
+{
+  return (uint16x8_t) __builtin_aarch64_rsubhn2v4si ((int16x4_t) __a,
+						    (int32x4_t) __b,
+						    (int32x4_t) __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
+{
+  return (uint32x4_t) __builtin_aarch64_rsubhn2v2di ((int32x2_t) __a,
+						    (int64x2_t) __b,
+						    (int64x2_t) __c);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vsubhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+  return (int8x16_t) __builtin_aarch64_subhn2v8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsubhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+  return (int16x8_t) __builtin_aarch64_subhn2v4si (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsubhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c)
+{
+  return (int32x4_t) __builtin_aarch64_subhn2v2di (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
+{
+  return (uint8x16_t) __builtin_aarch64_subhn2v8hi ((int8x8_t) __a,
+						    (int16x8_t) __b,
+						    (int16x8_t) __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
+{
+  return (uint16x8_t) __builtin_aarch64_subhn2v4si ((int16x4_t) __a,
+						    (int32x4_t) __b,
+						    (int32x4_t) __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
+{
+  return (uint32x4_t) __builtin_aarch64_subhn2v2di ((int32x2_t) __a,
+						    (int64x2_t) __b,
+						    (int64x2_t) __c);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vqadd_u16 (uint16x4_t __a, uint16x4_t __b)
 {
   return __builtin_aarch64_uqaddv4hi_uuu (__a, __b);
@@ -5889,237 +6129,6 @@  vcvtxd_f32_f64 (float64_t a)
 }
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
-{
-  float32x2_t result;
-  __asm__ ("fmla %0.2s,%2.2s,%3.2s"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
-{
-  float32x4_t result;
-  __asm__ ("fmla %0.4s,%2.4s,%3.4s"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
-{
-  float64x2_t result;
-  __asm__ ("fmla %0.2d,%2.2d,%3.2d"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vfma_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
-{
-  float32x2_t result;
-  __asm__ ("fmla %0.2s, %2.2s, %3.s[0]"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vfmaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
-{
-  float32x4_t result;
-  __asm__ ("fmla %0.4s, %2.4s, %3.s[0]"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vfmaq_n_f64 (float64x2_t a, float64x2_t b, float64_t c)
-{
-  float64x2_t result;
-  __asm__ ("fmla %0.2d, %2.2d, %3.d[0]"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vfms_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
-{
-  float32x2_t result;
-  __asm__ ("fmls %0.2s,%2.2s,%3.2s"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vfmsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
-{
-  float32x4_t result;
-  __asm__ ("fmls %0.4s,%2.4s,%3.4s"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vfmsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
-{
-  float64x2_t result;
-  __asm__ ("fmls %0.2d,%2.2d,%3.2d"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vhsub_s8 (int8x8_t a, int8x8_t b)
-{
-  int8x8_t result;
-  __asm__ ("shsub %0.8b, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vhsub_s16 (int16x4_t a, int16x4_t b)
-{
-  int16x4_t result;
-  __asm__ ("shsub %0.4h, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vhsub_s32 (int32x2_t a, int32x2_t b)
-{
-  int32x2_t result;
-  __asm__ ("shsub %0.2s, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vhsub_u8 (uint8x8_t a, uint8x8_t b)
-{
-  uint8x8_t result;
-  __asm__ ("uhsub %0.8b, %1.8b, %2.8b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vhsub_u16 (uint16x4_t a, uint16x4_t b)
-{
-  uint16x4_t result;
-  __asm__ ("uhsub %0.4h, %1.4h, %2.4h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vhsub_u32 (uint32x2_t a, uint32x2_t b)
-{
-  uint32x2_t result;
-  __asm__ ("uhsub %0.2s, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vhsubq_s8 (int8x16_t a, int8x16_t b)
-{
-  int8x16_t result;
-  __asm__ ("shsub %0.16b, %1.16b, %2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vhsubq_s16 (int16x8_t a, int16x8_t b)
-{
-  int16x8_t result;
-  __asm__ ("shsub %0.8h, %1.8h, %2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vhsubq_s32 (int32x4_t a, int32x4_t b)
-{
-  int32x4_t result;
-  __asm__ ("shsub %0.4s, %1.4s, %2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vhsubq_u8 (uint8x16_t a, uint8x16_t b)
-{
-  uint8x16_t result;
-  __asm__ ("uhsub %0.16b, %1.16b, %2.16b"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vhsubq_u16 (uint16x8_t a, uint16x8_t b)
-{
-  uint16x8_t result;
-  __asm__ ("uhsub %0.8h, %1.8h, %2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vhsubq_u32 (uint32x4_t a, uint32x4_t b)
-{
-  uint32x4_t result;
-  __asm__ ("uhsub %0.4s, %1.4s, %2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_dup_f32 (const float32_t * a)
 {
   float32x2_t result;
@@ -10492,28 +10501,6 @@  vqrdmulhq_n_s32 (int32x4_t a, int32_t b)
        result;                                                          \
      })
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrecpe_u32 (uint32x2_t a)
-{
-  uint32x2_t result;
-  __asm__ ("urecpe %0.2s,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrecpeq_u32 (uint32x4_t a)
-{
-  uint32x4_t result;
-  __asm__ ("urecpe %0.4s,%1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 #define vrshrn_high_n_s16(a, b, c)                                      \
   __extension__                                                         \
     ({                                                                  \
@@ -10819,138 +10806,6 @@  vrsqrtss_f32 (float32_t a, float32_t b)
   return result;
 }
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrsubhn_high_s16 (int8x8_t a, int16x8_t b, int16x8_t c)
-{
-  int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("rsubhn2 %0.16b, %1.8h, %2.8h"
-           : "+w"(result)
-           : "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrsubhn_high_s32 (int16x4_t a, int32x4_t b, int32x4_t c)
-{
-  int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("rsubhn2 %0.8h, %1.4s, %2.4s"
-           : "+w"(result)
-           : "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vrsubhn_high_s64 (int32x2_t a, int64x2_t b, int64x2_t c)
-{
-  int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("rsubhn2 %0.4s, %1.2d, %2.2d"
-           : "+w"(result)
-           : "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrsubhn_high_u16 (uint8x8_t a, uint16x8_t b, uint16x8_t c)
-{
-  uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("rsubhn2 %0.16b, %1.8h, %2.8h"
-           : "+w"(result)
-           : "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrsubhn_high_u32 (uint16x4_t a, uint32x4_t b, uint32x4_t c)
-{
-  uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("rsubhn2 %0.8h, %1.4s, %2.4s"
-           : "+w"(result)
-           : "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrsubhn_high_u64 (uint32x2_t a, uint64x2_t b, uint64x2_t c)
-{
-  uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("rsubhn2 %0.4s, %1.2d, %2.2d"
-           : "+w"(result)
-           : "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrsubhn_s16 (int16x8_t a, int16x8_t b)
-{
-  int8x8_t result;
-  __asm__ ("rsubhn %0.8b, %1.8h, %2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrsubhn_s32 (int32x4_t a, int32x4_t b)
-{
-  int16x4_t result;
-  __asm__ ("rsubhn %0.4h, %1.4s, %2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrsubhn_s64 (int64x2_t a, int64x2_t b)
-{
-  int32x2_t result;
-  __asm__ ("rsubhn %0.2s, %1.2d, %2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrsubhn_u16 (uint16x8_t a, uint16x8_t b)
-{
-  uint8x8_t result;
-  __asm__ ("rsubhn %0.8b, %1.8h, %2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrsubhn_u32 (uint32x4_t a, uint32x4_t b)
-{
-  uint16x4_t result;
-  __asm__ ("rsubhn %0.4h, %1.4s, %2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrsubhn_u64 (uint64x2_t a, uint64x2_t b)
-{
-  uint32x2_t result;
-  __asm__ ("rsubhn %0.2s, %1.2d, %2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 #define vshrn_high_n_s16(a, b, c)                                       \
   __extension__                                                         \
     ({                                                                  \
@@ -11482,139 +11337,8 @@  vrsqrtss_f32 (float32_t a, float32_t b)
                 : "memory");                                            \
      })
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vsubhn_high_s16 (int8x8_t a, int16x8_t b, int16x8_t c)
-{
-  int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("subhn2 %0.16b, %1.8h, %2.8h"
-           : "+w"(result)
-           : "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vsubhn_high_s32 (int16x4_t a, int32x4_t b, int32x4_t c)
-{
-  int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("subhn2 %0.8h, %1.4s, %2.4s"
-           : "+w"(result)
-           : "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vsubhn_high_s64 (int32x2_t a, int64x2_t b, int64x2_t c)
-{
-  int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("subhn2 %0.4s, %1.2d, %2.2d"
-           : "+w"(result)
-           : "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vsubhn_high_u16 (uint8x8_t a, uint16x8_t b, uint16x8_t c)
-{
-  uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("subhn2 %0.16b, %1.8h, %2.8h"
-           : "+w"(result)
-           : "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vsubhn_high_u32 (uint16x4_t a, uint32x4_t b, uint32x4_t c)
-{
-  uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("subhn2 %0.8h, %1.4s, %2.4s"
-           : "+w"(result)
-           : "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsubhn_high_u64 (uint32x2_t a, uint64x2_t b, uint64x2_t c)
-{
-  uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("subhn2 %0.4s, %1.2d, %2.2d"
-           : "+w"(result)
-           : "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vsubhn_s16 (int16x8_t a, int16x8_t b)
-{
-  int8x8_t result;
-  __asm__ ("subhn %0.8b, %1.8h, %2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vsubhn_s32 (int32x4_t a, int32x4_t b)
-{
-  int16x4_t result;
-  __asm__ ("subhn %0.4h, %1.4s, %2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vsubhn_s64 (int64x2_t a, int64x2_t b)
-{
-  int32x2_t result;
-  __asm__ ("subhn %0.2s, %1.2d, %2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vsubhn_u16 (uint16x8_t a, uint16x8_t b)
-{
-  uint8x8_t result;
-  __asm__ ("subhn %0.8b, %1.8h, %2.8h"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vsubhn_u32 (uint32x4_t a, uint32x4_t b)
-{
-  uint16x4_t result;
-  __asm__ ("subhn %0.4h, %1.4s, %2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vsubhn_u64 (uint64x2_t a, uint64x2_t b)
-{
-  uint32x2_t result;
-  __asm__ ("subhn %0.2s, %1.2d, %2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vtst_p8 (poly8x8_t a, poly8x8_t b)
 {
   uint8x8_t result;
@@ -16057,6 +15781,42 @@  vfma_f64 (float64x1_t __a, float64x1_t __b, float6
   return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])};
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+  return __builtin_aarch64_fmav2sf (__b, __c, __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+  return __builtin_aarch64_fmav4sf (__b, __c, __a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
+{
+  return __builtin_aarch64_fmav2df (__b, __c, __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+{
+  return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
+{
+  return __builtin_aarch64_fmav4sf (__b, vdupq_n_f32 (__c), __a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmaq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
+{
+  return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a);
+}
+
 /* vfma_lane  */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
@@ -16168,6 +15928,25 @@  vfms_f64 (float64x1_t __a, float64x1_t __b, float6
   return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+  return __builtin_aarch64_fmav2sf (-__b, __c, __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+  return __builtin_aarch64_fmav4sf (-__b, __c, __a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
+{
+  return __builtin_aarch64_fmav2df (-__b, __c, __a);
+}
+
+
 /* vfms_lane  */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
@@ -21302,6 +21081,18 @@  vrbitq_u8 (uint8x16_t __a)
 
 /* vrecpe  */
 
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrecpe_u32 (uint32x2_t __a)
+{
+  return (uint32x2_t) __builtin_aarch64_urecpev2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrecpeq_u32 (uint32x4_t __a)
+{
+  return (uint32x4_t) __builtin_aarch64_urecpev4si ((int32x4_t) __a);
+}
+
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vrecpes_f32 (float32_t __a)
 {
Index: gcc/config/aarch64/iterators.md
===================================================================
--- gcc/config/aarch64/iterators.md	(revision 217394)
+++ gcc/config/aarch64/iterators.md	(working copy)
@@ -153,6 +153,9 @@ 
 ;; Vector modes except double int.
 (define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
 
+;; Vector modes for S type.
+(define_mode_iterator VDQ_SI [V2SI V4SI])
+
 ;; Vector modes for Q and H types.
 (define_mode_iterator VDQQH [V8QI V16QI V4HI V8HI])
 
Index: gcc/config/aarch64/aarch64-simd.md
===================================================================
--- gcc/config/aarch64/aarch64-simd.md	(revision 217394)
+++ gcc/config/aarch64/aarch64-simd.md	(working copy)
@@ -4972,6 +4972,14 @@ 
   [(set_attr "type" "neon_fp_recps_<Vetype><q>")]
 )
 
+(define_insn "aarch64_urecpe<mode>"
+  [(set (match_operand:VDQ_SI 0 "register_operand" "=w")
+        (unspec:VDQ_SI [(match_operand:VDQ_SI 1 "register_operand" "w")]
+                UNSPEC_URECPE))]
+ "TARGET_SIMD"
+ "urecpe\\t%0.<Vtype>, %1.<Vtype>"
+  [(set_attr "type" "neon_fp_recpe_<Vetype><q>")])
+
 ;; Standard pattern name vec_extract<mode>.
 
 (define_expand "vec_extract<mode>"
Index: gcc/config/aarch64/aarch64.md
===================================================================
--- gcc/config/aarch64/aarch64.md	(revision 217394)
+++ gcc/config/aarch64/aarch64.md	(working copy)
@@ -75,6 +75,7 @@ 
     UNSPEC_CRC32H
     UNSPEC_CRC32W
     UNSPEC_CRC32X
+    UNSPEC_URECPE
     UNSPEC_FRECPE
     UNSPEC_FRECPS
     UNSPEC_FRECPX
Index: gcc/config/aarch64/aarch64-simd-builtins.def
===================================================================
--- gcc/config/aarch64/aarch64-simd-builtins.def	(revision 217394)
+++ gcc/config/aarch64/aarch64-simd-builtins.def	(working copy)
@@ -124,15 +124,21 @@ 
   BUILTIN_VDW (BINOP, usubw, 0)
   /* Implemented by aarch64_<sur>h<addsub><mode>.  */
   BUILTIN_VQ_S (BINOP, shadd, 0)
+  BUILTIN_VQ_S (BINOP, shsub, 0)
   BUILTIN_VQ_S (BINOP, uhadd, 0)
+  BUILTIN_VQ_S (BINOP, uhsub, 0)
   BUILTIN_VQ_S (BINOP, srhadd, 0)
   BUILTIN_VQ_S (BINOP, urhadd, 0)
   /* Implemented by aarch64_<sur><addsub>hn<mode>.  */
   BUILTIN_VQN (BINOP, addhn, 0)
+  BUILTIN_VQN (BINOP, subhn, 0)
   BUILTIN_VQN (BINOP, raddhn, 0)
+  BUILTIN_VQN (BINOP, rsubhn, 0)
   /* Implemented by aarch64_<sur><addsub>hn2<mode>.  */
   BUILTIN_VQN (TERNOP, addhn2, 0)
+  BUILTIN_VQN (TERNOP, subhn2, 0)
   BUILTIN_VQN (TERNOP, raddhn2, 0)
+  BUILTIN_VQN (TERNOP, rsubhn2, 0)
 
   BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0)
   /* Implemented by aarch64_<sur>qmovn<mode>.  */
@@ -334,6 +340,8 @@ 
   BUILTIN_GPF (BINOP, frecps, 0)
   BUILTIN_GPF (UNOP, frecpx, 0)
 
+  BUILTIN_VDQ_SI (UNOP, urecpe, 0)
+
   BUILTIN_VDQF (UNOP, frecpe, 0)
   BUILTIN_VDQF (BINOP, frecps, 0)
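
As a usage illustration (not part of the patch), the rewritten vsubhn_high_s16 keeps
the ACLE semantics: the low half of the result is the first argument, and the high
half holds the narrowed high halves of b - c.  A quick standalone check, assuming an
AArch64 target:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  int16x8_t b = vdupq_n_s16 (0x1234);
  int16x8_t c = vdupq_n_s16 (0x0034);
  int8x8_t low = vdup_n_s8 (7);

  /* subhn2 keeps the high 8 bits of each 16-bit difference:
     (0x1234 - 0x0034) >> 8 == 0x12.  */
  int8x16_t r = vsubhn_high_s16 (low, b, c);

  printf ("lane 0 (copied from 'low'): %d\n", vgetq_lane_s8 (r, 0));  /* 7 */
  printf ("lane 8 (narrowed):          %d\n", vgetq_lane_s8 (r, 8));  /* 18 */
  return 0;
}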