Patchwork [AArch64] Support vrecp<esx> neon intrinsics in RTL.

login
register
mail settings
Submitter James Greenhalgh
Date April 22, 2013, 10:34 a.m.
Message ID <1366626842-20146-1-git-send-email-james.greenhalgh@arm.com>
Download mbox | patch
Permalink /patch/238425/
State New
Headers show

Comments

James Greenhalgh - April 22, 2013, 10:34 a.m.
Hi,

This patch adds support for handling the:

vrecpe<sdq>_<fd><32,64>,
vrecpx<sd>_<fd><32,64>,
vrecps<sdq>_<fd><32,64>.

intrinsics in arm_neon.h as as RTL builtins.

The patch has been regression tested on aarch64-none-elf and
aarch64-none-linux-gnu with no regressions.

Is this OK for trunk?

Thanks,
James Greenhalgh

---

gcc/

2013-04-22  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64-builtins.c
	(aarch64_simd_builtin_type_mode): Handle SF types.
	(sf_UP): Define.
	(BUILTIN_GPF): Define.
	(aarch64_init_simd_builtins): Handle SF types.
	* config/aarch64/aarch64-simd-builtins.def (frecpe): Add support.
	(frecps): Likewise.
	(frecpx): Likewise.
	* config/aarch64/aarch64-simd.md
	(simd_types): Update simd_frcp<esx> to simd_frecp<esx>.
	(aarch64_frecpe<mode>): New.
	(aarch64_frecps<mode>): Likewise.
	* config/aarch64/aarch64.md (unspec): Add UNSPEC_FRECP<ESX>.
	(v8type): Add frecp<esx>.
	(aarch64_frecp<FRECP:frecp_suffix><mode>): New.
	(aarch64_frecps<mode>): Likewise.
	* config/aarch64/iterators.md (FRECP): New.
	(frecp_suffix): Likewise.
	* config/aarch64/arm_neon.h
	(vrecp<esx><qsd>_<fd><32, 64>): Convert to using builtins.

gcc/testsuite/

2013-04-22  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.target/aarch64/vrecps.c: New.
	* gcc.target/aarch64/vrecpx.c: Likewise.
Marcus Shawcroft - April 22, 2013, 12:11 p.m.
On 22/04/13 11:34, James Greenhalgh wrote:
>
> Hi,
>
> This patch adds support for handling the:
>
> vrecpe<sdq>_<fd><32,64>,
> vrecpx<sd>_<fd><32,64>,
> vrecps<sdq>_<fd><32,64>.
>
> intrinsics in arm_neon.h as as RTL builtins.
>
> The patch has been regression tested on aarch64-none-elf and
> aarch64-none-linux-gnu with no regressions.
>
> Is this OK for trunk?

OK
/Marcus

Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 1ea55a8..87c4f28 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -50,6 +50,7 @@  enum aarch64_simd_builtin_type_mode
   T_OI,
   T_XI,
   T_SI,
+  T_SF,
   T_HI,
   T_QI,
   T_MAX
@@ -72,6 +73,7 @@  enum aarch64_simd_builtin_type_mode
 #define oi_UP	 T_OI
 #define xi_UP	 T_XI
 #define si_UP    T_SI
+#define sf_UP    T_SF
 #define hi_UP    T_HI
 #define qi_UP    T_QI
 
@@ -172,6 +174,8 @@  typedef struct
 
 #define BUILTIN_DX(T, N) \
   VAR2 (T, N, di, df)
+#define BUILTIN_GPF(T, N) \
+  VAR2 (T, N, sf, df)
 #define BUILTIN_SDQ_I(T, N) \
   VAR4 (T, N, qi, hi, si, di)
 #define BUILTIN_SD_HSI(T, N) \
@@ -609,7 +613,7 @@  aarch64_init_simd_builtins (void)
       {
 	"v8qi", "v4hi", "v2si", "v2sf", "di", "df",
 	"v16qi", "v8hi", "v4si", "v4sf", "v2di", "v2df",
-	"ti", "ei", "oi", "xi", "si", "hi", "qi"
+	"ti", "ei", "oi", "xi", "si", "sf", "hi", "qi"
       };
       char namebuf[60];
       tree ftype = NULL;
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index a6a5e12..83597a3 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -256,3 +256,12 @@ 
   BUILTIN_VALL (BINOP, uzp2)
   BUILTIN_VALL (BINOP, trn1)
   BUILTIN_VALL (BINOP, trn2)
+
+  /* Implemented by
+     aarch64_recp<FRECP:frecp_suffix><mode>.  */
+  BUILTIN_GPF (UNOP, frecpe)
+  BUILTIN_GPF (BINOP, frecps)
+  BUILTIN_GPF (UNOP, frecpx)
+
+  BUILTIN_VDQF (UNOP, frecpe)
+  BUILTIN_VDQF (BINOP, frecps)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 92dcfc0..a797797 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -59,9 +59,9 @@ 
 ; simd_fmul             floating point multiply.
 ; simd_fmul_elt         floating point multiply (by element).
 ; simd_fnegabs          floating point neg/abs.
-; simd_frcpe            floating point reciprocal estimate.
-; simd_frcps            floating point reciprocal step.
-; simd_frecx            floating point reciprocal exponent.
+; simd_frecpe            floating point reciprocal estimate.
+; simd_frecps            floating point reciprocal step.
+; simd_frecpx            floating point reciprocal exponent.
 ; simd_frint            floating point round to integer.
 ; simd_fsqrt            floating point square root.
 ; simd_icvtf            integer convert to floating point.
@@ -163,9 +163,9 @@ 
    simd_fmul,\
    simd_fmul_elt,\
    simd_fnegabs,\
-   simd_frcpe,\
-   simd_frcps,\
-   simd_frecx,\
+   simd_frecpe,\
+   simd_frecps,\
+   simd_frecpx,\
    simd_frint,\
    simd_fsqrt,\
    simd_icvtf,\
@@ -305,8 +305,8 @@ 
 	  (eq_attr "simd_type" "simd_store3,simd_store4") (const_string "neon_vst1_3_4_regs")
 	  (eq_attr "simd_type" "simd_store1s,simd_store2s") (const_string "neon_vst1_vst2_lane")
 	  (eq_attr "simd_type" "simd_store3s,simd_store4s") (const_string "neon_vst3_vst4_lane")
-	  (and (eq_attr "simd_type" "simd_frcpe,simd_frcps") (eq_attr "simd_mode" "V2SF")) (const_string "neon_fp_vrecps_vrsqrts_ddd")
-	  (and (eq_attr "simd_type" "simd_frcpe,simd_frcps") (eq_attr "simd_mode" "V4SF,V2DF")) (const_string "neon_fp_vrecps_vrsqrts_qqq")
+	  (and (eq_attr "simd_type" "simd_frecpe,simd_frecps") (eq_attr "simd_mode" "V2SF")) (const_string "neon_fp_vrecps_vrsqrts_ddd")
+	  (and (eq_attr "simd_type" "simd_frecpe,simd_frecps") (eq_attr "simd_mode" "V4SF,V2DF")) (const_string "neon_fp_vrecps_vrsqrts_qqq")
 	  (eq_attr "simd_type" "none") (const_string "none")
   ]
   (const_string "unknown")))
@@ -3726,3 +3726,25 @@ 
   "ld1r\\t{%0.<Vtype>}, %1"
   [(set_attr "simd_type" "simd_load1r")
    (set_attr "simd_mode" "<MODE>")])
+
+(define_insn "aarch64_frecpe<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+	(unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")]
+		    UNSPEC_FRECPE))]
+  "TARGET_SIMD"
+  "frecpe\\t%0.<Vtype>, %1.<Vtype>"
+  [(set_attr "simd_type" "simd_frecpe")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "aarch64_frecps<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+	(unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")
+		     (match_operand:VDQF 2 "register_operand" "w")]
+		    UNSPEC_FRECPS))]
+  "TARGET_SIMD"
+  "frecps\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "simd_type" "simd_frecps")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 7229878..4555e8d 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -68,6 +68,9 @@ 
 (define_c_enum "unspec" [
     UNSPEC_CASESI
     UNSPEC_CLS
+    UNSPEC_FRECPE
+    UNSPEC_FRECPS
+    UNSPEC_FRECPX
     UNSPEC_FRINTA
     UNSPEC_FRINTI
     UNSPEC_FRINTM
@@ -230,6 +233,9 @@ 
    fmovf2i,\
    fmovi2f,\
    fmul,\
+   frecpe,\
+   frecps,\
+   frecpx,\
    frint,\
    fsqrt,\
    load_acq,\
@@ -3307,6 +3313,27 @@ 
    (set_attr "mode" "<MODE>")]
 )
 
+(define_insn "aarch64_frecp<FRECP:frecp_suffix><mode>"
+  [(set (match_operand:GPF 0 "register_operand" "=w")
+	(unspec:GPF [(match_operand:GPF 1 "register_operand" "w")]
+		    FRECP))]
+  "TARGET_FLOAT"
+  "frecp<FRECP:frecp_suffix>\\t%<s>0, %<s>1"
+  [(set_attr "v8type" "frecp<FRECP:frecp_suffix>")
+   (set_attr "mode" "<MODE>")]
+)
+
+(define_insn "aarch64_frecps<mode>"
+  [(set (match_operand:GPF 0 "register_operand" "=w")
+	(unspec:GPF [(match_operand:GPF 1 "register_operand" "w")
+		     (match_operand:GPF 2 "register_operand" "w")]
+		    UNSPEC_FRECPS))]
+  "TARGET_FLOAT"
+  "frecps\\t%<s>0, %<s>1, %<s>2"
+  [(set_attr "v8type" "frecps")
+   (set_attr "mode" "<MODE>")]
+)
+
 ;; -------------------------------------------------------------------
 ;; Reload support
 ;; -------------------------------------------------------------------
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index ca47403..66989b8 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -14556,17 +14556,6 @@  vrbitq_u8 (uint8x16_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrecpe_f32 (float32x2_t a)
-{
-  float32x2_t result;
-  __asm__ ("frecpe %0.2s,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vrecpe_u32 (uint32x2_t a)
 {
@@ -14578,39 +14567,6 @@  vrecpe_u32 (uint32x2_t a)
   return result;
 }
 
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vrecped_f64 (float64_t a)
-{
-  float64_t result;
-  __asm__ ("frecpe %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrecpeq_f32 (float32x4_t a)
-{
-  float32x4_t result;
-  __asm__ ("frecpe %0.4s,%1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrecpeq_f64 (float64x2_t a)
-{
-  float64x2_t result;
-  __asm__ ("frecpe %0.2d,%1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vrecpeq_u32 (uint32x4_t a)
 {
@@ -14622,94 +14578,6 @@  vrecpeq_u32 (uint32x4_t a)
   return result;
 }
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vrecpes_f32 (float32_t a)
-{
-  float32_t result;
-  __asm__ ("frecpe %s0,%s1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrecps_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("frecps %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vrecpsd_f64 (float64_t a, float64_t b)
-{
-  float64_t result;
-  __asm__ ("frecps %d0,%d1,%d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrecpsq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("frecps %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrecpsq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("frecps %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vrecpss_f32 (float32_t a, float32_t b)
-{
-  float32_t result;
-  __asm__ ("frecps %s0,%s1,%s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vrecpxd_f64 (float64_t a)
-{
-  float64_t result;
-  __asm__ ("frecpe %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vrecpxs_f32 (float32_t a)
-{
-  float32_t result;
-  __asm__ ("frecpe %s0,%s1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vrev16_p8 (poly8x8_t a)
 {
@@ -23115,6 +22983,84 @@  vqsubd_u64 (uint64x1_t __a, uint64x1_t __b)
   return (uint64x1_t) __builtin_aarch64_uqsubdi (__a, __b);
 }
 
+/* vrecpe  */
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vrecpes_f32 (float32_t __a)
+{
+  return __builtin_aarch64_frecpesf (__a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vrecped_f64 (float64_t __a)
+{
+  return __builtin_aarch64_frecpedf (__a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrecpe_f32 (float32x2_t __a)
+{
+  return __builtin_aarch64_frecpev2sf (__a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrecpeq_f32 (float32x4_t __a)
+{
+  return __builtin_aarch64_frecpev4sf (__a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vrecpeq_f64 (float64x2_t __a)
+{
+  return __builtin_aarch64_frecpev2df (__a);
+}
+
+/* vrecps  */
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vrecpss_f32 (float32_t __a, float32_t __b)
+{
+  return __builtin_aarch64_frecpssf (__a, __b);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vrecpsd_f64 (float64_t __a, float64_t __b)
+{
+  return __builtin_aarch64_frecpsdf (__a, __b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrecps_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return __builtin_aarch64_frecpsv2sf (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrecpsq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_frecpsv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vrecpsq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_frecpsv2df (__a, __b);
+}
+
+/* vrecpx  */
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vrecpxs_f32 (float32_t __a)
+{
+  return __builtin_aarch64_frecpxsf (__a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vrecpxd_f64 (float64_t __a)
+{
+  return __builtin_aarch64_frecpxdf (__a);
+}
+
 /* vrshl */
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 863a4af..017e128 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -698,6 +698,8 @@ 
 (define_int_iterator FCVT [UNSPEC_FRINTZ UNSPEC_FRINTP UNSPEC_FRINTM
 			    UNSPEC_FRINTA])
 
+(define_int_iterator FRECP [UNSPEC_FRECPE UNSPEC_FRECPX])
+
 ;; -------------------------------------------------------------------
 ;; Int Iterators Attributes.
 ;; -------------------------------------------------------------------
@@ -803,3 +805,5 @@ 
 (define_int_attr perm_hilo [(UNSPEC_ZIP1 "1") (UNSPEC_ZIP2 "2")
 			    (UNSPEC_TRN1 "1") (UNSPEC_TRN2 "2")
 			    (UNSPEC_UZP1 "1") (UNSPEC_UZP2 "2")])
+
+(define_int_attr frecp_suffix  [(UNSPEC_FRECPE "e") (UNSPEC_FRECPX "x")])
diff --git a/gcc/testsuite/gcc.target/aarch64/vrecps.c b/gcc/testsuite/gcc.target/aarch64/vrecps.c
new file mode 100644
index 0000000..c279a44
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vrecps.c
@@ -0,0 +1,144 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <arm_neon.h>
+#include <math.h>
+#include <stdlib.h>
+
+int
+test_frecps_float32_t (void)
+{
+  int i;
+  float32_t value = 0.2;
+  float32_t reciprocal = 5.0;
+  float32_t step = vrecpes_f32 (value);
+  /* 3 steps should give us within ~0.001 accuracy.  */
+  for (i = 0; i < 3; i++)
+    step = step * vrecpss_f32 (step, value);
+
+  return fabs (step - reciprocal) < 0.001;
+}
+
+/* { dg-final { scan-assembler "frecpe\\ts\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "frecps\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" } } */
+
+int
+test_frecps_float32x2_t (void)
+{
+  int i;
+  int ret = 1;
+
+  const float32_t value_pool[] = {0.2, 0.4};
+  const float32_t reciprocal_pool[] = {5.0, 2.5};
+  float32x2_t value = vld1_f32 (value_pool);
+  float32x2_t reciprocal = vld1_f32 (reciprocal_pool);
+
+  float32x2_t step = vrecpe_f32 (value);
+  /* 3 steps should give us within ~0.001 accuracy.  */
+  for (i = 0; i < 3; i++)
+    step = step * vrecps_f32 (step, value);
+
+  ret &= fabs (vget_lane_f32 (step, 0)
+	       - vget_lane_f32 (reciprocal, 0)) < 0.001;
+  ret &= fabs (vget_lane_f32 (step, 1)
+	       - vget_lane_f32 (reciprocal, 1)) < 0.001;
+
+  return ret;
+}
+
+/* { dg-final { scan-assembler "frecpe\\tv\[0-9\]+.2s, v\[0-9\]+.2s" } } */
+/* { dg-final { scan-assembler "frecps\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" } } */
+
+int
+test_frecps_float32x4_t (void)
+{
+  int i;
+  int ret = 1;
+
+  const float32_t value_pool[] = {0.2, 0.4, 0.5, 0.8};
+  const float32_t reciprocal_pool[] = {5.0, 2.5, 2.0, 1.25};
+  float32x4_t value = vld1q_f32 (value_pool);
+  float32x4_t reciprocal = vld1q_f32 (reciprocal_pool);
+
+  float32x4_t step = vrecpeq_f32 (value);
+  /* 3 steps should give us within ~0.001 accuracy.  */
+  for (i = 0; i < 3; i++)
+    step = step * vrecpsq_f32 (step, value);
+
+  ret &= fabs (vgetq_lane_f32 (step, 0)
+	       - vgetq_lane_f32 (reciprocal, 0)) < 0.001;
+  ret &= fabs (vgetq_lane_f32 (step, 1)
+	       - vgetq_lane_f32 (reciprocal, 1)) < 0.001;
+  ret &= fabs (vgetq_lane_f32 (step, 2)
+	       - vgetq_lane_f32 (reciprocal, 2)) < 0.001;
+  ret &= fabs (vgetq_lane_f32 (step, 3)
+	       - vgetq_lane_f32 (reciprocal, 3)) < 0.001;
+
+  return ret;
+}
+
+/* { dg-final { scan-assembler "frecpe\\tv\[0-9\]+.4s, v\[0-9\]+.4s" } } */
+/* { dg-final { scan-assembler "frecps\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" } } */
+
+int
+test_frecps_float64_t (void)
+{
+  int i;
+  float64_t value = 0.2;
+  float64_t reciprocal = 5.0;
+  float64_t step = vrecped_f64 (value);
+  /* 3 steps should give us within ~0.001 accuracy.  */
+  for (i = 0; i < 3; i++)
+    step = step * vrecpsd_f64 (step, value);
+
+  return fabs (step - reciprocal) < 0.001;
+}
+
+/* { dg-final { scan-assembler "frecpe\\td\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "frecps\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" } } */
+
+int
+test_frecps_float64x2_t (void)
+{
+  int i;
+  int ret = 1;
+
+  const float64_t value_pool[] = {0.2, 0.4};
+  const float64_t reciprocal_pool[] = {5.0, 2.5};
+  float64x2_t value = vld1q_f64 (value_pool);
+  float64x2_t reciprocal = vld1q_f64 (reciprocal_pool);
+
+  float64x2_t step = vrecpeq_f64 (value);
+  /* 3 steps should give us within ~0.001 accuracy.  */
+  for (i = 0; i < 3; i++)
+    step = step * vrecpsq_f64 (step, value);
+
+  ret &= fabs (vgetq_lane_f64 (step, 0)
+	       - vgetq_lane_f64 (reciprocal, 0)) < 0.001;
+  ret &= fabs (vgetq_lane_f64 (step, 1)
+	       - vgetq_lane_f64 (reciprocal, 1)) < 0.001;
+
+  return ret;
+}
+
+/* { dg-final { scan-assembler "frecpe\\tv\[0-9\]+.2d, v\[0-9\]+.2d" } } */
+/* { dg-final { scan-assembler "frecps\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" } } */
+
+int
+main (int argc, char **argv)
+{
+  if (!test_frecps_float32_t ())
+    abort ();
+  if (!test_frecps_float32x2_t ())
+    abort ();
+  if (!test_frecps_float32x4_t ())
+    abort ();
+  if (!test_frecps_float64_t ())
+    abort ();
+  if (!test_frecps_float64x2_t ())
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vrecpx.c b/gcc/testsuite/gcc.target/aarch64/vrecpx.c
new file mode 100644
index 0000000..63097f1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vrecpx.c
@@ -0,0 +1,54 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <arm_neon.h>
+#include <math.h>
+#include <stdlib.h>
+
+float32_t in_f[] =
+{2.0, 4.0, 8.0, 16.0, 1.0, 0.5, 0.25, 0.125};
+float32_t rec_f[] =
+{1.0, 0.5, 0.25, 0.125, 2.0, 4.0, 8.0, 16.0};
+float64_t in_d[] =
+{2.0, 4.0, 8.0, 16.0, 1.0, 0.5, 0.25, 0.125};
+float32_t rec_d[] =
+{1.0, 0.5, 0.25, 0.125, 2.0, 4.0, 8.0, 16.0};
+
+int
+test_frecpx_float32_t (void)
+{
+  int i = 0;
+  int ret = 1;
+  for (i = 0; i < 8; i++)
+    ret &= fabs (vrecpxs_f32 (in_f[i]) - rec_f[i]) < 0.001;
+
+  return ret;
+}
+
+/* { dg-final { scan-assembler "frecpx\\ts\[0-9\]+, s\[0-9\]+" } } */
+
+int
+test_frecpx_float64_t (void)
+{
+  int i = 0;
+  int ret = 1;
+  for (i = 0; i < 8; i++)
+    ret &= fabs (vrecpxd_f64 (in_d[i]) - rec_d[i]) < 0.001;
+
+  return ret;
+}
+
+/* { dg-final { scan-assembler "frecpx\\td\[0-9\]+, d\[0-9\]+" } } */
+
+int
+main (int argc, char **argv)
+{
+  if (!test_frecpx_float32_t ())
+    abort ();
+  if (!test_frecpx_float64_t ())
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { cleanup-saved-temps } } */