diff mbox

2015-10-15 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com>

Message ID 1444946596-1824-2-git-send-email-benedikt.huber@theobroma-systems.com
State New
Headers show

Commit Message

Benedikt Huber Oct. 15, 2015, 10:03 p.m. UTC
* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
	* config/aarch64/aarch64-protos.h: Declare.
	* config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
	frsqrts.
	* config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
	* config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
	applicable.
	* config/aarch64/aarch64.md: Added enum entries.
	* config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
	* testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h: Common macros for
	assembly checks.
	* testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c: Make sure
	frsqrts and frsqrte are not emitted.
	* testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c: Make sure frsqrts and
	frsqrte are emitted.
	* testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.

Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
---
 gcc/ChangeLog                                      |  20 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 114 +++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   4 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 107 ++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   5 +
 gcc/doc/invoke.texi                                |  12 +++
 .../gcc.target/aarch64/rsqrt-asm-check-common.h    |  42 ++++++++
 .../aarch64/rsqrt-asm-check-negative_1.c           |  12 +++
 .../gcc.target/aarch64/rsqrt-asm-check_1.c         |  25 +++++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 13 files changed, 481 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c

Comments

Benedikt Huber Jan. 1, 1970, 2:20 a.m. UTC | #1
This ninth revision of the patch:
 * Removes unnecessary typedef.

Ok for check in.


Benedikt Huber (1):
  2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>    
    	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>

 gcc/ChangeLog                                      |  20 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 115 +++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   4 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 107 ++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   5 +
 gcc/doc/invoke.texi                                |  12 +++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 .../gcc.target/aarch64/rsqrt_asm_check_1.c         |  25 +++++
 .../gcc.target/aarch64/rsqrt_asm_check_common.h    |  42 ++++++++
 .../aarch64/rsqrt_asm_check_negative_1.c           |  12 +++
 13 files changed, 482 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c
Benedikt Huber Jan. 4, 1970, 12:02 a.m. UTC | #2
This tenth revision of the patch:
 * Removes unnecessary enum.

Ok for check in.


Benedikt Huber (1):
  2015-10-19  Benedikt Huber  <benedikt.huber@theobroma-systems.com>    
    	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>

 gcc/ChangeLog                                      |  20 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 115 +++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   4 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 107 ++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   5 +
 gcc/doc/invoke.texi                                |  12 +++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 .../gcc.target/aarch64/rsqrt_asm_check_1.c         |  25 +++++
 .../gcc.target/aarch64/rsqrt_asm_check_common.h    |  42 ++++++++
 .../aarch64/rsqrt_asm_check_negative_1.c           |  12 +++
 13 files changed, 482 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c
Marcus Shawcroft Oct. 16, 2015, 9:29 a.m. UTC | #3
Hi,

 A few more style nits:

> +  builtin_decls_data bdda[] = {

New line before  {

> +    {double_type_node, "__builtin_aarch64_rsqrt_df", AARCH64_BUILTIN_RSQRT_DF},

Space after {
Space  before }

> +void aarch64_emit_swrsqrt (rtx, rtx);
> +

> +tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn);
> +

Drop the formal argument names as you did in the first declaration.

See my previous comment w.r.t the naming of new test cases in
gcc.target/aarch64, at least the following still need s/-/_/

> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h
> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c
> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c

> +//   With -ffast-math these return positive INF.
> +//   t_double (-0.0, -inf);
> +//   t_float (-0.0, -inff);
> +
> +//   The reason here is that -ffast-math flushes to zero.
> +//   t_double  (__DBL_MIN__/256, 0X1.00000000000000P+515);
> +//   t_float (__FLT_MIN__/256, 0X1.00000000000000P+67);

Comment consistently with the rest of the backend ie /* */

Thanks
/Marcus
Oleg Endo Oct. 16, 2015, 12:37 p.m. UTC | #4
On Thu, 2015-10-15 at 22:03 +0000, Benedikt Huber wrote:
>  
> +/* Add builtins for reciprocal square root.  */
> +
> +void
> +aarch64_init_builtin_rsqrt (void)
> +{
> +  tree fndecl = NULL;
> +  tree ftype = NULL;
> +
> +  tree V2SF_type_node = build_vector_type (float_type_node, 2);
> +  tree V2DF_type_node = build_vector_type (double_type_node, 2);
> +  tree V4SF_type_node = build_vector_type (float_type_node, 4);
> +
> +  typedef struct
> +  {
> +    tree type_node;
> +    const char *builtin_name;
> +    int function_code;
> +  } builtin_decls_data;

There is an ongoing effort to remove all the unnecessary typedef struct
and enum etc stuff.  Please try not to add more of it.

Cheers,
Oleg
Benedikt Huber Oct. 16, 2015, 1:59 p.m. UTC | #5
This eighth revision of the patch:
 * Style improvements.

Ok for check in.


Benedikt Huber (1):
  2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>    
    	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>

 gcc/ChangeLog                                      |  20 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 115 +++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   4 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 107 ++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   5 +
 gcc/doc/invoke.texi                                |  12 +++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 .../gcc.target/aarch64/rsqrt_asm_check_1.c         |  25 +++++
 .../gcc.target/aarch64/rsqrt_asm_check_common.h    |  42 ++++++++
 .../aarch64/rsqrt_asm_check_negative_1.c           |  12 +++
 13 files changed, 482 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c
Benedikt Huber Oct. 16, 2015, 2:31 p.m. UTC | #6
I introduced this in revision 7 due to a request from James Greenhalgh.
https://gcc.gnu.org/ml/gcc-patches/2015-10/msg00963.html

> Given that this is all so mechanical, I'd have a preference towards
> refactoring this to loop over some structured data.

Do you mean, that I should get rid of the typedef and leave the struct without it?
Or should I completely drop the struct?

> On 16 Oct 2015, at 14:37, Oleg Endo <oleg.endo@t-online.de> wrote:
> 
> On Thu, 2015-10-15 at 22:03 +0000, Benedikt Huber wrote:
>> 
>> +/* Add builtins for reciprocal square root.  */
>> +
>> +void
>> +aarch64_init_builtin_rsqrt (void)
>> +{
>> +  tree fndecl = NULL;
>> +  tree ftype = NULL;
>> +
>> +  tree V2SF_type_node = build_vector_type (float_type_node, 2);
>> +  tree V2DF_type_node = build_vector_type (double_type_node, 2);
>> +  tree V4SF_type_node = build_vector_type (float_type_node, 4);
>> +
>> +  typedef struct
>> +  {
>> +    tree type_node;
>> +    const char *builtin_name;
>> +    int function_code;
>> +  } builtin_decls_data;
> 
> There is an ongoing effort to remove all the unnecessary typedef struct
> and enum etc stuff.  Please try not to add more of it.
> 
> Cheers,
> Oleg
>
Marcus Shawcroft Oct. 16, 2015, 2:47 p.m. UTC | #7
On 16 October 2015 at 15:31, Benedikt Huber
<benedikt.huber@theobroma-systems.com> wrote:
> I introduced this in revision 7 due to a request from James Greenhalgh.
> https://gcc.gnu.org/ml/gcc-patches/2015-10/msg00963.html
>
>> Given that this is all so mechanical, I'd have a preference towards
>> refactoring this to loop over some structured data.
>
> Do you mean, that I should get rid of the typedef and leave the struct without it?
> Or should I completely drop the struct?

The use of the struct is fine, we are being discouraged from using
unnecessary typedefs.  Just rewrite it as:

 struct builtin_decls_data
   {
   ...
   };

The references to the typedef'd name don't need to be modified.
Cheers
/Marcus
Oleg Endo Oct. 17, 2015, 12:59 a.m. UTC | #8
On Fri, 2015-10-16 at 15:47 +0100, Marcus Shawcroft wrote:
> On 16 October 2015 at 15:31, Benedikt Huber
> <benedikt.huber@theobroma-systems.com> wrote:
> > I introduced this in revision 7 due to a request from James Greenhalgh.
> > https://gcc.gnu.org/ml/gcc-patches/2015-10/msg00963.html
> >
> >> Given that this is all so mechanical, I'd have a preference towards
> >> refactoring this to loop over some structured data.
> >
> > Do you mean, that I should get rid of the typedef and leave the struct without it?
> > Or should I completely drop the struct?
> 
> The use of the struct is fine, we are being discouraged from using
> unnecessary typedefs.  Just rewrite it as:
> 
>  struct builtin_decls_data
>    {
>    ...
>    };
> 

Yes, that's what I meant.  Also things like 

> +void
> +aarch64_emit_swrsqrt (rtx dst, rtx src)
> +{
> +  enum machine_mode mode = GET_MODE (src);
> +  gcc_assert (
> +    mode == SFmode || mode == V2SFmode || mode == V4SFmode
> +       || mode == DFmode || mode == V2DFmode);

Instead of "enum machine_mode" use "machine_mode".  Similarly, for 
"struct bleh*" use "bleh*".  Although it seems there are no occurrences
of the latter in your patch.

Cheers,
Oleg
Bernd Schmidt Oct. 19, 2015, 2:29 p.m. UTC | #9
On 01/04/1970 01:02 AM, Benedikt Huber wrote:
> This tenth revision of the patch:
>   * Removes unnecessary enum.

Please fix your clock.


Bernd
diff mbox

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index a865043..be096c6 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,23 @@ 
+2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>
+	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
+
+	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
+	* config/aarch64/aarch64-protos.h: Declare.
+	* config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
+	frsqrts.
+	* config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
+	* config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
+	applicable.
+	* config/aarch64/aarch64.md: Added enum entries.
+	* config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
+	* testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h: Common macros for
+	assembly checks.
+	* testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c: Make sure
+	frsqrts and frsqrte are not emitted.
+	* testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c: Make sure frsqrts and
+	frsqrte are emitted.
+	* testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.
+
 2015-10-11  Jan Hubicka  <hubicka@ucw.cz>
 
 	revert:
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 716ed6e..e5e62ac 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -344,6 +344,11 @@  enum aarch64_builtins
   AARCH64_BUILTIN_GET_FPSR,
   AARCH64_BUILTIN_SET_FPSR,
 
+  AARCH64_BUILTIN_RSQRT_DF,
+  AARCH64_BUILTIN_RSQRT_SF,
+  AARCH64_BUILTIN_RSQRT_V2DF,
+  AARCH64_BUILTIN_RSQRT_V2SF,
+  AARCH64_BUILTIN_RSQRT_V4SF,
   AARCH64_SIMD_BUILTIN_BASE,
   AARCH64_SIMD_BUILTIN_LANE_CHECK,
 #include "aarch64-simd-builtins.def"
@@ -842,6 +847,45 @@  aarch64_init_crc32_builtins ()
     }
 }
 
+/* Add builtins for reciprocal square root.  */
+
+void
+aarch64_init_builtin_rsqrt (void)
+{
+  tree fndecl = NULL;
+  tree ftype = NULL;
+
+  tree V2SF_type_node = build_vector_type (float_type_node, 2);
+  tree V2DF_type_node = build_vector_type (double_type_node, 2);
+  tree V4SF_type_node = build_vector_type (float_type_node, 4);
+
+  typedef struct
+  {
+    tree type_node;
+    const char *builtin_name;
+    int function_code;
+  } builtin_decls_data;
+
+  builtin_decls_data bdda[] = {
+    {double_type_node, "__builtin_aarch64_rsqrt_df", AARCH64_BUILTIN_RSQRT_DF},
+    {float_type_node, "__builtin_aarch64_rsqrt_sf", AARCH64_BUILTIN_RSQRT_SF},
+    {V2DF_type_node, "__builtin_aarch64_rsqrt_v2df", AARCH64_BUILTIN_RSQRT_V2DF},
+    {V2SF_type_node, "__builtin_aarch64_rsqrt_v2sf", AARCH64_BUILTIN_RSQRT_V2SF},
+    {V4SF_type_node, "__builtin_aarch64_rsqrt_v4sf", AARCH64_BUILTIN_RSQRT_V4SF}
+  };
+
+  builtin_decls_data *bdd = bdda;
+  builtin_decls_data *bdd_end = bdd + (sizeof (bdda) / sizeof (builtin_decls_data));
+
+  for (; bdd < bdd_end; bdd++)
+  {
+    ftype = build_function_type_list (bdd->type_node, bdd->type_node, NULL_TREE);
+    fndecl = add_builtin_function (bdd->builtin_name,
+      ftype, bdd->function_code, BUILT_IN_MD, NULL, NULL_TREE);
+    aarch64_builtin_decls[bdd->function_code] = fndecl;
+  }
+}
+
 void
 aarch64_init_builtins (void)
 {
@@ -873,6 +917,7 @@  aarch64_init_builtins (void)
     aarch64_init_simd_builtins ();
 
   aarch64_init_crc32_builtins ();
+  aarch64_init_builtin_rsqrt ();
 }
 
 tree
@@ -1136,6 +1181,44 @@  aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
   return target;
 }
 
+/* Function to expand reciprocal square root builtins.  */
+
+static rtx
+aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
+{
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  rtx op0 = expand_normal (arg0);
+
+  rtx (*gen) (rtx, rtx);
+
+  switch (fcode)
+    {
+      case AARCH64_BUILTIN_RSQRT_DF:
+	gen = gen_aarch64_rsqrt_df2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_SF:
+	gen = gen_aarch64_rsqrt_sf2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V2DF:
+	gen = gen_aarch64_rsqrt_v2df2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V2SF:
+	gen = gen_aarch64_rsqrt_v2sf2;
+	break;
+      case AARCH64_BUILTIN_RSQRT_V4SF:
+	gen = gen_aarch64_rsqrt_v4sf2;
+	break;
+      default: gcc_unreachable ();
+    }
+
+  if (!target)
+    target = gen_reg_rtx (GET_MODE (op0));
+
+  emit_insn (gen (target, op0));
+
+  return target;
+}
+
 /* Expand an expression EXP that calls a built-in function,
    with result going to TARGET if that's convenient.  */
 rtx
@@ -1183,6 +1266,13 @@  aarch64_expand_builtin (tree exp,
   else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
     return aarch64_crc32_expand_builtin (fcode, exp, target);
 
+  if (fcode == AARCH64_BUILTIN_RSQRT_DF
+      || fcode == AARCH64_BUILTIN_RSQRT_SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2DF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V4SF)
+    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
+
   gcc_unreachable ();
 }
 
@@ -1340,6 +1430,30 @@  aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
   return NULL_TREE;
 }
 
+/* Return builtin for reciprocal square root.  */
+
+tree
+aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
+{
+  if (md_fn)
+    {
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
+    }
+  else
+    {
+      if (fn == BUILT_IN_SQRT)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
+      if (fn == BUILT_IN_SQRTF)
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
+    }
+  return NULL_TREE;
+}
+
 #undef VAR1
 #define VAR1(T, N, MAP, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index baaf1bd..0420248 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -332,6 +332,8 @@  void aarch64_register_pragmas (void);
 void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 
+void aarch64_emit_swrsqrt (rtx, rtx);
+
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
 
@@ -383,6 +385,8 @@  rtx aarch64_expand_builtin (tree exp,
 			    int ignore ATTRIBUTE_UNUSED);
 tree aarch64_builtin_decl (unsigned, bool ATTRIBUTE_UNUSED);
 
+tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn);
+
 tree
 aarch64_builtin_vectorized_function (tree fndecl,
 				     tree type_out,
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 167277e..8c359cb 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -354,6 +354,33 @@ 
   [(set_attr "type" "neon_fp_mul_d_scalar_q")]
 )
 
+(define_insn "aarch64_rsqrte_<mode>2"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRTE))]
+  "TARGET_SIMD"
+  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+
+(define_insn "aarch64_rsqrts_<mode>3"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+	       (match_operand:VALLF 2 "register_operand" "w")]
+		     UNSPEC_RSQRTS))]
+  "TARGET_SIMD"
+  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
+
+(define_expand "aarch64_rsqrt_<mode>2"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRT))]
+  "TARGET_SIMD"
+{
+  aarch64_emit_swrsqrt (operands[0], operands[1]);
+  DONE;
+})
+
 (define_insn "*aarch64_mul3_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
      (mult:DF
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 628386b..6f7dbce 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,4 +29,5 @@ 
      AARCH64_TUNE_ to give an enum name. */
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
+AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5130e37..387d744 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -408,7 +408,8 @@  static const struct tune_params cortexa57_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
-  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
+   | AARCH64_EXTRA_TUNE_RECIP_SQRT)	/* tune_flags.  */
 };
 
 static const struct tune_params cortexa72_tunings =
@@ -472,7 +473,7 @@  static const struct tune_params xgene1_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
-  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RECIP_SQRT)	/* tune_flags.  */
 };
 
 /* Support for fine-grained override of the tuning structures.  */
@@ -7005,6 +7006,105 @@  aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
   return aarch64_tune_params.memmov_cost;
 }
 
+/* Function to decide when to use
+   reciprocal square root builtins.  */
+
+static tree
+aarch64_builtin_reciprocal (unsigned int fn,
+			    bool md_fn,
+			    bool)
+{
+  if (flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || optimize_size
+      || ! (aarch64_tune_params.extra_tuning_flags
+	   & AARCH64_EXTRA_TUNE_RECIP_SQRT))
+  {
+    return NULL_TREE;
+  }
+
+  return aarch64_builtin_rsqrt (fn, md_fn);
+}
+
+typedef rtx (*rsqrte_type) (rtx, rtx);
+
+/* Select reciprocal square root initial estimate
+   insn depending on machine mode.  */
+
+rsqrte_type
+get_rsqrte_type (enum machine_mode mode)
+{
+  switch (mode)
+  {
+    case DFmode:   return gen_aarch64_rsqrte_df2;
+    case SFmode:   return gen_aarch64_rsqrte_sf2;
+    case V2DFmode: return gen_aarch64_rsqrte_v2df2;
+    case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
+    case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
+    default: gcc_unreachable ();
+  }
+}
+
+typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
+
+/* Select reciprocal square root Newton-Raphson step
+   insn depending on machine mode.  */
+
+rsqrts_type
+get_rsqrts_type (enum machine_mode mode)
+{
+  switch (mode)
+  {
+    case DFmode:   return gen_aarch64_rsqrts_df3;
+    case SFmode:   return gen_aarch64_rsqrts_sf3;
+    case V2DFmode: return gen_aarch64_rsqrts_v2df3;
+    case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
+    case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
+    default: gcc_unreachable ();
+  }
+}
+
+/* Emit instruction sequence to compute
+   reciprocal square root.  Use two Newton-Raphson steps
+   for single precision and three for double precision.  */
+
+void
+aarch64_emit_swrsqrt (rtx dst, rtx src)
+{
+  enum machine_mode mode = GET_MODE (src);
+  gcc_assert (
+    mode == SFmode || mode == V2SFmode || mode == V4SFmode
+	|| mode == DFmode || mode == V2DFmode);
+
+  rtx xsrc = gen_reg_rtx (mode);
+  emit_move_insn (xsrc, src);
+  rtx x0 = gen_reg_rtx (mode);
+
+  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+
+  bool double_mode = (mode == DFmode || mode == V2DFmode);
+
+  int iterations = double_mode ? 3 : 2;
+
+  if (flag_mrecip_low_precision_sqrt)
+    iterations--;
+
+  for (int i = 0; i < iterations; ++i)
+    {
+      rtx x1 = gen_reg_rtx (mode);
+      rtx x2 = gen_reg_rtx (mode);
+      rtx x3 = gen_reg_rtx (mode);
+      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+
+      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+
+      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
+      x0 = x1;
+    }
+
+  emit_move_insn (dst, x0);
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
@@ -13343,6 +13443,9 @@  aarch64_promoted_type (const_tree t)
 #undef TARGET_BUILTIN_DECL
 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
 
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
 #undef  TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 208f58f..48421d8 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -126,6 +126,9 @@ 
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
     UNSPEC_SP_TEST
+    UNSPEC_RSQRT
+    UNSPEC_RSQRTE
+    UNSPEC_RSQRTS
 ])
 
 (define_c_enum "unspecv" [
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index a1ce58d..e5691be 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -148,3 +148,8 @@  Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
 mpc-relative-literal-loads
 Target Report Save Var(nopcrelative_literal_loads) Init(2) Save
 PC relative literal loads.
+
+mlow-precision-recip-sqrt
+Common Var(flag_mrecip_low_precision_sqrt) Optimization
+When calculating a sqrt approximation, run fewer steps.
+This reduces precision, but can result in faster computation.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 9e028fc..c883b87 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -520,6 +520,7 @@  Objective-C and Objective-C++ Dialects}.
 -mtls-size=@var{size} @gol
 -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
+-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12478,6 +12479,17 @@  Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
 This erratum workaround is made at link time and this will only pass the
 corresponding flag to the linker.
 
+@item -mlow-precision-recip-sqrt
+@item -mno-low-precision-recip-sqrt
+@opindex -mlow-precision-recip-sqrt
+@opindex -mno-low-precision-recip-sqrt
+The square root estimate uses two steps instead of three for double-precision,
+and one step instead of two for single-precision.
+Thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} activates
+reciprocal square root estimate instructions.
+Which in turn depends on the target processor.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture, optionally suffixed by one or
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h
new file mode 100644
index 0000000..8a851e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h
@@ -0,0 +1,42 @@ 
+#define sqrt_float   __builtin_sqrtf
+#define sqrt_double  __builtin_sqrt
+
+#define TESTTYPE(TYPE) \
+typedef struct { \
+  TYPE a; \
+  TYPE b; \
+  TYPE c; \
+  TYPE d; \
+} s4_##TYPE; \
+\
+typedef struct { \
+  TYPE a; \
+  TYPE b; \
+} s2_##TYPE; \
+\
+s4_##TYPE \
+rsqrtv4_##TYPE (s4_##TYPE i) \
+{ \
+  s4_##TYPE o; \
+  o.a = 1.0 / sqrt_##TYPE (i.a); \
+  o.b = 1.0 / sqrt_##TYPE (i.b); \
+  o.c = 1.0 / sqrt_##TYPE (i.c); \
+  o.d = 1.0 / sqrt_##TYPE (i.d); \
+  return o; \
+} \
+\
+s2_##TYPE \
+rsqrtv2_##TYPE (s2_##TYPE i) \
+{ \
+  s2_##TYPE o; \
+  o.a = 1.0 / sqrt_##TYPE (i.a); \
+  o.b = 1.0 / sqrt_##TYPE (i.b); \
+  return o; \
+} \
+\
+TYPE \
+rsqrt_##TYPE (TYPE i) \
+{ \
+  return 1.0 / sqrt_##TYPE (i); \
+} \
+
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c
new file mode 100644
index 0000000..58fe7f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c
@@ -0,0 +1,12 @@ 
+/* Test for the recip_sqrt tuning
+   ensuring the correct instructions are generated.  */
+/* { dg-do compile } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic" } */
+
+#include "rsqrt-asm-check-common.h"
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+/* { dg-final { scan-assembler-times "frsqrte" 0 } } */
+/* { dg-final { scan-assembler-times "frsqrts" 0 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
new file mode 100644
index 0000000..72bf233
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
@@ -0,0 +1,25 @@ 
+/* Test for the recip_sqrt tuning
+   ensuring the correct instructions are generated.  */
+/* { dg-do compile } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic -moverride=tune=recip_sqrt" } */
+
+#include "rsqrt-asm-check-common.h"
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" 3 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */
+
+
+/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
new file mode 100644
index 0000000..15d495d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
@@ -0,0 +1,111 @@ 
+/* Test for the recip_sqrt tuning
+   ensuring functionality and sufficient accuracy.  */
+/* { dg-do run } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic -moverride=tune=recip_sqrt" } */
+
+#define PI    3.141592653589793
+#define SQRT2 1.4142135623730951
+
+#define PI_4 0.7853981633974483
+#define SQRT1_2 0.7071067811865475
+
+/* 2^25+1, float has 24 significand bits
+ *       according to Single-precision floating-point format.  */
+#define TESTA8_FLT 33554433
+/* 2^54+1, double has 53 significand bits
+ *       according to Double-precision floating-point format.  */
+#define TESTA8_DBL 18014398509481985
+
+#define EPSILON_double __DBL_EPSILON__
+#define EPSILON_float __FLT_EPSILON__
+#define ABS_double __builtin_fabs
+#define ABS_float __builtin_fabsf
+#define SQRT_double __builtin_sqrt
+#define SQRT_float __builtin_sqrtf
+#define ISNAN_double __builtin_isnan
+#define ISNAN_float __builtin_isnanf
+
+extern void abort (void);
+
+#define TESTTYPE(TYPE) \
+TYPE \
+rsqrt_##TYPE (TYPE a) \
+{ \
+  return 1.0/SQRT_##TYPE (a); \
+} \
+\
+int \
+equals_##TYPE (TYPE a, TYPE b) \
+{ \
+  return (a == b || \
+   (ISNAN_##TYPE (a) && ISNAN_##TYPE (b)) || \
+   (ABS_##TYPE (a - b) < EPSILON_##TYPE)); \
+} \
+\
+void \
+t_##TYPE (TYPE a, TYPE result) \
+{ \
+  TYPE r = rsqrt_##TYPE (a); \
+  if (!equals_##TYPE (r, result)) \
+  { \
+    abort (); \
+  } \
+} \
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+int
+main ()
+{
+  double nan = __builtin_nan ("");
+  double inf = __builtin_inf ();
+  float nanf = __builtin_nanf ("");
+  float inff = __builtin_inff ();
+
+  t_double (1.0/256, 0X1.00000000000000P+4);
+  t_double (1.0, 0X1.00000000000000P+0);
+  t_double (-1.0, nan);
+  t_double (11.0, 0X1.34BF63D1568260P-2);
+  t_double (0.0,  inf);
+  t_double (inf, 0X0.00000000000000P+0);
+  t_double (nan, nan);
+  t_double (-nan, -nan);
+  t_double (__DBL_MAX__, 0X1.00000000000010P-512);
+  t_double (__DBL_MIN__, 0X1.00000000000000P+511);
+  t_double (PI, 0X1.20DD750429B6D0P-1);
+  t_double (PI_4, 0X1.20DD750429B6D0P+0);
+  t_double (SQRT2, 0X1.AE89F995AD3AE0P-1);
+  t_double (SQRT1_2, 0X1.306FE0A31B7150P+0);
+  t_double (-PI, nan);
+  t_double (-SQRT2, nan);
+  t_double (TESTA8_DBL, 0X1.00000000000000P-27);
+
+  t_float (1.0/256, 0X1.00000000000000P+4);
+  t_float (1.0, 0X1.00000000000000P+0);
+  t_float (-1.0, nanf);
+  t_float (11.0, 0X1.34BF6400000000P-2);
+  t_float (0.0,  inff);
+  t_float (inff, 0X0.00000000000000P+0);
+  t_float (nanf, nanf);
+  t_float (-nanf, -nanf);
+  t_float (__FLT_MAX__, 0X1.00000200000000P-64);
+  t_float (__FLT_MIN__, 0X1.00000000000000P+63);
+  t_float (PI, 0X1.20DD7400000000P-1);
+  t_float (PI_4, 0X1.20DD7400000000P+0);
+  t_float (SQRT2, 0X1.AE89FA00000000P-1);
+  t_float (SQRT1_2, 0X1.306FE000000000P+0);
+  t_float (-PI, nanf);
+  t_float (-SQRT2, nanf);
+  t_float (TESTA8_FLT, 0X1.6A09E600000000P-13);
+
+//   With -ffast-math these return positive INF.
+//   t_double (-0.0, -inf);
+//   t_float (-0.0, -inff);
+
+//   The reason here is that -ffast-math flushes to zero.
+//   t_double  (__DBL_MIN__/256, 0X1.00000000000000P+515);
+//   t_float (__FLT_MIN__/256, 0X1.00000000000000P+67);
+
+  return 0;
+}