2015-10-02 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
diff mbox

Message ID 185224-2521-2-git-send-email-benedikt.huber@theobroma-systems.com
State New
Headers show

Commit Message

Benedikt Huber Jan. 3, 1970, 3:27 a.m. UTC
* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
	rsqrtf.
	* config/aarch64/aarch64-protos.h: Declare.
	* config/aarch64/aarch64-simd.md: Matching expressions for
	frsqrte and frsqrts.
	* config/aarch64/aarch64-tuning-flags.def: Added
	MRECIP_DEFAULT_ENABLED.
	* config/aarch64/aarch64.c: New functions. Emit rsqrt
	estimation code in fast math mode.
	* config/aarch64/aarch64.md: Added enum entries.
	* config/aarch64/aarch64.opt: Added options -mrecip and
	-mlow-precision-recip-sqrt.
	* testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
	for frsqrte and frsqrts
	* testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.

Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
---
 gcc/ChangeLog                                      |  19 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 112 ++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   3 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 115 ++++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   4 +
 gcc/doc/invoke.texi                                |  12 +++
 .../gcc.target/aarch64/rsqrt-asm-check_1.c         |  65 ++++++++++++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 11 files changed, 467 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c

Comments

James Greenhalgh Oct. 9, 2015, 11:44 a.m. UTC | #1
On Sat, Jan 03, 1970 at 03:27:04AM +0000, Benedikt Huber wrote:
>         * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
>         rsqrtf.
>         * config/aarch64/aarch64-protos.h: Declare.
>         * config/aarch64/aarch64-simd.md: Matching expressions for
>         frsqrte and frsqrts.
>         * config/aarch64/aarch64-tuning-flags.def: Added
>         MRECIP_DEFAULT_ENABLED.
>         * config/aarch64/aarch64.c: New functions. Emit rsqrt
>         estimation code in fast math mode.
>         * config/aarch64/aarch64.md: Added enum entries.
>         * config/aarch64/aarch64.opt: Added options -mrecip and
>         -mlow-precision-recip-sqrt.
>         * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
>         for frsqrte and frsqrts
>         * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.


Hi,

Thanks for this latest revision, I have some structural/refactoring
comments, and I think I've spotted a bug. Otherwise this is getting
close to ready.

Some more comments in line.

(As an aside, I find this style of patch submission to be very
 difficult to follow, as it misses my mail filters and does not keep the
 in-reply-to header correctly across patch revisions).

>  2015-10-01  Lynn Boger  <laboger@linux.vnet.ibm.com>
> 
>         PR target/66870
> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
> index 80916a9..29cfbf5 100644
> --- a/gcc/config/aarch64/aarch64-builtins.c
> +++ b/gcc/config/aarch64/aarch64-builtins.c
> @@ -344,6 +344,11 @@ enum aarch64_builtins
>    AARCH64_BUILTIN_GET_FPSR,
>    AARCH64_BUILTIN_SET_FPSR,
> 
> +  AARCH64_BUILTIN_RSQRT_DF,
> +  AARCH64_BUILTIN_RSQRT_SF,
> +  AARCH64_BUILTIN_RSQRT_V2DF,
> +  AARCH64_BUILTIN_RSQRT_V2SF,
> +  AARCH64_BUILTIN_RSQRT_V4SF,
>    AARCH64_SIMD_BUILTIN_BASE,
>    AARCH64_SIMD_BUILTIN_LANE_CHECK,
>  #include "aarch64-simd-builtins.def"
> @@ -842,6 +847,46 @@ aarch64_init_crc32_builtins ()
>      }
>  }
> 
> +/* Add builtins for reciprocal square root.  */
> +
> +void
> +aarch64_add_builtin_rsqrt (void)
> +{
> +  tree fndecl = NULL;
> +  tree ftype = NULL;
> +
> +  tree V2SF_type_node = build_vector_type (float_type_node, 2);
> +  tree V2DF_type_node = build_vector_type (double_type_node, 2);
> +  tree V4SF_type_node = build_vector_type (float_type_node, 4);
> +
> +  ftype = build_function_type_list (double_type_node, double_type_node,
> +                                   NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_df",
> +    ftype, AARCH64_BUILTIN_RSQRT_DF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF] = fndecl;
> +
> +  ftype = build_function_type_list (float_type_node, float_type_node,
> +                                   NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_sf",
> +    ftype, AARCH64_BUILTIN_RSQRT_SF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF] = fndecl;
> +
> +  ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2df",
> +    ftype, AARCH64_BUILTIN_RSQRT_V2DF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF] = fndecl;
> +
> +  ftype = build_function_type_list (V2SF_type_node, V2SF_type_node, NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2sf",
> +    ftype, AARCH64_BUILTIN_RSQRT_V2SF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF] = fndecl;
> +
> +  ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v4sf",
> +    ftype, AARCH64_BUILTIN_RSQRT_V4SF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF] = fndecl;

Given that this is all so mechanical, I'd have a preference towards
refactoring this to loop over some structured data. Something like:

  {AARCH64_BUILTIN_RSQRT_SF, float_type_node, "__builtin_aarch64_rsqrt_sf"},
  {AARCH64_BUILTIN_RSQRT_DF, double_type_node, "__builtin_aarch64_rsqrt_df"},
  etc.

>  void
>  aarch64_init_builtins (void)
>  {
> @@ -873,6 +918,7 @@ aarch64_init_builtins (void)
>      aarch64_init_simd_builtins ();
> 
>    aarch64_init_crc32_builtins ();
> +  aarch64_add_builtin_rsqrt ();

Very minor nit, other functions use "init", you use "add".

>  }
> 
>  tree
> @@ -1136,6 +1182,41 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
>    return target;
>  }
> 
> +/* Function to expand reciprocal square root builtins.  */
> +
> +static rtx
> +aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
> +{
> +  rtx pat;
> +  tree arg0 = CALL_EXPR_ARG (exp, 0);
> +  rtx op0 = expand_normal (arg0);
> +
> +  enum insn_code c;
> +
> +  switch (fcode)
> +    {
> +      case AARCH64_BUILTIN_RSQRT_DF:
> +       c = CODE_FOR_rsqrt_df2; break;
> +      case AARCH64_BUILTIN_RSQRT_SF:
> +       c = CODE_FOR_rsqrt_sf2; break;
> +      case AARCH64_BUILTIN_RSQRT_V2DF:
> +       c = CODE_FOR_rsqrt_v2df2; break;
> +      case AARCH64_BUILTIN_RSQRT_V2SF:
> +       c = CODE_FOR_rsqrt_v2sf2; break;
> +      case AARCH64_BUILTIN_RSQRT_V4SF:
> +       c = CODE_FOR_rsqrt_v4sf2; break;
> +         default: gcc_unreachable ();
> +    }

Formatting looks off for the "default" case.

> +
> +  if (!target)
> +    target = gen_reg_rtx (GET_MODE (op0));
> +
> +  pat = GEN_FCN (c) (target, op0);
> +  emit_insn (pat);
> +
> +  return target;


Could we rewrite the above using function pointers and gen functions as
you do elsewhere in the patch:

  rtx (*gen) (rtx, rtx);
  switch (fcode)
    {
      case AARCH64_BUILTIN_RSQRT_DF:
	gen = gen_rsqrt_df2;
	break;
      case AARCH64_BUILTIN_RSQRT_SF:
	gen = gen_rsqrt_sf2;
	break;
       <...>
    }
  emit_insn (gen (target, op0));
  
>  /* Expand an expression EXP that calls a built-in function,
>     with result going to TARGET if that's convenient.  */
>  rtx
> @@ -1183,6 +1264,13 @@ aarch64_expand_builtin (tree exp,
>    else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
>      return aarch64_crc32_expand_builtin (fcode, exp, target);
> 
> +  if (fcode == AARCH64_BUILTIN_RSQRT_DF
> +      || fcode == AARCH64_BUILTIN_RSQRT_SF
> +      || fcode == AARCH64_BUILTIN_RSQRT_V2DF
> +      || fcode == AARCH64_BUILTIN_RSQRT_V2SF
> +      || fcode == AARCH64_BUILTIN_RSQRT_V4SF)
> +    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
> +
>    gcc_unreachable ();
>  }
> 
> @@ -1340,6 +1428,30 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
>    return NULL_TREE;
>  }
> 
> +/* Return builtin for reciprocal square root.  */
> +
> +tree
> +aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
> +{
> +  if (md_fn)
> +    {
> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
> +       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
> +       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
> +       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
> +    }
> +  else
> +    {
> +      if (fn == BUILT_IN_SQRT)
> +       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
> +      if (fn == BUILT_IN_SQRTF)
> +       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
> +    }
> +  return NULL_TREE;
> +}
> +
>  #undef VAR1
>  #define VAR1(T, N, MAP, A) \
>    case AARCH64_SIMD_BUILTIN_##T##_##N##A:
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index baaf1bd..455b1da 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -332,6 +332,8 @@ void aarch64_register_pragmas (void);
>  void aarch64_relayout_simd_types (void);
>  void aarch64_reset_previous_fndecl (void);
> 
> +void aarch64_emit_swrsqrt (rtx, rtx);
> +
>  /* Initialize builtins for SIMD intrinsics.  */
>  void init_aarch64_simd_builtins (void);
> 
> @@ -400,4 +402,5 @@ int aarch64_ccmp_mode_to_code (enum machine_mode mode);
>  bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset);
>  bool aarch64_operands_ok_for_ldpstp (rtx *, bool, enum machine_mode);
>  bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, enum machine_mode);
> +tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn);

It is a losing battle, but at some point this file was in alphabetical
order, first by type then by name. If we could keep to that, that would
be good.

>  #endif /* GCC_AARCH64_PROTOS_H */
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 541faf9..d48ad3b 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -354,6 +354,33 @@
>    [(set_attr "type" "neon_fp_mul_d_scalar_q")]
>  )
> 
> +(define_insn "rsqrte_<mode>2"

As this is not a standard pattern name, keep it namespaced as
aarch64_rsqrte<mode>2.

> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
> +       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
> +                    UNSPEC_RSQRTE))]
> +  "TARGET_SIMD"
> +  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
> +  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
> +
> +(define_insn "rsqrts_<mode>3"

Likewise.

> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
> +       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
> +              (match_operand:VALLF 2 "register_operand" "w")]
> +                    UNSPEC_RSQRTS))]
> +  "TARGET_SIMD"
> +  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
> +  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
> +
> +(define_expand "rsqrt_<mode>2"

Likewise.

> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
> +       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
> +                    UNSPEC_RSQRT))]
> +  "TARGET_SIMD"
> +{
> +  aarch64_emit_swrsqrt (operands[0], operands[1]);
> +  DONE;
> +})
> +
>  (define_insn "*aarch64_mul3_elt_to_64v2df"
>    [(set (match_operand:DF 0 "register_operand" "=w")
>       (mult:DF
> diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
> index 628386b..6f7dbce 100644
> --- a/gcc/config/aarch64/aarch64-tuning-flags.def
> +++ b/gcc/config/aarch64/aarch64-tuning-flags.def
> @@ -29,4 +29,5 @@
>       AARCH64_TUNE_ to give an enum name. */
> 
>  AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
> +AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
> 
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 034da7c..5ddfa5d 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -408,7 +408,8 @@ static const struct tune_params cortexa57_tunings =
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> -  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags.  */
> +  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
> +   | AARCH64_EXTRA_TUNE_RECIP_SQRT)    /* tune_flags.  */
>  };
> 
>  static const struct tune_params cortexa72_tunings =
> @@ -472,7 +473,7 @@ static const struct tune_params xgene1_tunings =
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> -  (AARCH64_EXTRA_TUNE_NONE)    /* tune_flags.  */
> +  (AARCH64_EXTRA_TUNE_RECIP_SQRT)      /* tune_flags.  */
>  };
> 
>  /* Support for fine-grained override of the tuning structures.  */
> @@ -7009,6 +7010,107 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
>    return aarch64_tune_params.memmov_cost;
>  }
> 
> +/* Function to decide when to use
> +   reciprocal square root builtins.  */
> +
> +static tree
> +aarch64_builtin_reciprocal (unsigned int fn,
> +                           bool md_fn,
> +                           bool)
> +{
> +  if (flag_trapping_math
> +      || !flag_unsafe_math_optimizations
> +      || optimize_size
> +      || (aarch64_tune_params.extra_tuning_flags
> +          & AARCH64_EXTRA_TUNE_RECIP_SQRT))

I've checked a number of times, but this condition still looks backwards
to me. As far as I can see, this says not to do the transform if

      (aarch64_tune_params.extra_tuning_flags
          & AARCH64_EXTRA_TUNE_RECIP_SQRT))

But it is Friday, so forgive me if I'm wrong.

> +  {
> +    return NULL_TREE;
> +  }
> +
> +  return aarch64_builtin_rsqrt (fn, md_fn);
> +}
> +
> +typedef rtx (*rsqrte_type) (rtx, rtx);
> +
> +/* Select reciprocal square root initial estimate
> +   insn depending on machine mode.  */
> +
> +rsqrte_type
> +get_rsqrte_type (enum machine_mode mode)
> +{
> +  switch (mode)
> +  {
> +    case DFmode:   return gen_rsqrte_df2;
> +    case SFmode:   return gen_rsqrte_sf2;
> +    case V2DFmode: return gen_rsqrte_v2df2;
> +    case V2SFmode: return gen_rsqrte_v2sf2;
> +    case V4SFmode: return gen_rsqrte_v4sf2;
> +    default: gcc_unreachable ();
> +  }
> +}
> +
> +typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
> +
> +/* Select reciprocal square root Newton-Raphson step
> +   insn depending on machine mode.  */
> +
> +rsqrts_type
> +get_rsqrts_type (enum machine_mode mode)
> +{
> +  switch (mode)
> +  {
> +    case DFmode:   return gen_rsqrts_df3;
> +    case SFmode:   return gen_rsqrts_sf3;
> +    case V2DFmode: return gen_rsqrts_v2df3;
> +    case V2SFmode: return gen_rsqrts_v2sf3;
> +    case V4SFmode: return gen_rsqrts_v4sf3;
> +    default: gcc_unreachable ();
> +  }
> +}
> +
> +/* Emit instruction sequence to compute
> +   reciprocal square root.  Use two Newton-Raphson steps
> +   for single precision and three for double precision.  */
> +
> +void
> +aarch64_emit_swrsqrt (rtx dst, rtx src)
> +{
> +  enum machine_mode mode = GET_MODE (src);
> +  gcc_assert (
> +    mode == SFmode || mode == V2SFmode || mode == V4SFmode ||
> +    mode == DFmode || mode == V2DFmode);

Split before the operator:

   mode == SFmode || mode == V2SFmode || mode == V4SFmode
   || mode == DFmode || mode == V2DFmode);

> +
> +  rtx xsrc = gen_reg_rtx (mode);
> +  emit_move_insn (xsrc, src);
> +  rtx x0 = gen_reg_rtx (mode);
> +
> +  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
> +
> +  bool double_mode = (mode == DFmode || mode == V2DFmode);
> +
> +  int iterations = 2;
> +  if (double_mode)
> +    iterations = 3;

Personal preference:

  int iterations = double_mode ? 3 : 2;

> +
> +  if (flag_mrecip_low_precision_sqrt)
> +    iterations--;
> +
> +  for (int i = 0; i < iterations; ++i)
> +    {
> +      rtx x1 = gen_reg_rtx (mode);
> +      rtx x2 = gen_reg_rtx (mode);
> +      rtx x3 = gen_reg_rtx (mode);
> +      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
> +
> +      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
> +
> +      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
> +      x0 = x1;
> +    }
> +
> +  emit_move_insn (dst, x0);
> +}
> +
>  /* Return the number of instructions that can be issued per cycle.  */
>  static int
>  aarch64_sched_issue_rate (void)
> @@ -13387,6 +13489,9 @@ aarch64_promoted_type (const_tree t)
>  #undef TARGET_BUILD_BUILTIN_VA_LIST
>  #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
> 
> +#undef TARGET_BUILTIN_DECL
> +#define TARGET_BUILTIN_DECL aarch64_builtin_decl
> +

Unrelated change?

>  #undef TARGET_CALLEE_COPIES
>  #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
> 
> @@ -13418,9 +13523,6 @@ aarch64_promoted_type (const_tree t)
>  #undef TARGET_CLASS_MAX_NREGS
>  #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
> 
> -#undef TARGET_BUILTIN_DECL
> -#define TARGET_BUILTIN_DECL aarch64_builtin_decl
> -
>  #undef  TARGET_EXPAND_BUILTIN
>  #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
> 
> @@ -13561,6 +13663,9 @@ aarch64_promoted_type (const_tree t)
>  #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
>  #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
> 
> +#undef TARGET_BUILTIN_RECIPROCAL
> +#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
> +

I think Marcus asked for this to be in alphabetical order in the v5
review.

>  #undef TARGET_VECTOR_MODE_SUPPORTED_P
>  #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
> 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c3cd58d..51c2b87 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -126,6 +126,9 @@
>      UNSPEC_VSTRUCTDUMMY
>      UNSPEC_SP_SET
>      UNSPEC_SP_TEST
> +    UNSPEC_RSQRT
> +    UNSPEC_RSQRTE
> +    UNSPEC_RSQRTS
>  ])
> 
>  (define_c_enum "unspecv" [
> diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
> index a1ce58d..00084ea 100644
> --- a/gcc/config/aarch64/aarch64.opt
> +++ b/gcc/config/aarch64/aarch64.opt
> @@ -148,3 +148,7 @@ Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
>  mpc-relative-literal-loads
>  Target Report Save Var(nopcrelative_literal_loads) Init(2) Save
>  PC relative literal loads.
> +
> +mlow-precision-recip-sqrt
> +Common Var(flag_mrecip_low_precision_sqrt) Optimization
> +Run fewer approximation steps to reduce latency and precision.

Don't make a definite claim about latency here.

  When calculating a sqrt approximation, run fewer steps.  This reduces
  precision, but can result in faster computation.

> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index c19be78..8b45837 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -519,6 +519,7 @@ Objective-C and Objective-C++ Dialects}.
>  -mtls-size=@var{size} @gol
>  -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
>  -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
> +-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
>  -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
> 
>  @emph{Adapteva Epiphany Options}
> @@ -12445,6 +12446,17 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
>  This erratum workaround is made at link time and this will only pass the
>  corresponding flag to the linker.
> 
> +@item -mlow-precision-recip-sqrt
> +@item -mno-low-precision-recip-sqrt
> +@opindex -mlow-precision-recip-sqrt
> +@opindex -mno-low-precision-recip-sqrt
> +The square root estimate uses two steps instead of three for double-precision,
> +and one step instead of two for single-precision.
> +Thus reducing latency and precision.
> +This is only relevant if @option{-ffast-math} activates
> +reciprocal square root estimate instructions.
> +Which in turn depends on the CPU core.

As above. To be consistent with the other documentation,
s/CPU core/target processor/

> +
>  @item -march=@var{name}
>  @opindex march
>  Specify the name of the target architecture, optionally suffixed by one or
> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
> new file mode 100644
> index 0000000..9f17990
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
> @@ -0,0 +1,65 @@
> +/* Test for the recip_sqrt tuning
> +   ensuring the correct instructions are generated.  */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno" } */

Presumably this testcase needs guarding or otherwise tweaked to make sure
it only runs for targets which want to use the estimate expansion?
Additionally, a test showing the opposite - that a target which does
not want the expansion doesn't get it - would be useful.

> +
> +#define sqrt_float   __builtin_sqrtf
> +#define sqrt_double  __builtin_sqrt
> +
> +#define TESTTYPE(TYPE) \
> +typedef struct { \
> +  TYPE a; \
> +  TYPE b; \
> +  TYPE c; \
> +  TYPE d; \
> +} s4_##TYPE; \
> +\
> +typedef struct { \
> +  TYPE a; \
> +  TYPE b; \
> +} s2_##TYPE; \
> +\
> +s4_##TYPE \
> +rsqrtv4_##TYPE (s4_##TYPE i) \
> +{ \
> +  s4_##TYPE o; \
> +  o.a = 1.0 / sqrt_##TYPE (i.a); \
> +  o.b = 1.0 / sqrt_##TYPE (i.b); \
> +  o.c = 1.0 / sqrt_##TYPE (i.c); \
> +  o.d = 1.0 / sqrt_##TYPE (i.d); \
> +  return o; \
> +} \
> +\
> +s2_##TYPE \
> +rsqrtv2_##TYPE (s2_##TYPE i) \
> +{ \
> +  s2_##TYPE o; \
> +  o.a = 1.0 / sqrt_##TYPE (i.a); \
> +  o.b = 1.0 / sqrt_##TYPE (i.b); \
> +  return o; \
> +} \
> +\
> +TYPE \
> +rsqrt_##TYPE (TYPE i) \
> +{ \
> +  return 1.0 / sqrt_##TYPE (i); \
> +} \
> +
> +TESTTYPE (double)
> +TESTTYPE (float)
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" 3 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */
> +
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
> new file mode 100644
> index 0000000..624f9b2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
> @@ -0,0 +1,111 @@
> +/* Test for the recip_sqrt tuning
> +   ensuring functionality and sufficient accuracy.  */
> +/* { dg-do run } */
> +/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno" } */

Likewise.

> +
> +#define PI    3.141592653589793
> +#define SQRT2 1.4142135623730951
> +
> +#define PI_4 0.7853981633974483
> +#define SQRT1_2 0.7071067811865475
> +
> +/* 2^25+1, float has 24 significand bits
> + *       according to Single-precision floating-point format.  */
> +#define TESTA8_FLT 33554433
> +/* 2^54+1, double has 53 significand bits
> + *       according to Double-precision floating-point format.  */
> +#define TESTA8_DBL 18014398509481985
> +
> +#define EPSILON_double __DBL_EPSILON__
> +#define EPSILON_float __FLT_EPSILON__
> +#define ABS_double __builtin_fabs
> +#define ABS_float __builtin_fabsf
> +#define SQRT_double __builtin_sqrt
> +#define SQRT_float __builtin_sqrtf
> +#define ISNAN_double __builtin_isnan
> +#define ISNAN_float __builtin_isnanf
> +
> +extern void abort (void);
> +
> +#define TESTTYPE(TYPE) \
> +TYPE \
> +rsqrt_##TYPE (TYPE a) \
> +{ \
> +  return 1.0/SQRT_##TYPE (a); \
> +} \
> +\
> +int \
> +equals_##TYPE (TYPE a, TYPE b) \
> +{ \
> +  return (a == b || \
> +   (ISNAN_##TYPE (a) && ISNAN_##TYPE (b)) || \
> +   (ABS_##TYPE (a - b) < EPSILON_##TYPE)); \
> +} \
> +\
> +void \
> +t_##TYPE (TYPE a, TYPE result) \
> +{ \
> +  TYPE r = rsqrt_##TYPE (a); \
> +  if (!equals_##TYPE (r, result)) \
> +  { \
> +    abort (); \
> +  } \
> +} \
> +
> +TESTTYPE (double)
> +TESTTYPE (float)
> +
> +int
> +main ()
> +{
> +  double nan = __builtin_nan ("");
> +  double inf = __builtin_inf ();
> +  float nanf = __builtin_nanf ("");
> +  float inff = __builtin_inff ();
> +
> +  t_double (1.0/256, 0X1.00000000000000P+4);
> +  t_double (1.0, 0X1.00000000000000P+0);
> +  t_double (-1.0, nan);
> +  t_double (11.0, 0X1.34BF63D1568260P-2);
> +  t_double (0.0,  inf);
> +  t_double (inf, 0X0.00000000000000P+0);
> +  t_double (nan, nan);
> +  t_double (-nan, -nan);
> +  t_double (__DBL_MAX__, 0X1.00000000000010P-512);
> +  t_double (__DBL_MIN__, 0X1.00000000000000P+511);
> +  t_double (PI, 0X1.20DD750429B6D0P-1);
> +  t_double (PI_4, 0X1.20DD750429B6D0P+0);
> +  t_double (SQRT2, 0X1.AE89F995AD3AE0P-1);
> +  t_double (SQRT1_2, 0X1.306FE0A31B7150P+0);
> +  t_double (-PI, nan);
> +  t_double (-SQRT2, nan);
> +  t_double (TESTA8_DBL, 0X1.00000000000000P-27);
> +
> +  t_float (1.0/256, 0X1.00000000000000P+4);
> +  t_float (1.0, 0X1.00000000000000P+0);
> +  t_float (-1.0, nanf);
> +  t_float (11.0, 0X1.34BF6400000000P-2);
> +  t_float (0.0,  inff);
> +  t_float (inff, 0X0.00000000000000P+0);
> +  t_float (nanf, nanf);
> +  t_float (-nanf, -nanf);
> +  t_float (__FLT_MAX__, 0X1.00000200000000P-64);
> +  t_float (__FLT_MIN__, 0X1.00000000000000P+63);
> +  t_float (PI, 0X1.20DD7400000000P-1);
> +  t_float (PI_4, 0X1.20DD7400000000P+0);
> +  t_float (SQRT2, 0X1.AE89FA00000000P-1);
> +  t_float (SQRT1_2, 0X1.306FE000000000P+0);
> +  t_float (-PI, nanf);
> +  t_float (-SQRT2, nanf);
> +  t_float (TESTA8_FLT, 0X1.6A09E600000000P-13);
> +
> +//   With -ffast-math these return positive INF.
> +//   t_double (-0.0, -inf);
> +//   t_float (-0.0, -inff);
> +
> +//   The reason here is that -ffast-math flushes to zero.
> +//   t_double  (__DBL_MIN__/256, 0X1.00000000000000P+515);
> +//   t_float (__FLT_MIN__/256, 0X1.00000000000000P+67);
> +
> +  return 0;
> +}


Thanks,
James
Benedikt Huber Oct. 15, 2015, 10:03 p.m. UTC | #2
This seventh revision of the patch:
 * Cleans up style issues.
 * Corrects a bug in a condition.
 * Improves the testing code.

Ok for check in.


Benedikt Huber (1):
  2015-10-15  Benedikt Huber  <benedikt.huber@theobroma-systems.com>    
    	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>

 gcc/ChangeLog                                      |  20 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 114 +++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   4 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 +++++
 gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
 gcc/config/aarch64/aarch64.c                       | 107 ++++++++++++++++++-
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   5 +
 gcc/doc/invoke.texi                                |  12 +++
 .../gcc.target/aarch64/rsqrt-asm-check-common.h    |  42 ++++++++
 .../aarch64/rsqrt-asm-check-negative_1.c           |  12 +++
 .../gcc.target/aarch64/rsqrt-asm-check_1.c         |  25 +++++
 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c         | 111 ++++++++++++++++++++
 13 files changed, 481 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-common.h
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check-negative_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c

Patch
diff mbox

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index c8200db..7226f29 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,22 @@ 
+2015-10-02  Benedikt Huber  <benedikt.huber@theobroma-systems.com>
+	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
+
+	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
+	rsqrtf.
+	* config/aarch64/aarch64-protos.h: Declare.
+	* config/aarch64/aarch64-simd.md: Matching expressions for
+	frsqrte and frsqrts.
+	* config/aarch64/aarch64-tuning-flags.def: Added
+	RECIP_SQRT.
+	* config/aarch64/aarch64.c: New functions.  Emit rsqrt
+	estimation code in fast math mode.
+	* config/aarch64/aarch64.md: Added enum entries.
+	* config/aarch64/aarch64.opt: Added options -mrecip and
+	-mlow-precision-recip-sqrt.
+	* testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
+	for frsqrte and frsqrts
+	* testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.
+
 2015-10-01  Lynn Boger  <laboger@linux.vnet.ibm.com>
 
 	PR target/66870
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 80916a9..29cfbf5 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -344,6 +344,11 @@  enum aarch64_builtins
   AARCH64_BUILTIN_GET_FPSR,
   AARCH64_BUILTIN_SET_FPSR,
 
+  AARCH64_BUILTIN_RSQRT_DF,
+  AARCH64_BUILTIN_RSQRT_SF,
+  AARCH64_BUILTIN_RSQRT_V2DF,
+  AARCH64_BUILTIN_RSQRT_V2SF,
+  AARCH64_BUILTIN_RSQRT_V4SF,
   AARCH64_SIMD_BUILTIN_BASE,
   AARCH64_SIMD_BUILTIN_LANE_CHECK,
 #include "aarch64-simd-builtins.def"
@@ -842,6 +847,46 @@  aarch64_init_crc32_builtins ()
     }
 }
 
+/* Add builtins for reciprocal square root.  */
+
+void
+aarch64_add_builtin_rsqrt (void)
+{
+  tree fndecl = NULL;
+  tree ftype = NULL;
+
+  tree V2SF_type_node = build_vector_type (float_type_node, 2);
+  tree V2DF_type_node = build_vector_type (double_type_node, 2);
+  tree V4SF_type_node = build_vector_type (float_type_node, 4);
+
+  ftype = build_function_type_list (double_type_node, double_type_node,
+				    NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_df",
+    ftype, AARCH64_BUILTIN_RSQRT_DF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF] = fndecl;
+
+  ftype = build_function_type_list (float_type_node, float_type_node,
+				    NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_sf",
+    ftype, AARCH64_BUILTIN_RSQRT_SF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF] = fndecl;
+
+  ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2df",
+    ftype, AARCH64_BUILTIN_RSQRT_V2DF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF] = fndecl;
+
+  ftype = build_function_type_list (V2SF_type_node, V2SF_type_node, NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2sf",
+    ftype, AARCH64_BUILTIN_RSQRT_V2SF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF] = fndecl;
+
+  ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v4sf",
+    ftype, AARCH64_BUILTIN_RSQRT_V4SF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF] = fndecl;
+}
+
 void
 aarch64_init_builtins (void)
 {
@@ -873,6 +918,7 @@  aarch64_init_builtins (void)
     aarch64_init_simd_builtins ();
 
   aarch64_init_crc32_builtins ();
+  aarch64_add_builtin_rsqrt ();
 }
 
 tree
@@ -1136,6 +1182,41 @@  aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
   return target;
 }
 
+/* Expand a reciprocal square root builtin call EXP with function code
+   FCODE, placing the result in TARGET (allocating a fresh register if
+   TARGET is NULL).  Returns the rtx holding the result.  */
+
+static rtx
+aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
+{
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  rtx op0 = expand_normal (arg0);
+
+  /* Map the builtin code to the rsqrt expander for its mode.  */
+  enum insn_code c;
+  switch (fcode)
+    {
+    case AARCH64_BUILTIN_RSQRT_DF:
+      c = CODE_FOR_rsqrt_df2;
+      break;
+    case AARCH64_BUILTIN_RSQRT_SF:
+      c = CODE_FOR_rsqrt_sf2;
+      break;
+    case AARCH64_BUILTIN_RSQRT_V2DF:
+      c = CODE_FOR_rsqrt_v2df2;
+      break;
+    case AARCH64_BUILTIN_RSQRT_V2SF:
+      c = CODE_FOR_rsqrt_v2sf2;
+      break;
+    case AARCH64_BUILTIN_RSQRT_V4SF:
+      c = CODE_FOR_rsqrt_v4sf2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  if (!target)
+    target = gen_reg_rtx (GET_MODE (op0));
+
+  rtx pat = GEN_FCN (c) (target, op0);
+  emit_insn (pat);
+
+  return target;
+}
+
 /* Expand an expression EXP that calls a built-in function,
    with result going to TARGET if that's convenient.  */
 rtx
@@ -1183,6 +1264,13 @@  aarch64_expand_builtin (tree exp,
   else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
     return aarch64_crc32_expand_builtin (fcode, exp, target);
 
+  if (fcode == AARCH64_BUILTIN_RSQRT_DF
+      || fcode == AARCH64_BUILTIN_RSQRT_SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2DF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V4SF)
+    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
+
   gcc_unreachable ();
 }
 
@@ -1340,6 +1428,30 @@  aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
   return NULL_TREE;
 }
 
+/* Return the builtin decl implementing a reciprocal square root for the
+   square root builtin FN, or NULL_TREE if there is none.  MD_FN is true
+   when FN is a machine-dependent (SIMD) builtin code rather than a
+   generic one.  */
+
+tree
+aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
+{
+  if (md_fn)
+    switch (fn)
+      {
+      case AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df:
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
+      case AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf:
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
+      case AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf:
+	return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
+      default:
+	return NULL_TREE;
+      }
+
+  switch (fn)
+    {
+    case BUILT_IN_SQRT:
+      return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
+    case BUILT_IN_SQRTF:
+      return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
+    default:
+      return NULL_TREE;
+    }
+}
+
 #undef VAR1
 #define VAR1(T, N, MAP, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index baaf1bd..455b1da 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -332,6 +332,8 @@  void aarch64_register_pragmas (void);
 void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 
+void aarch64_emit_swrsqrt (rtx, rtx);
+
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
 
@@ -400,4 +402,5 @@  int aarch64_ccmp_mode_to_code (enum machine_mode mode);
 bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset);
 bool aarch64_operands_ok_for_ldpstp (rtx *, bool, enum machine_mode);
 bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, enum machine_mode);
+tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn);
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 541faf9..d48ad3b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -354,6 +354,33 @@ 
   [(set_attr "type" "neon_fp_mul_d_scalar_q")]
 )
 
+;; Reciprocal square root initial estimate (FRSQRTE), for all scalar and
+;; vector float modes covered by VALLF.
+(define_insn "rsqrte_<mode>2"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRTE))]
+  "TARGET_SIMD"
+  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+
+;; Reciprocal square root Newton-Raphson refinement step (FRSQRTS).
+(define_insn "rsqrts_<mode>3"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+	       (match_operand:VALLF 2 "register_operand" "w")]
+		     UNSPEC_RSQRTS))]
+  "TARGET_SIMD"
+  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
+
+;; Expand 1/sqrt (operand 1) as a software estimate-and-refine sequence;
+;; the actual insns are emitted by aarch64_emit_swrsqrt in aarch64.c.
+(define_expand "rsqrt_<mode>2"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRT))]
+  "TARGET_SIMD"
+{
+  aarch64_emit_swrsqrt (operands[0], operands[1]);
+  DONE;
+})
+
 (define_insn "*aarch64_mul3_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
      (mult:DF
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 628386b..6f7dbce 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,4 +29,5 @@ 
      AARCH64_TUNE_ to give an enum name. */
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
+AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 034da7c..5ddfa5d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -408,7 +408,8 @@  static const struct tune_params cortexa57_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
-  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
+   | AARCH64_EXTRA_TUNE_RECIP_SQRT)	/* tune_flags.  */
 };
 
 static const struct tune_params cortexa72_tunings =
@@ -472,7 +473,7 @@  static const struct tune_params xgene1_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
-  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RECIP_SQRT)	/* tune_flags.  */
 };
 
 /* Support for fine-grained override of the tuning structures.  */
@@ -7009,6 +7010,107 @@  aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
   return aarch64_tune_params.memmov_cost;
 }
 
+/* Implement TARGET_BUILTIN_RECIPROCAL.  Decide when to use the
+   reciprocal square root builtins: only under -funsafe-math-optimizations
+   with trapping math disabled, not when optimizing for size, and only
+   when the active tuning opts in via AARCH64_EXTRA_TUNE_RECIP_SQRT.  */
+
+static tree
+aarch64_builtin_reciprocal (unsigned int fn,
+			    bool md_fn,
+			    bool)
+{
+  /* The tuning-flag test must be negated: the estimate is used only on
+     cores whose tunings SET AARCH64_EXTRA_TUNE_RECIP_SQRT; without the
+     '!' the logic is inverted for every core.  */
+  if (flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || optimize_size
+      || !(aarch64_tune_params.extra_tuning_flags
+	   & AARCH64_EXTRA_TUNE_RECIP_SQRT))
+    return NULL_TREE;
+
+  return aarch64_builtin_rsqrt (fn, md_fn);
+}
+
+typedef rtx (*rsqrte_type) (rtx, rtx);
+
+/* Return the gen function for the reciprocal square root initial
+   estimate (rsqrte) pattern matching machine mode MODE.  File-local
+   helper, hence static.  */
+
+static rsqrte_type
+get_rsqrte_type (machine_mode mode)
+{
+  switch (mode)
+    {
+    case DFmode:   return gen_rsqrte_df2;
+    case SFmode:   return gen_rsqrte_sf2;
+    case V2DFmode: return gen_rsqrte_v2df2;
+    case V2SFmode: return gen_rsqrte_v2sf2;
+    case V4SFmode: return gen_rsqrte_v4sf2;
+    default: gcc_unreachable ();
+    }
+}
+
+typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
+
+/* Return the gen function for the reciprocal square root Newton-Raphson
+   step (rsqrts) pattern matching machine mode MODE.  File-local helper,
+   hence static.  */
+
+static rsqrts_type
+get_rsqrts_type (machine_mode mode)
+{
+  switch (mode)
+    {
+    case DFmode:   return gen_rsqrts_df3;
+    case SFmode:   return gen_rsqrts_sf3;
+    case V2DFmode: return gen_rsqrts_v2df3;
+    case V2SFmode: return gen_rsqrts_v2sf3;
+    case V4SFmode: return gen_rsqrts_v4sf3;
+    default: gcc_unreachable ();
+    }
+}
+
+/* Emit the instruction sequence approximating the reciprocal square
+   root of SRC into DST: an initial rsqrte estimate refined by
+   Newton-Raphson steps using rsqrts.  Two steps are used for single
+   precision and three for double precision; -mlow-precision-recip-sqrt
+   drops one step, trading precision for latency.  */
+
+void
+aarch64_emit_swrsqrt (rtx dst, rtx src)
+{
+  machine_mode mode = GET_MODE (src);
+  gcc_assert (mode == SFmode || mode == V2SFmode || mode == V4SFmode
+	      || mode == DFmode || mode == V2DFmode);
+
+  /* Copy SRC into a fresh pseudo so every iteration reads a stable
+     value even if SRC is later clobbered.  */
+  rtx xsrc = gen_reg_rtx (mode);
+  emit_move_insn (xsrc, src);
+
+  /* x0 = initial estimate of 1/sqrt (xsrc).  */
+  rtx x0 = gen_reg_rtx (mode);
+  emit_insn (get_rsqrte_type (mode) (x0, xsrc));
+
+  /* Double precision needs one more refinement step than single.  */
+  bool double_mode = (mode == DFmode || mode == V2DFmode);
+  int iterations = double_mode ? 3 : 2;
+
+  if (flag_mrecip_low_precision_sqrt)
+    iterations--;
+
+  for (int i = 0; i < iterations; ++i)
+    {
+      /* One Newton-Raphson step: x1 = x0 * rsqrts (xsrc, x0 * x0).  */
+      rtx x1 = gen_reg_rtx (mode);
+      rtx x2 = gen_reg_rtx (mode);
+      rtx x3 = gen_reg_rtx (mode);
+      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+
+      emit_insn (get_rsqrts_type (mode) (x3, xsrc, x2));
+
+      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
+      x0 = x1;
+    }
+
+  emit_move_insn (dst, x0);
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
@@ -13387,6 +13489,9 @@  aarch64_promoted_type (const_tree t)
 #undef TARGET_BUILD_BUILTIN_VA_LIST
 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
 
+#undef TARGET_BUILTIN_DECL
+#define TARGET_BUILTIN_DECL aarch64_builtin_decl
+
 #undef TARGET_CALLEE_COPIES
 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
 
@@ -13418,9 +13523,6 @@  aarch64_promoted_type (const_tree t)
 #undef TARGET_CLASS_MAX_NREGS
 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
 
-#undef TARGET_BUILTIN_DECL
-#define TARGET_BUILTIN_DECL aarch64_builtin_decl
-
 #undef  TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
 
@@ -13561,6 +13663,9 @@  aarch64_promoted_type (const_tree t)
 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
 
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
 #undef TARGET_VECTOR_MODE_SUPPORTED_P
 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c3cd58d..51c2b87 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -126,6 +126,9 @@ 
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
     UNSPEC_SP_TEST
+    UNSPEC_RSQRT
+    UNSPEC_RSQRTE
+    UNSPEC_RSQRTS
 ])
 
 (define_c_enum "unspecv" [
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index a1ce58d..00084ea 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -148,3 +148,7 @@  Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
 mpc-relative-literal-loads
 Target Report Save Var(nopcrelative_literal_loads) Init(2) Save
 PC relative literal loads.
+
+mlow-precision-recip-sqrt
+Common Var(flag_mrecip_low_precision_sqrt) Optimization
+Run fewer approximation steps, reducing latency at the cost of precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index c19be78..8b45837 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -519,6 +519,7 @@  Objective-C and Objective-C++ Dialects}.
 -mtls-size=@var{size} @gol
 -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
+-mlow-precision-recip-sqrt  -mno-low-precision-recip-sqrt @gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12445,6 +12446,17 @@  Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
 This erratum workaround is made at link time and this will only pass the
 corresponding flag to the linker.
 
+@item -mlow-precision-recip-sqrt
+@itemx -mno-low-precision-recip-sqrt
+@opindex mlow-precision-recip-sqrt
+@opindex mno-low-precision-recip-sqrt
+The reciprocal square root estimate uses two steps instead of three for
+double-precision, and one step instead of two for single-precision,
+reducing latency at the cost of precision.
+This option is only relevant when @option{-ffast-math} activates the
+reciprocal square root estimate instructions, which in turn depends on
+the target CPU.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture, optionally suffixed by one or
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
new file mode 100644
index 0000000..9f17990
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
@@ -0,0 +1,65 @@ 
+/* Test for the recip_sqrt tuning
+   ensuring the correct instructions are generated.  */
+/* { dg-do compile } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno" } */
+
+#define sqrt_float   __builtin_sqrtf
+#define sqrt_double  __builtin_sqrt
+
+/* For each TYPE, define:
+   - rsqrtv4_TYPE: four independent 1/sqrt lanes through a homogeneous
+     4-field struct, so the vectorizer can form 4- (or 2x 2-) lane ops;
+   - rsqrtv2_TYPE: two lanes through a 2-field struct;
+   - rsqrt_TYPE:  scalar 1/sqrt.
+   The expected per-mode instruction counts are pinned by the
+   scan-assembler-times directives below.  */
+#define TESTTYPE(TYPE) \
+typedef struct { \
+  TYPE a; \
+  TYPE b; \
+  TYPE c; \
+  TYPE d; \
+} s4_##TYPE; \
+\
+typedef struct { \
+  TYPE a; \
+  TYPE b; \
+} s2_##TYPE; \
+\
+s4_##TYPE \
+rsqrtv4_##TYPE (s4_##TYPE i) \
+{ \
+  s4_##TYPE o; \
+  o.a = 1.0 / sqrt_##TYPE (i.a); \
+  o.b = 1.0 / sqrt_##TYPE (i.b); \
+  o.c = 1.0 / sqrt_##TYPE (i.c); \
+  o.d = 1.0 / sqrt_##TYPE (i.d); \
+  return o; \
+} \
+\
+s2_##TYPE \
+rsqrtv2_##TYPE (s2_##TYPE i) \
+{ \
+  s2_##TYPE o; \
+  o.a = 1.0 / sqrt_##TYPE (i.a); \
+  o.b = 1.0 / sqrt_##TYPE (i.b); \
+  return o; \
+} \
+\
+TYPE \
+rsqrt_##TYPE (TYPE i) \
+{ \
+  return 1.0 / sqrt_##TYPE (i); \
+} \
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" 3 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */
+
+
+/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
new file mode 100644
index 0000000..624f9b2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt_1.c
@@ -0,0 +1,111 @@ 
+/* Test for the recip_sqrt tuning
+   ensuring functionality and sufficient accuracy.  */
+/* { dg-do run } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno" } */
+
+#define PI    3.141592653589793
+#define SQRT2 1.4142135623730951
+
+#define PI_4 0.7853981633974483
+#define SQRT1_2 0.7071067811865475
+
+/* 2^25+1, float has 24 significand bits
+ *       according to Single-precision floating-point format.  */
+#define TESTA8_FLT 33554433
+/* 2^54+1, double has 53 significand bits
+ *       according to Double-precision floating-point format.  */
+#define TESTA8_DBL 18014398509481985
+
+#define EPSILON_double __DBL_EPSILON__
+#define EPSILON_float __FLT_EPSILON__
+#define ABS_double __builtin_fabs
+#define ABS_float __builtin_fabsf
+#define SQRT_double __builtin_sqrt
+#define SQRT_float __builtin_sqrtf
+#define ISNAN_double __builtin_isnan
+#define ISNAN_float __builtin_isnanf
+
+extern void abort (void);
+
+/* For TYPE in {double, float}:
+   - rsqrt_TYPE computes 1.0/sqrt (a), intended to be compiled to the
+     rsqrt estimate sequence under -funsafe-math-optimizations;
+   - equals_TYPE compares results, treating any two NaNs as equal and
+     allowing an absolute EPSILON_TYPE difference;
+   - t_TYPE aborts if the computed value does not match RESULT.  */
+#define TESTTYPE(TYPE) \
+TYPE \
+rsqrt_##TYPE (TYPE a) \
+{ \
+  return 1.0/SQRT_##TYPE (a); \
+} \
+\
+int \
+equals_##TYPE (TYPE a, TYPE b) \
+{ \
+  return (a == b || \
+   (ISNAN_##TYPE (a) && ISNAN_##TYPE (b)) || \
+   (ABS_##TYPE (a - b) < EPSILON_##TYPE)); \
+} \
+\
+void \
+t_##TYPE (TYPE a, TYPE result) \
+{ \
+  TYPE r = rsqrt_##TYPE (a); \
+  if (!equals_##TYPE (r, result)) \
+  { \
+    abort (); \
+  } \
+} \
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+int
+main ()
+{
+  double nan = __builtin_nan ("");
+  double inf = __builtin_inf ();
+  float nanf = __builtin_nanf ("");
+  float inff = __builtin_inff ();
+
+  /* Each expected value is a hex-float literal of the reference result;
+     equals_* accepts it exactly, via matching NaNs, or within an
+     absolute EPSILON tolerance.  */
+  t_double (1.0/256, 0X1.00000000000000P+4);
+  t_double (1.0, 0X1.00000000000000P+0);
+  t_double (-1.0, nan);
+  t_double (11.0, 0X1.34BF63D1568260P-2);
+  t_double (0.0,  inf);
+  t_double (inf, 0X0.00000000000000P+0);
+  t_double (nan, nan);
+  t_double (-nan, -nan);
+  t_double (__DBL_MAX__, 0X1.00000000000010P-512);
+  t_double (__DBL_MIN__, 0X1.00000000000000P+511);
+  t_double (PI, 0X1.20DD750429B6D0P-1);
+  t_double (PI_4, 0X1.20DD750429B6D0P+0);
+  t_double (SQRT2, 0X1.AE89F995AD3AE0P-1);
+  t_double (SQRT1_2, 0X1.306FE0A31B7150P+0);
+  t_double (-PI, nan);
+  t_double (-SQRT2, nan);
+  t_double (TESTA8_DBL, 0X1.00000000000000P-27);
+
+  t_float (1.0/256, 0X1.00000000000000P+4);
+  t_float (1.0, 0X1.00000000000000P+0);
+  t_float (-1.0, nanf);
+  t_float (11.0, 0X1.34BF6400000000P-2);
+  t_float (0.0,  inff);
+  t_float (inff, 0X0.00000000000000P+0);
+  t_float (nanf, nanf);
+  t_float (-nanf, -nanf);
+  t_float (__FLT_MAX__, 0X1.00000200000000P-64);
+  t_float (__FLT_MIN__, 0X1.00000000000000P+63);
+  t_float (PI, 0X1.20DD7400000000P-1);
+  t_float (PI_4, 0X1.20DD7400000000P+0);
+  t_float (SQRT2, 0X1.AE89FA00000000P-1);
+  t_float (SQRT1_2, 0X1.306FE000000000P+0);
+  t_float (-PI, nanf);
+  t_float (-SQRT2, nanf);
+  t_float (TESTA8_FLT, 0X1.6A09E600000000P-13);
+
+//   With -ffast-math these return positive INF.
+//   t_double (-0.0, -inf);
+//   t_float (-0.0, -inff);
+
+//   The reason here is that -ffast-math flushes to zero.
+//   t_double  (__DBL_MIN__/256, 0X1.00000000000000P+515);
+//   t_float (__FLT_MIN__/256, 0X1.00000000000000P+67);
+
+  return 0;
+}