diff mbox

2015-07-14 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com>

Message ID 1437147821-2653-2-git-send-email-benedikt.huber@theobroma-systems.com
State New
Headers show

Commit Message

Benedikt Huber July 17, 2015, 3:43 p.m. UTC
* config/aarch64/aarch64-builtins.c: Builtins
	for rsqrt and rsqrtf.
	* config/aarch64/aarch64-protos.h: Declare.
	* config/aarch64/aarch64-simd.md: Matching expressions
	for frsqrte and frsqrts.
	* config/aarch64/aarch64.c: New functions. Emit rsqrt
	estimation code in fast math mode.
	* config/aarch64/aarch64.md: Added enum entries.
	* config/aarch64/aarch64.opt: Added options -mrecip and
	-mlow-precision-recip-sqrt.
	* testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
	for frsqrte and frsqrts
	* testsuite/gcc.target/aarch64/rsqrt.c: Functional tests
	for rsqrt.

Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
---
 gcc/ChangeLog                                      |  17 ++++
 gcc/config/aarch64/aarch64-builtins.c              | 103 ++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h                |   2 +
 gcc/config/aarch64/aarch64-simd.md                 |  27 ++++++
 gcc/config/aarch64/aarch64.c                       |  58 +++++++++++
 gcc/config/aarch64/aarch64.md                      |   3 +
 gcc/config/aarch64/aarch64.opt                     |   8 ++
 gcc/doc/invoke.texi                                |  20 ++++
 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c |  63 ++++++++++++
 gcc/testsuite/gcc.target/aarch64/rsqrt.c           | 107 +++++++++++++++++++++
 10 files changed, 408 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c

Comments

Andrew Pinski July 18, 2015, 8:25 a.m. UTC | #1
On Fri, Jul 17, 2015 at 8:43 AM, Benedikt Huber
<benedikt.huber@theobroma-systems.com> wrote:
>         * config/aarch64/aarch64-builtins.c: Builtins
>         for rsqrt and rsqrtf.
>         * config/aarch64/aarch64-protos.h: Declare.
>         * config/aarch64/aarch64-simd.md: Matching expressions
>         for frsqrte and frsqrts.
>         * config/aarch64/aarch64.c: New functions. Emit rsqrt
>         estimation code in fast math mode.
>         * config/aarch64/aarch64.md: Added enum entries.
>         * config/aarch64/aarch64.opt: Added options -mrecip and
>         -mlow-precision-recip-sqrt.
>         * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
>         for frsqrte and frsqrts
>         * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests
>         for rsqrt.

As I mentioned before,  Can we have -mrecip defaulted to on since it
regresses ThunderX performance by at least 10%.

Thanks,
Andrew

>
> Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
> ---
>  gcc/ChangeLog                                      |  17 ++++
>  gcc/config/aarch64/aarch64-builtins.c              | 103 ++++++++++++++++++++
>  gcc/config/aarch64/aarch64-protos.h                |   2 +
>  gcc/config/aarch64/aarch64-simd.md                 |  27 ++++++
>  gcc/config/aarch64/aarch64.c                       |  58 +++++++++++
>  gcc/config/aarch64/aarch64.md                      |   3 +
>  gcc/config/aarch64/aarch64.opt                     |   8 ++
>  gcc/doc/invoke.texi                                |  20 ++++
>  gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c |  63 ++++++++++++
>  gcc/testsuite/gcc.target/aarch64/rsqrt.c           | 107 +++++++++++++++++++++
>  10 files changed, 408 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 3432adb..f4b7407 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,20 @@
> +2015-07-14  Benedikt Huber  <benedikt.huber@theobroma-systems.com>
> +           Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
> +
> +       * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
> +       rsqrtf.
> +       * config/aarch64/aarch64-protos.h: Declare.
> +       * config/aarch64/aarch64-simd.md: Matching expressions for
> +       frsqrte and frsqrts.
> +       * config/aarch64/aarch64.c: New functions. Emit rsqrt
> +       estimation code in fast math mode.
> +       * config/aarch64/aarch64.md: Added enum entries.
> +       * config/aarch64/aarch64.opt: Added options -mrecip and
> +       -mlow-precision-recip-sqrt.
> +       * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
> +       for frsqrte and frsqrts
> +       * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.
> +
>  2015-07-08  Jiong Wang  <jiong.wang@arm.com>
>
>         * config/aarch64/aarch64.c (aarch64_unspec_may_trap_p): New function.
> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
> index b6c89b9..adcea07 100644
> --- a/gcc/config/aarch64/aarch64-builtins.c
> +++ b/gcc/config/aarch64/aarch64-builtins.c
> @@ -335,6 +335,11 @@ enum aarch64_builtins
>    AARCH64_BUILTIN_GET_FPSR,
>    AARCH64_BUILTIN_SET_FPSR,
>
> +  AARCH64_BUILTIN_RSQRT_DF,
> +  AARCH64_BUILTIN_RSQRT_SF,
> +  AARCH64_BUILTIN_RSQRT_V2DF,
> +  AARCH64_BUILTIN_RSQRT_V2SF,
> +  AARCH64_BUILTIN_RSQRT_V4SF,
>    AARCH64_SIMD_BUILTIN_BASE,
>    AARCH64_SIMD_BUILTIN_LANE_CHECK,
>  #include "aarch64-simd-builtins.def"
> @@ -824,6 +829,42 @@ aarch64_init_crc32_builtins ()
>  }
>
>  void
> +aarch64_add_builtin_rsqrt (void)
> +{
> +  tree fndecl = NULL;
> +  tree ftype = NULL;
> +
> +  tree V2SF_type_node = build_vector_type (float_type_node, 2);
> +  tree V2DF_type_node = build_vector_type (double_type_node, 2);
> +  tree V4SF_type_node = build_vector_type (float_type_node, 4);
> +
> +  ftype = build_function_type_list (double_type_node, double_type_node, NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_df",
> +    ftype, AARCH64_BUILTIN_RSQRT_DF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF] = fndecl;
> +
> +  ftype = build_function_type_list (float_type_node, float_type_node, NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_sf",
> +    ftype, AARCH64_BUILTIN_RSQRT_SF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF] = fndecl;
> +
> +  ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2df",
> +    ftype, AARCH64_BUILTIN_RSQRT_V2DF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF] = fndecl;
> +
> +  ftype = build_function_type_list (V2SF_type_node, V2SF_type_node, NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2sf",
> +    ftype, AARCH64_BUILTIN_RSQRT_V2SF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF] = fndecl;
> +
> +  ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, NULL_TREE);
> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v4sf",
> +    ftype, AARCH64_BUILTIN_RSQRT_V4SF, BUILT_IN_MD, NULL, NULL_TREE);
> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF] = fndecl;
> +}
> +
> +void
>  aarch64_init_builtins (void)
>  {
>    tree ftype_set_fpr
> @@ -848,6 +889,7 @@ aarch64_init_builtins (void)
>      aarch64_init_simd_builtins ();
>    if (TARGET_CRC32)
>      aarch64_init_crc32_builtins ();
> +  aarch64_add_builtin_rsqrt ();
>  }
>
>  tree
> @@ -1092,6 +1134,38 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
>    return target;
>  }
>
> +static rtx
> +aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
> +{
> +  rtx pat;
> +  tree arg0 = CALL_EXPR_ARG (exp, 0);
> +  rtx op0 = expand_normal (arg0);
> +
> +  enum insn_code c = CODE_FOR_rsqrt_df;
> +
> +  switch (fcode)
> +    {
> +      case AARCH64_BUILTIN_RSQRT_DF:
> +        c = CODE_FOR_rsqrt_df; break;
> +      case AARCH64_BUILTIN_RSQRT_SF:
> +        c = CODE_FOR_rsqrt_sf; break;
> +      case AARCH64_BUILTIN_RSQRT_V2DF:
> +        c = CODE_FOR_rsqrt_v2df; break;
> +      case AARCH64_BUILTIN_RSQRT_V2SF:
> +        c = CODE_FOR_rsqrt_v2sf; break;
> +      case AARCH64_BUILTIN_RSQRT_V4SF:
> +        c = CODE_FOR_rsqrt_v4sf; break;
> +    }
> +
> +  if (!target)
> +    target = gen_reg_rtx (GET_MODE (op0));
> +
> +  pat = GEN_FCN (c) (target, op0);
> +  emit_insn (pat);
> +
> +  return target;
> +}
> +
>  /* Expand an expression EXP that calls a built-in function,
>     with result going to TARGET if that's convenient.  */
>  rtx
> @@ -1139,6 +1213,13 @@ aarch64_expand_builtin (tree exp,
>    else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
>      return aarch64_crc32_expand_builtin (fcode, exp, target);
>
> +  if (fcode == AARCH64_BUILTIN_RSQRT_DF   ||
> +      fcode == AARCH64_BUILTIN_RSQRT_SF   ||
> +      fcode == AARCH64_BUILTIN_RSQRT_V2DF ||
> +      fcode == AARCH64_BUILTIN_RSQRT_V2SF ||
> +      fcode == AARCH64_BUILTIN_RSQRT_V4SF)
> +    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
> +
>    gcc_unreachable ();
>  }
>
> @@ -1296,6 +1377,28 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
>    return NULL_TREE;
>  }
>
> +tree
> +aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
> +{
> +  if (md_fn)
> +    {
> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
> +    }
> +  else
> +    {
> +      if (fn == BUILT_IN_SQRT)
> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
> +      if (fn == BUILT_IN_SQRTF)
> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
> +    }
> +  return NULL_TREE;
> +}
> +
>  #undef VAR1
>  #define VAR1(T, N, MAP, A) \
>    case AARCH64_SIMD_BUILTIN_##T##_##N##A:
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 4062c27..8b8a389 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -321,6 +321,8 @@ void aarch64_print_operand (FILE *, rtx, char);
>  void aarch64_print_operand_address (FILE *, rtx);
>  void aarch64_emit_call_insn (rtx);
>
> +void aarch64_emit_swrsqrt (rtx, rtx);
> +
>  /* Initialize builtins for SIMD intrinsics.  */
>  void init_aarch64_simd_builtins (void);
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index b90f938..ac8e12d 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -353,6 +353,33 @@
>    [(set_attr "type" "neon_fp_mul_d_scalar_q")]
>  )
>
> +(define_insn "*rsqrte_<mode>"
> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
> +       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
> +                    UNSPEC_RSQRTE))]
> +  "TARGET_SIMD"
> +  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
> +  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
> +
> +(define_insn "*rsqrts_<mode>"
> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
> +       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
> +               (match_operand:VALLF 2 "register_operand" "w")]
> +                    UNSPEC_RSQRTS))]
> +  "TARGET_SIMD"
> +  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
> +  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
> +
> +(define_expand "rsqrt_<mode>"
> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
> +       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
> +                    UNSPEC_RSQRT))]
> +  "TARGET_SIMD"
> +{
> +  aarch64_emit_swrsqrt (operands[0], operands[1]);
> +  DONE;
> +})
> +
>  (define_insn "*aarch64_mul3_elt_to_64v2df"
>    [(set (match_operand:DF 0 "register_operand" "=w")
>       (mult:DF
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 6c13a078..2e53103 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -6961,6 +6961,61 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
>    return aarch64_tune_params.memmov_cost;
>  }
>
> +extern tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn);
> +
> +static tree
> +aarch64_builtin_reciprocal (unsigned int fn,
> +                            bool md_fn,
> +                            bool)
> +{
> +  if (!flag_mrecip        || !flag_finite_math_only ||
> +       flag_trapping_math || !flag_unsafe_math_optimizations)
> +  {
> +    return NULL_TREE;
> +  }
> +
> +  return aarch64_builtin_rsqrt (fn, md_fn);
> +}
> +
> +void
> +aarch64_emit_swrsqrt (rtx dst, rtx src)
> +{
> +  enum machine_mode mode = GET_MODE (src);
> +  gcc_assert (
> +    mode == SFmode || mode == V2SFmode || mode == V4SFmode ||
> +    mode == DFmode || mode == V2DFmode);
> +
> +  rtx xsrc = gen_reg_rtx (mode);
> +  emit_move_insn (xsrc, src);
> +  rtx x0 = gen_reg_rtx (mode);
> +  emit_set_insn (x0,
> +    gen_rtx_UNSPEC (mode, gen_rtvec (1, xsrc), UNSPEC_RSQRTE));
> +
> +  bool double_mode = (mode == DFmode || mode == V2DFmode);
> +
> +  int iterations = 2;
> +  if (double_mode)
> +    iterations = 3;
> +
> +  if (flag_mrecip_low_precision_sqrt)
> +    iterations--;
> +
> +  for (int i = 0; i < iterations; ++i)
> +    {
> +      rtx x1 = gen_reg_rtx (mode);
> +      rtx x2 = gen_reg_rtx (mode);
> +      rtx x3 = gen_reg_rtx (mode);
> +      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
> +      emit_set_insn (x3, gen_rtx_UNSPEC (
> +        mode, gen_rtvec (2, xsrc, x2), UNSPEC_RSQRTS));
> +      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
> +      x0 = x1;
> +    }
> +
> +  emit_move_insn (dst, x0);
> +  return;
> +}
> +
>  /* Return the number of instructions that can be issued per cycle.  */
>  static int
>  aarch64_sched_issue_rate (void)
> @@ -12099,6 +12154,9 @@ aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
>  #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
>  #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
>
> +#undef TARGET_BUILTIN_RECIPROCAL
> +#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
> +
>  #undef TARGET_VECTOR_MODE_SUPPORTED_P
>  #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 1e343fa..d7944b2 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -122,6 +122,9 @@
>      UNSPEC_VSTRUCTDUMMY
>      UNSPEC_SP_SET
>      UNSPEC_SP_TEST
> +    UNSPEC_RSQRT
> +    UNSPEC_RSQRTE
> +    UNSPEC_RSQRTS
>  ])
>
>  (define_c_enum "unspecv" [
> diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
> index 98ef9f6..303efa4 100644
> --- a/gcc/config/aarch64/aarch64.opt
> +++ b/gcc/config/aarch64/aarch64.opt
> @@ -124,3 +124,11 @@ Enum(aarch64_abi) String(ilp32) Value(AARCH64_ABI_ILP32)
>
>  EnumValue
>  Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
> +
> +mrecip
> +Common Report Var(flag_mrecip) Optimization Init(1)
> +Generate software reciprocal square root for better throughput. Default with fast-math.
> +
> +mlow-precision-recip-sqrt
> +Common Var(flag_mrecip_low_precision_sqrt) Optimization
> +Run fewer approximation steps to reduce latency and precision.
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index b28e5d6..a0ab23d 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -515,6 +515,8 @@ Objective-C and Objective-C++ Dialects}.
>  -mtls-dialect=desc  -mtls-dialect=traditional @gol
>  -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
>  -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
> +-mrecip -no-mrecip @gol
> +-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
>  -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
>
>  @emph{Adapteva Epiphany Options}
> @@ -12426,6 +12428,24 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
>  This erratum workaround is made at link time and this will only pass the
>  corresponding flag to the linker.
>
> +@item -mrecip
> +@item -mno-recip
> +@opindex mrecip
> +@opindex mno-recip
> +This option enables use of the
> +reciprocal square root estimate instructions with additional
> +Newton-Raphson steps to increase precision instead of doing a square root and
> +divide for floating-point arguments.
> +This is the default setting with @option{-ffast-math}.
> +It can only be used together with @option{-ffast-math}.
> +
> +@item -mlow-precision-recip-sqrt
> +@item -mno-low-precision-recip-sqrt
> +@opindex -mlow-precision-recip-sqrt
> +@opindex -mno-low-precision-recip-sqrt
> +The square root estimate uses two steps instead of three for double-precision,
> +and one step instead of two for single-precision. Thus reducing latency and precision.
> +
>  @item -march=@var{name}
>  @opindex march
>  Specify the name of the target architecture, optionally suffixed by one or
> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
> new file mode 100644
> index 0000000..cd81d9e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
> @@ -0,0 +1,63 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 --save-temps -fverbose-asm -ffast-math" } */
> +
> +#include <math.h>
> +
> +#define sqrt_float   __builtin_sqrtf
> +#define sqrt_double  __builtin_sqrt
> +
> +#define TESTTYPE(TYPE)                                          \
> +typedef struct {                                                \
> +  TYPE a;                                                       \
> +  TYPE b;                                                       \
> +  TYPE c;                                                       \
> +  TYPE d;                                                       \
> +} s4_##TYPE;                                                    \
> +                                                                \
> +typedef struct {                                                \
> +  TYPE a;                                                       \
> +  TYPE b;                                                       \
> +} s2_##TYPE;                                                    \
> +                                                                \
> +s4_##TYPE rsqrtv4_##TYPE (s4_##TYPE i)                          \
> +{                                                               \
> +  s4_##TYPE o;                                                  \
> +  o.a = 1.0 / sqrt_##TYPE (i.a);                                \
> +  o.b = 1.0 / sqrt_##TYPE (i.b);                                \
> +  o.c = 1.0 / sqrt_##TYPE (i.c);                                \
> +  o.d = 1.0 / sqrt_##TYPE (i.d);                                \
> +  return o;                                                     \
> +}                                                               \
> +                                                                \
> +s2_##TYPE rsqrtv2_##TYPE (s2_##TYPE i)                          \
> +{                                                               \
> +  s2_##TYPE o;                                                  \
> +  o.a = 1.0 / sqrt_##TYPE (i.a);                                \
> +  o.b = 1.0 / sqrt_##TYPE (i.b);                                \
> +  return o;                                                     \
> +}                                                               \
> +                                                                \
> +TYPE rsqrt_##TYPE (TYPE i)                                      \
> +{                                                               \
> +  return 1.0 / sqrt_##TYPE (i);                                 \
> +}                                                               \
> +                                                                \
> +
> +TESTTYPE(double)
> +TESTTYPE(float)
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" 3 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */
> +
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */
> +
> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" 1 } } */
> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt.c b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
> new file mode 100644
> index 0000000..8e2ae10
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
> @@ -0,0 +1,107 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 --save-temps -fverbose-asm -ffast-math" } */
> +
> +#include <math.h>
> +#include <stdio.h>
> +
> +#include <float.h>
> +
> +#define PI    3.141592653589793
> +#define SQRT2 1.4142135623730951
> +
> +#define PI_4 0.7853981633974483
> +#define SQRT1_2 0.7071067811865475
> +
> +/* 2^25+1, float has 24 significand bits
> + *       according to Single-precision floating-point format.  */
> +#define TESTA8_FLT 33554433
> +/* 2^54+1, double has 53 significand bits
> + *       according to Double-precision floating-point format.  */
> +#define TESTA8_DBL 18014398509481985
> +
> +#define SD(a, b) t_double ((#a), (a), (b));
> +#define SF(a, b) t_float ((#a), (a), (b));
> +
> +#define EPSILON_double __DBL_EPSILON__
> +#define EPSILON_float __FLT_EPSILON__
> +#define ABS_double __builtin_fabs
> +#define ABS_float __builtin_fabsf
> +#define SQRT_double __builtin_sqrt
> +#define SQRT_float __builtin_sqrtf
> +
> +extern void abort (void);
> +
> +#define TESTTYPE(TYPE)                                                     \
> +TYPE rsqrt_##TYPE (TYPE a)                                                 \
> +{                                                                          \
> +  return 1.0/SQRT_##TYPE(a);                                               \
> +}                                                                          \
> +                                                                           \
> +int equals_##TYPE (TYPE a, TYPE b)                                         \
> +{                                                                          \
> +  return (a == b ||                                                        \
> +          (isnan (a) && isnan (b)) ||                                      \
> +          (ABS_##TYPE (a - b) < EPSILON_##TYPE));                          \
> +}                                                                          \
> +                                                                           \
> +void t_##TYPE (const char *s, TYPE a, TYPE result)                         \
> +{                                                                          \
> +  TYPE r = rsqrt_##TYPE (a);                                               \
> +  if (!equals_##TYPE (r, result))                                          \
> +  {                                                                        \
> +    abort ();                                                              \
> +  }                                                                        \
> +}                                                                          \
> +
> +//  printf ("Problem in %20s: %30.18A should be %30.18A\n", s, r, result); \
> +
> +TESTTYPE(double)
> +TESTTYPE(float)
> +
> +int main ()
> +{
> +  SD(    1.0/256, 0X1.00000000000000P+4  );
> +  SD(        1.0, 0X1.00000000000000P+0  );
> +  SD(       -1.0,                     NAN);
> +  SD(       11.0, 0X1.34BF63D1568260P-2  );
> +  SD(        0.0,                INFINITY);
> +  SD(   INFINITY, 0X0.00000000000000P+0  );
> +  SD(        NAN,                     NAN);
> +  SD(       -NAN,                    -NAN);
> +  SD(    DBL_MAX, 0X1.00000000000010P-512);
> +  SD(    DBL_MIN, 0X1.00000000000000P+511);
> +  SD(         PI, 0X1.20DD750429B6D0P-1  );
> +  SD(       PI_4, 0X1.20DD750429B6D0P+0  );
> +  SD(      SQRT2, 0X1.AE89F995AD3AE0P-1  );
> +  SD(    SQRT1_2, 0X1.306FE0A31B7150P+0  );
> +  SD(        -PI,                     NAN);
> +  SD(     -SQRT2,                     NAN);
> +  SD( TESTA8_DBL, 0X1.00000000000000P-27 );
> +
> +  SF(    1.0/256, 0X1.00000000000000P+4  );
> +  SF(        1.0, 0X1.00000000000000P+0  );
> +  SF(       -1.0,                     NAN);
> +  SF(       11.0, 0X1.34BF6400000000P-2  );
> +  SF(        0.0,                INFINITY);
> +  SF(   INFINITY, 0X0.00000000000000P+0  );
> +  SF(        NAN,                     NAN);
> +  SF(       -NAN,                    -NAN);
> +  SF(    FLT_MAX, 0X1.00000200000000P-64 );
> +  SF(    FLT_MIN, 0X1.00000000000000P+63 );
> +  SF(         PI, 0X1.20DD7400000000P-1  );
> +  SF(       PI_4, 0X1.20DD7400000000P+0  );
> +  SF(      SQRT2, 0X1.AE89FA00000000P-1  );
> +  SF(    SQRT1_2, 0X1.306FE000000000P+0  );
> +  SF(        -PI,                     NAN);
> +  SF(     -SQRT2,                     NAN);
> +  SF( TESTA8_FLT, 0X1.6A09E600000000P-13 );
> +
> +//   With -ffast-math these return positive INF.
> +//   SD(       -0.0,               -INFINITY);
> +//   SF(       -0.0,               -INFINITY);
> +//   The reason here is that -ffast-math flushes to zero.
> +//   SD(DBL_MIN/256, 0X1.00000000000000P+515);
> +//   SF(FLT_MIN/256, 0X1.00000000000000P+67 );
> +
> +  return 0;
> +}
> --
> 1.9.1
>
Andrew Pinski July 18, 2015, 8:26 a.m. UTC | #2
On Sat, Jul 18, 2015 at 1:25 AM, Andrew Pinski <pinskia@gmail.com> wrote:
> On Fri, Jul 17, 2015 at 8:43 AM, Benedikt Huber
> <benedikt.huber@theobroma-systems.com> wrote:
>>         * config/aarch64/aarch64-builtins.c: Builtins
>>         for rsqrt and rsqrtf.
>>         * config/aarch64/aarch64-protos.h: Declare.
>>         * config/aarch64/aarch64-simd.md: Matching expressions
>>         for frsqrte and frsqrts.
>>         * config/aarch64/aarch64.c: New functions. Emit rsqrt
>>         estimation code in fast math mode.
>>         * config/aarch64/aarch64.md: Added enum entries.
>>         * config/aarch64/aarch64.opt: Added options -mrecip and
>>         -mlow-precision-recip-sqrt.
>>         * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
>>         for frsqrte and frsqrts
>>         * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests
>>         for rsqrt.
>
> As I mentioned before,  Can we have -mrecip defaulted to on since it
> regresses ThunderX performance by at least 10%.

As I mentioned before,  Can we have -mrecip not defaulted to on since
it regresses ThunderX performance by at least 10%?


>
> Thanks,
> Andrew
>
>>
>> Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
>> ---
>>  gcc/ChangeLog                                      |  17 ++++
>>  gcc/config/aarch64/aarch64-builtins.c              | 103 ++++++++++++++++++++
>>  gcc/config/aarch64/aarch64-protos.h                |   2 +
>>  gcc/config/aarch64/aarch64-simd.md                 |  27 ++++++
>>  gcc/config/aarch64/aarch64.c                       |  58 +++++++++++
>>  gcc/config/aarch64/aarch64.md                      |   3 +
>>  gcc/config/aarch64/aarch64.opt                     |   8 ++
>>  gcc/doc/invoke.texi                                |  20 ++++
>>  gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c |  63 ++++++++++++
>>  gcc/testsuite/gcc.target/aarch64/rsqrt.c           | 107 +++++++++++++++++++++
>>  10 files changed, 408 insertions(+)
>>  create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
>>  create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c
>>
>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>> index 3432adb..f4b7407 100644
>> --- a/gcc/ChangeLog
>> +++ b/gcc/ChangeLog
>> @@ -1,3 +1,20 @@
>> +2015-07-14  Benedikt Huber  <benedikt.huber@theobroma-systems.com>
>> +           Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
>> +
>> +       * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
>> +       rsqrtf.
>> +       * config/aarch64/aarch64-protos.h: Declare.
>> +       * config/aarch64/aarch64-simd.md: Matching expressions for
>> +       frsqrte and frsqrts.
>> +       * config/aarch64/aarch64.c: New functions. Emit rsqrt
>> +       estimation code in fast math mode.
>> +       * config/aarch64/aarch64.md: Added enum entries.
>> +       * config/aarch64/aarch64.opt: Added options -mrecip and
>> +       -mlow-precision-recip-sqrt.
>> +       * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
>> +       for frsqrte and frsqrts
>> +       * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.
>> +
>>  2015-07-08  Jiong Wang  <jiong.wang@arm.com>
>>
>>         * config/aarch64/aarch64.c (aarch64_unspec_may_trap_p): New function.
>> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
>> index b6c89b9..adcea07 100644
>> --- a/gcc/config/aarch64/aarch64-builtins.c
>> +++ b/gcc/config/aarch64/aarch64-builtins.c
>> @@ -335,6 +335,11 @@ enum aarch64_builtins
>>    AARCH64_BUILTIN_GET_FPSR,
>>    AARCH64_BUILTIN_SET_FPSR,
>>
>> +  AARCH64_BUILTIN_RSQRT_DF,
>> +  AARCH64_BUILTIN_RSQRT_SF,
>> +  AARCH64_BUILTIN_RSQRT_V2DF,
>> +  AARCH64_BUILTIN_RSQRT_V2SF,
>> +  AARCH64_BUILTIN_RSQRT_V4SF,
>>    AARCH64_SIMD_BUILTIN_BASE,
>>    AARCH64_SIMD_BUILTIN_LANE_CHECK,
>>  #include "aarch64-simd-builtins.def"
>> @@ -824,6 +829,42 @@ aarch64_init_crc32_builtins ()
>>  }
>>
>>  void
>> +aarch64_add_builtin_rsqrt (void)
>> +{
>> +  tree fndecl = NULL;
>> +  tree ftype = NULL;
>> +
>> +  tree V2SF_type_node = build_vector_type (float_type_node, 2);
>> +  tree V2DF_type_node = build_vector_type (double_type_node, 2);
>> +  tree V4SF_type_node = build_vector_type (float_type_node, 4);
>> +
>> +  ftype = build_function_type_list (double_type_node, double_type_node, NULL_TREE);
>> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_df",
>> +    ftype, AARCH64_BUILTIN_RSQRT_DF, BUILT_IN_MD, NULL, NULL_TREE);
>> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF] = fndecl;
>> +
>> +  ftype = build_function_type_list (float_type_node, float_type_node, NULL_TREE);
>> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_sf",
>> +    ftype, AARCH64_BUILTIN_RSQRT_SF, BUILT_IN_MD, NULL, NULL_TREE);
>> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF] = fndecl;
>> +
>> +  ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, NULL_TREE);
>> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2df",
>> +    ftype, AARCH64_BUILTIN_RSQRT_V2DF, BUILT_IN_MD, NULL, NULL_TREE);
>> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF] = fndecl;
>> +
>> +  ftype = build_function_type_list (V2SF_type_node, V2SF_type_node, NULL_TREE);
>> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2sf",
>> +    ftype, AARCH64_BUILTIN_RSQRT_V2SF, BUILT_IN_MD, NULL, NULL_TREE);
>> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF] = fndecl;
>> +
>> +  ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, NULL_TREE);
>> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v4sf",
>> +    ftype, AARCH64_BUILTIN_RSQRT_V4SF, BUILT_IN_MD, NULL, NULL_TREE);
>> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF] = fndecl;
>> +}
>> +
>> +void
>>  aarch64_init_builtins (void)
>>  {
>>    tree ftype_set_fpr
>> @@ -848,6 +889,7 @@ aarch64_init_builtins (void)
>>      aarch64_init_simd_builtins ();
>>    if (TARGET_CRC32)
>>      aarch64_init_crc32_builtins ();
>> +  aarch64_add_builtin_rsqrt ();
>>  }
>>
>>  tree
>> @@ -1092,6 +1134,38 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
>>    return target;
>>  }
>>
>> +static rtx
>> +aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
>> +{
>> +  rtx pat;
>> +  tree arg0 = CALL_EXPR_ARG (exp, 0);
>> +  rtx op0 = expand_normal (arg0);
>> +
>> +  enum insn_code c = CODE_FOR_rsqrt_df;
>> +
>> +  switch (fcode)
>> +    {
>> +      case AARCH64_BUILTIN_RSQRT_DF:
>> +        c = CODE_FOR_rsqrt_df; break;
>> +      case AARCH64_BUILTIN_RSQRT_SF:
>> +        c = CODE_FOR_rsqrt_sf; break;
>> +      case AARCH64_BUILTIN_RSQRT_V2DF:
>> +        c = CODE_FOR_rsqrt_v2df; break;
>> +      case AARCH64_BUILTIN_RSQRT_V2SF:
>> +        c = CODE_FOR_rsqrt_v2sf; break;
>> +      case AARCH64_BUILTIN_RSQRT_V4SF:
>> +        c = CODE_FOR_rsqrt_v4sf; break;
>> +    }
>> +
>> +  if (!target)
>> +    target = gen_reg_rtx (GET_MODE (op0));
>> +
>> +  pat = GEN_FCN (c) (target, op0);
>> +  emit_insn (pat);
>> +
>> +  return target;
>> +}
>> +
>>  /* Expand an expression EXP that calls a built-in function,
>>     with result going to TARGET if that's convenient.  */
>>  rtx
>> @@ -1139,6 +1213,13 @@ aarch64_expand_builtin (tree exp,
>>    else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
>>      return aarch64_crc32_expand_builtin (fcode, exp, target);
>>
>> +  if (fcode == AARCH64_BUILTIN_RSQRT_DF   ||
>> +      fcode == AARCH64_BUILTIN_RSQRT_SF   ||
>> +      fcode == AARCH64_BUILTIN_RSQRT_V2DF ||
>> +      fcode == AARCH64_BUILTIN_RSQRT_V2SF ||
>> +      fcode == AARCH64_BUILTIN_RSQRT_V4SF)
>> +    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
>> +
>>    gcc_unreachable ();
>>  }
>>
>> @@ -1296,6 +1377,28 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
>>    return NULL_TREE;
>>  }
>>
>> +tree
>> +aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
>> +{
>> +  if (md_fn)
>> +    {
>> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
>> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
>> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
>> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
>> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
>> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
>> +    }
>> +  else
>> +    {
>> +      if (fn == BUILT_IN_SQRT)
>> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
>> +      if (fn == BUILT_IN_SQRTF)
>> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
>> +    }
>> +  return NULL_TREE;
>> +}
>> +
>>  #undef VAR1
>>  #define VAR1(T, N, MAP, A) \
>>    case AARCH64_SIMD_BUILTIN_##T##_##N##A:
>> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
>> index 4062c27..8b8a389 100644
>> --- a/gcc/config/aarch64/aarch64-protos.h
>> +++ b/gcc/config/aarch64/aarch64-protos.h
>> @@ -321,6 +321,8 @@ void aarch64_print_operand (FILE *, rtx, char);
>>  void aarch64_print_operand_address (FILE *, rtx);
>>  void aarch64_emit_call_insn (rtx);
>>
>> +void aarch64_emit_swrsqrt (rtx, rtx);
>> +
>>  /* Initialize builtins for SIMD intrinsics.  */
>>  void init_aarch64_simd_builtins (void);
>>
>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> index b90f938..ac8e12d 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -353,6 +353,33 @@
>>    [(set_attr "type" "neon_fp_mul_d_scalar_q")]
>>  )
>>
>> +(define_insn "*rsqrte_<mode>"
>> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
>> +       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
>> +                    UNSPEC_RSQRTE))]
>> +  "TARGET_SIMD"
>> +  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
>> +  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
>> +
>> +(define_insn "*rsqrts_<mode>"
>> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
>> +       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
>> +               (match_operand:VALLF 2 "register_operand" "w")]
>> +                    UNSPEC_RSQRTS))]
>> +  "TARGET_SIMD"
>> +  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
>> +  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
>> +
>> +(define_expand "rsqrt_<mode>"
>> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
>> +       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
>> +                    UNSPEC_RSQRT))]
>> +  "TARGET_SIMD"
>> +{
>> +  aarch64_emit_swrsqrt (operands[0], operands[1]);
>> +  DONE;
>> +})
>> +
>>  (define_insn "*aarch64_mul3_elt_to_64v2df"
>>    [(set (match_operand:DF 0 "register_operand" "=w")
>>       (mult:DF
>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>> index 6c13a078..2e53103 100644
>> --- a/gcc/config/aarch64/aarch64.c
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -6961,6 +6961,61 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
>>    return aarch64_tune_params.memmov_cost;
>>  }
>>
>> +extern tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn);
>> +
>> +static tree
>> +aarch64_builtin_reciprocal (unsigned int fn,
>> +                            bool md_fn,
>> +                            bool)
>> +{
>> +  if (!flag_mrecip        || !flag_finite_math_only ||
>> +       flag_trapping_math || !flag_unsafe_math_optimizations)
>> +  {
>> +    return NULL_TREE;
>> +  }
>> +
>> +  return aarch64_builtin_rsqrt (fn, md_fn);
>> +}
>> +
>> +void
>> +aarch64_emit_swrsqrt (rtx dst, rtx src)
>> +{
>> +  enum machine_mode mode = GET_MODE (src);
>> +  gcc_assert (
>> +    mode == SFmode || mode == V2SFmode || mode == V4SFmode ||
>> +    mode == DFmode || mode == V2DFmode);
>> +
>> +  rtx xsrc = gen_reg_rtx (mode);
>> +  emit_move_insn (xsrc, src);
>> +  rtx x0 = gen_reg_rtx (mode);
>> +  emit_set_insn (x0,
>> +    gen_rtx_UNSPEC (mode, gen_rtvec (1, xsrc), UNSPEC_RSQRTE));
>> +
>> +  bool double_mode = (mode == DFmode || mode == V2DFmode);
>> +
>> +  int iterations = 2;
>> +  if (double_mode)
>> +    iterations = 3;
>> +
>> +  if (flag_mrecip_low_precision_sqrt)
>> +    iterations--;
>> +
>> +  for (int i = 0; i < iterations; ++i)
>> +    {
>> +      rtx x1 = gen_reg_rtx (mode);
>> +      rtx x2 = gen_reg_rtx (mode);
>> +      rtx x3 = gen_reg_rtx (mode);
>> +      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
>> +      emit_set_insn (x3, gen_rtx_UNSPEC (
>> +        mode, gen_rtvec (2, xsrc, x2), UNSPEC_RSQRTS));
>> +      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
>> +      x0 = x1;
>> +    }
>> +
>> +  emit_move_insn (dst, x0);
>> +  return;
>> +}
>> +
>>  /* Return the number of instructions that can be issued per cycle.  */
>>  static int
>>  aarch64_sched_issue_rate (void)
>> @@ -12099,6 +12154,9 @@ aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
>>  #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
>>  #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
>>
>> +#undef TARGET_BUILTIN_RECIPROCAL
>> +#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
>> +
>>  #undef TARGET_VECTOR_MODE_SUPPORTED_P
>>  #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
>>
>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>> index 1e343fa..d7944b2 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -122,6 +122,9 @@
>>      UNSPEC_VSTRUCTDUMMY
>>      UNSPEC_SP_SET
>>      UNSPEC_SP_TEST
>> +    UNSPEC_RSQRT
>> +    UNSPEC_RSQRTE
>> +    UNSPEC_RSQRTS
>>  ])
>>
>>  (define_c_enum "unspecv" [
>> diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
>> index 98ef9f6..303efa4 100644
>> --- a/gcc/config/aarch64/aarch64.opt
>> +++ b/gcc/config/aarch64/aarch64.opt
>> @@ -124,3 +124,11 @@ Enum(aarch64_abi) String(ilp32) Value(AARCH64_ABI_ILP32)
>>
>>  EnumValue
>>  Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
>> +
>> +mrecip
>> +Common Report Var(flag_mrecip) Optimization Init(1)
>> +Generate software reciprocal square root for better throughput. Default with fast-math.
>> +
>> +mlow-precision-recip-sqrt
>> +Common Var(flag_mrecip_low_precision_sqrt) Optimization
>> +Run fewer approximation steps to reduce latency and precision.
>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>> index b28e5d6..a0ab23d 100644
>> --- a/gcc/doc/invoke.texi
>> +++ b/gcc/doc/invoke.texi
>> @@ -515,6 +515,8 @@ Objective-C and Objective-C++ Dialects}.
>>  -mtls-dialect=desc  -mtls-dialect=traditional @gol
>>  -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
>>  -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
>> +-mrecip -no-mrecip @gol
>> +-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
>>  -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
>>
>>  @emph{Adapteva Epiphany Options}
>> @@ -12426,6 +12428,24 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
>>  This erratum workaround is made at link time and this will only pass the
>>  corresponding flag to the linker.
>>
>> +@item -mrecip
>> +@item -mno-recip
>> +@opindex mrecip
>> +@opindex mno-recip
>> +This option enables use of the
>> +reciprocal square root estimate instructions with additional
>> +Newton-Raphson steps to increase precision instead of doing a square root and
>> +divide for floating-point arguments.
>> +This is the default setting with @option{-ffast-math}.
>> +It can only be used together with @option{-ffast-math}.
>> +
>> +@item -mlow-precision-recip-sqrt
>> +@item -mno-low-precision-recip-sqrt
>> +@opindex -mlow-precision-recip-sqrt
>> +@opindex -mno-low-precision-recip-sqrt
>> +The square root estimate uses two steps instead of three for double-precision,
>> +and one step instead of two for single-precision. Thus reducing latency and precision.
>> +
>>  @item -march=@var{name}
>>  @opindex march
>>  Specify the name of the target architecture, optionally suffixed by one or
>> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
>> new file mode 100644
>> index 0000000..cd81d9e
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
>> @@ -0,0 +1,63 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3 --save-temps -fverbose-asm -ffast-math" } */
>> +
>> +#include <math.h>
>> +
>> +#define sqrt_float   __builtin_sqrtf
>> +#define sqrt_double  __builtin_sqrt
>> +
>> +#define TESTTYPE(TYPE)                                          \
>> +typedef struct {                                                \
>> +  TYPE a;                                                       \
>> +  TYPE b;                                                       \
>> +  TYPE c;                                                       \
>> +  TYPE d;                                                       \
>> +} s4_##TYPE;                                                    \
>> +                                                                \
>> +typedef struct {                                                \
>> +  TYPE a;                                                       \
>> +  TYPE b;                                                       \
>> +} s2_##TYPE;                                                    \
>> +                                                                \
>> +s4_##TYPE rsqrtv4_##TYPE (s4_##TYPE i)                          \
>> +{                                                               \
>> +  s4_##TYPE o;                                                  \
>> +  o.a = 1.0 / sqrt_##TYPE (i.a);                                \
>> +  o.b = 1.0 / sqrt_##TYPE (i.b);                                \
>> +  o.c = 1.0 / sqrt_##TYPE (i.c);                                \
>> +  o.d = 1.0 / sqrt_##TYPE (i.d);                                \
>> +  return o;                                                     \
>> +}                                                               \
>> +                                                                \
>> +s2_##TYPE rsqrtv2_##TYPE (s2_##TYPE i)                          \
>> +{                                                               \
>> +  s2_##TYPE o;                                                  \
>> +  o.a = 1.0 / sqrt_##TYPE (i.a);                                \
>> +  o.b = 1.0 / sqrt_##TYPE (i.b);                                \
>> +  return o;                                                     \
>> +}                                                               \
>> +                                                                \
>> +TYPE rsqrt_##TYPE (TYPE i)                                      \
>> +{                                                               \
>> +  return 1.0 / sqrt_##TYPE (i);                                 \
>> +}                                                               \
>> +                                                                \
>> +
>> +TESTTYPE(double)
>> +TESTTYPE(float)
>> +
>> +/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
>> +/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
>> +
>> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" 3 } } */
>> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */
>> +
>> +
>> +/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
>> +/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
>> +
>> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" 1 } } */
>> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */
>> +
>> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" 1 } } */
>> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */
>> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt.c b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
>> new file mode 100644
>> index 0000000..8e2ae10
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
>> @@ -0,0 +1,107 @@
>> +/* { dg-do run } */
>> +/* { dg-options "-O3 --save-temps -fverbose-asm -ffast-math" } */
>> +
>> +#include <math.h>
>> +#include <stdio.h>
>> +
>> +#include <float.h>
>> +
>> +#define PI    3.141592653589793
>> +#define SQRT2 1.4142135623730951
>> +
>> +#define PI_4 0.7853981633974483
>> +#define SQRT1_2 0.7071067811865475
>> +
>> +/* 2^25+1, float has 24 significand bits
>> + *       according to Single-precision floating-point format.  */
>> +#define TESTA8_FLT 33554433
>> +/* 2^54+1, double has 53 significand bits
>> + *       according to Double-precision floating-point format.  */
>> +#define TESTA8_DBL 18014398509481985
>> +
>> +#define SD(a, b) t_double ((#a), (a), (b));
>> +#define SF(a, b) t_float ((#a), (a), (b));
>> +
>> +#define EPSILON_double __DBL_EPSILON__
>> +#define EPSILON_float __FLT_EPSILON__
>> +#define ABS_double __builtin_fabs
>> +#define ABS_float __builtin_fabsf
>> +#define SQRT_double __builtin_sqrt
>> +#define SQRT_float __builtin_sqrtf
>> +
>> +extern void abort (void);
>> +
>> +#define TESTTYPE(TYPE)                                                     \
>> +TYPE rsqrt_##TYPE (TYPE a)                                                 \
>> +{                                                                          \
>> +  return 1.0/SQRT_##TYPE(a);                                               \
>> +}                                                                          \
>> +                                                                           \
>> +int equals_##TYPE (TYPE a, TYPE b)                                         \
>> +{                                                                          \
>> +  return (a == b ||                                                        \
>> +          (isnan (a) && isnan (b)) ||                                      \
>> +          (ABS_##TYPE (a - b) < EPSILON_##TYPE));                          \
>> +}                                                                          \
>> +                                                                           \
>> +void t_##TYPE (const char *s, TYPE a, TYPE result)                         \
>> +{                                                                          \
>> +  TYPE r = rsqrt_##TYPE (a);                                               \
>> +  if (!equals_##TYPE (r, result))                                          \
>> +  {                                                                        \
>> +    abort ();                                                              \
>> +  }                                                                        \
>> +}                                                                          \
>> +
>> +//  printf ("Problem in %20s: %30.18A should be %30.18A\n", s, r, result); \
>> +
>> +TESTTYPE(double)
>> +TESTTYPE(float)
>> +
>> +int main ()
>> +{
>> +  SD(    1.0/256, 0X1.00000000000000P+4  );
>> +  SD(        1.0, 0X1.00000000000000P+0  );
>> +  SD(       -1.0,                     NAN);
>> +  SD(       11.0, 0X1.34BF63D1568260P-2  );
>> +  SD(        0.0,                INFINITY);
>> +  SD(   INFINITY, 0X0.00000000000000P+0  );
>> +  SD(        NAN,                     NAN);
>> +  SD(       -NAN,                    -NAN);
>> +  SD(    DBL_MAX, 0X1.00000000000010P-512);
>> +  SD(    DBL_MIN, 0X1.00000000000000P+511);
>> +  SD(         PI, 0X1.20DD750429B6D0P-1  );
>> +  SD(       PI_4, 0X1.20DD750429B6D0P+0  );
>> +  SD(      SQRT2, 0X1.AE89F995AD3AE0P-1  );
>> +  SD(    SQRT1_2, 0X1.306FE0A31B7150P+0  );
>> +  SD(        -PI,                     NAN);
>> +  SD(     -SQRT2,                     NAN);
>> +  SD( TESTA8_DBL, 0X1.00000000000000P-27 );
>> +
>> +  SF(    1.0/256, 0X1.00000000000000P+4  );
>> +  SF(        1.0, 0X1.00000000000000P+0  );
>> +  SF(       -1.0,                     NAN);
>> +  SF(       11.0, 0X1.34BF6400000000P-2  );
>> +  SF(        0.0,                INFINITY);
>> +  SF(   INFINITY, 0X0.00000000000000P+0  );
>> +  SF(        NAN,                     NAN);
>> +  SF(       -NAN,                    -NAN);
>> +  SF(    FLT_MAX, 0X1.00000200000000P-64 );
>> +  SF(    FLT_MIN, 0X1.00000000000000P+63 );
>> +  SF(         PI, 0X1.20DD7400000000P-1  );
>> +  SF(       PI_4, 0X1.20DD7400000000P+0  );
>> +  SF(      SQRT2, 0X1.AE89FA00000000P-1  );
>> +  SF(    SQRT1_2, 0X1.306FE000000000P+0  );
>> +  SF(        -PI,                     NAN);
>> +  SF(     -SQRT2,                     NAN);
>> +  SF( TESTA8_FLT, 0X1.6A09E600000000P-13 );
>> +
>> +//   With -ffast-math these return positive INF.
>> +//   SD(       -0.0,               -INFINITY);
>> +//   SF(       -0.0,               -INFINITY);
>> +//   The reason here is that -ffast-math flushes to zero.
>> +//   SD(DBL_MIN/256, 0X1.00000000000000P+515);
>> +//   SF(FLT_MIN/256, 0X1.00000000000000P+67 );
>> +
>> +  return 0;
>> +}
>> --
>> 1.9.1
>>
Evandro Menezes July 20, 2015, 7:42 p.m. UTC | #3
Hi, Benedikt.

Please, see below.
Evandro Menezes July 20, 2015, 8:15 p.m. UTC | #4
Or rather, have an additional tuning/selection flag that a core may choose to make -mrecip the default with -ffast-math.

Cheers,
diff mbox

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 3432adb..f4b7407 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,20 @@ 
+2015-07-14  Benedikt Huber  <benedikt.huber@theobroma-systems.com>
+	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
+
+	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
+	rsqrtf.
+	* config/aarch64/aarch64-protos.h: Declare.
+	* config/aarch64/aarch64-simd.md: Matching expressions for
+	frsqrte and frsqrts.
+	* config/aarch64/aarch64.c: New functions. Emit rsqrt
+	estimation code in fast math mode.
+	* config/aarch64/aarch64.md: Added enum entries.
+	* config/aarch64/aarch64.opt: Added options -mrecip and
+	-mlow-precision-recip-sqrt.
+	* testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
+	for frsqrte and frsqrts
+	* testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.
+
 2015-07-08  Jiong Wang  <jiong.wang@arm.com>
 
 	* config/aarch64/aarch64.c (aarch64_unspec_may_trap_p): New function.
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index b6c89b9..adcea07 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -335,6 +335,11 @@  enum aarch64_builtins
   AARCH64_BUILTIN_GET_FPSR,
   AARCH64_BUILTIN_SET_FPSR,
 
+  AARCH64_BUILTIN_RSQRT_DF,
+  AARCH64_BUILTIN_RSQRT_SF,
+  AARCH64_BUILTIN_RSQRT_V2DF,
+  AARCH64_BUILTIN_RSQRT_V2SF,
+  AARCH64_BUILTIN_RSQRT_V4SF,
   AARCH64_SIMD_BUILTIN_BASE,
   AARCH64_SIMD_BUILTIN_LANE_CHECK,
 #include "aarch64-simd-builtins.def"
@@ -824,6 +829,42 @@  aarch64_init_crc32_builtins ()
 }
 
 void
+aarch64_add_builtin_rsqrt (void)
+{
+  tree fndecl = NULL;
+  tree ftype = NULL;
+
+  tree V2SF_type_node = build_vector_type (float_type_node, 2);
+  tree V2DF_type_node = build_vector_type (double_type_node, 2);
+  tree V4SF_type_node = build_vector_type (float_type_node, 4);
+
+  ftype = build_function_type_list (double_type_node, double_type_node, NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_df",
+    ftype, AARCH64_BUILTIN_RSQRT_DF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF] = fndecl;
+
+  ftype = build_function_type_list (float_type_node, float_type_node, NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_sf",
+    ftype, AARCH64_BUILTIN_RSQRT_SF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF] = fndecl;
+
+  ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2df",
+    ftype, AARCH64_BUILTIN_RSQRT_V2DF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF] = fndecl;
+
+  ftype = build_function_type_list (V2SF_type_node, V2SF_type_node, NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2sf",
+    ftype, AARCH64_BUILTIN_RSQRT_V2SF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF] = fndecl;
+
+  ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v4sf",
+    ftype, AARCH64_BUILTIN_RSQRT_V4SF, BUILT_IN_MD, NULL, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF] = fndecl;
+}
+
+void
 aarch64_init_builtins (void)
 {
   tree ftype_set_fpr
@@ -848,6 +889,7 @@  aarch64_init_builtins (void)
     aarch64_init_simd_builtins ();
   if (TARGET_CRC32)
     aarch64_init_crc32_builtins ();
+  aarch64_add_builtin_rsqrt ();
 }
 
 tree
@@ -1092,6 +1134,38 @@  aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
   return target;
 }
 
+static rtx
+aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  rtx op0 = expand_normal (arg0);
+
+  enum insn_code c = CODE_FOR_rsqrt_df;
+
+  switch (fcode)
+    {
+      case AARCH64_BUILTIN_RSQRT_DF:
+        c = CODE_FOR_rsqrt_df; break;
+      case AARCH64_BUILTIN_RSQRT_SF:
+        c = CODE_FOR_rsqrt_sf; break;
+      case AARCH64_BUILTIN_RSQRT_V2DF:
+        c = CODE_FOR_rsqrt_v2df; break;
+      case AARCH64_BUILTIN_RSQRT_V2SF:
+        c = CODE_FOR_rsqrt_v2sf; break;
+      case AARCH64_BUILTIN_RSQRT_V4SF:
+        c = CODE_FOR_rsqrt_v4sf; break;
+    }
+
+  if (!target)
+    target = gen_reg_rtx (GET_MODE (op0));
+
+  pat = GEN_FCN (c) (target, op0);
+  emit_insn (pat);
+
+  return target;
+}
+
 /* Expand an expression EXP that calls a built-in function,
    with result going to TARGET if that's convenient.  */
 rtx
@@ -1139,6 +1213,13 @@  aarch64_expand_builtin (tree exp,
   else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
     return aarch64_crc32_expand_builtin (fcode, exp, target);
 
+  if (fcode == AARCH64_BUILTIN_RSQRT_DF   ||
+      fcode == AARCH64_BUILTIN_RSQRT_SF   ||
+      fcode == AARCH64_BUILTIN_RSQRT_V2DF ||
+      fcode == AARCH64_BUILTIN_RSQRT_V2SF ||
+      fcode == AARCH64_BUILTIN_RSQRT_V4SF)
+    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
+
   gcc_unreachable ();
 }
 
@@ -1296,6 +1377,28 @@  aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
   return NULL_TREE;
 }
 
+tree
+aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
+{
+  if (md_fn)
+    {
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
+        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
+        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
+        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
+    }
+  else
+    {
+      if (fn == BUILT_IN_SQRT)
+        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
+      if (fn == BUILT_IN_SQRTF)
+        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
+    }
+  return NULL_TREE;
+}
+
 #undef VAR1
 #define VAR1(T, N, MAP, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 4062c27..8b8a389 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -321,6 +321,8 @@  void aarch64_print_operand (FILE *, rtx, char);
 void aarch64_print_operand_address (FILE *, rtx);
 void aarch64_emit_call_insn (rtx);
 
+void aarch64_emit_swrsqrt (rtx, rtx);
+
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index b90f938..ac8e12d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -353,6 +353,33 @@ 
   [(set_attr "type" "neon_fp_mul_d_scalar_q")]
 )
 
+(define_insn "*rsqrte_<mode>"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRTE))]
+  "TARGET_SIMD"
+  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+
+(define_insn "*rsqrts_<mode>"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+               (match_operand:VALLF 2 "register_operand" "w")]
+		     UNSPEC_RSQRTS))]
+  "TARGET_SIMD"
+  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
+
+(define_expand "rsqrt_<mode>"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		     UNSPEC_RSQRT))]
+  "TARGET_SIMD"
+{
+  aarch64_emit_swrsqrt (operands[0], operands[1]);
+  DONE;
+})
+
 (define_insn "*aarch64_mul3_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
      (mult:DF
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 6c13a078..2e53103 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6961,6 +6961,61 @@  aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
   return aarch64_tune_params.memmov_cost;
 }
 
+extern tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn);
+
+static tree
+aarch64_builtin_reciprocal (unsigned int fn,
+                            bool md_fn,
+                            bool)
+{
+  if (!flag_mrecip        || !flag_finite_math_only ||
+       flag_trapping_math || !flag_unsafe_math_optimizations)
+  {
+    return NULL_TREE;
+  }
+
+  return aarch64_builtin_rsqrt (fn, md_fn);
+}
+
+void
+aarch64_emit_swrsqrt (rtx dst, rtx src)
+{
+  enum machine_mode mode = GET_MODE (src);
+  gcc_assert (
+    mode == SFmode || mode == V2SFmode || mode == V4SFmode ||
+    mode == DFmode || mode == V2DFmode);
+
+  rtx xsrc = gen_reg_rtx (mode);
+  emit_move_insn (xsrc, src);
+  rtx x0 = gen_reg_rtx (mode);
+  emit_set_insn (x0,
+    gen_rtx_UNSPEC (mode, gen_rtvec (1, xsrc), UNSPEC_RSQRTE));
+
+  bool double_mode = (mode == DFmode || mode == V2DFmode);
+
+  int iterations = 2;
+  if (double_mode)
+    iterations = 3;
+
+  if (flag_mrecip_low_precision_sqrt)
+    iterations--;
+
+  for (int i = 0; i < iterations; ++i)
+    {
+      rtx x1 = gen_reg_rtx (mode);
+      rtx x2 = gen_reg_rtx (mode);
+      rtx x3 = gen_reg_rtx (mode);
+      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+      emit_set_insn (x3, gen_rtx_UNSPEC (
+        mode, gen_rtvec (2, xsrc, x2), UNSPEC_RSQRTS));
+      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
+      x0 = x1;
+    }
+
+  emit_move_insn (dst, x0);
+  return;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
@@ -12099,6 +12154,9 @@  aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
 
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
 #undef TARGET_VECTOR_MODE_SUPPORTED_P
 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 1e343fa..d7944b2 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -122,6 +122,9 @@ 
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
     UNSPEC_SP_TEST
+    UNSPEC_RSQRT
+    UNSPEC_RSQRTE
+    UNSPEC_RSQRTS
 ])
 
 (define_c_enum "unspecv" [
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 98ef9f6..303efa4 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -124,3 +124,11 @@  Enum(aarch64_abi) String(ilp32) Value(AARCH64_ABI_ILP32)
 
 EnumValue
 Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
+
+mrecip
+Common Report Var(flag_mrecip) Optimization Init(1)
+Generate software reciprocal square root for better throughput. Default with fast-math.
+
+mlow-precision-recip-sqrt
+Common Var(flag_mrecip_low_precision_sqrt) Optimization
+Run fewer approximation steps to reduce latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index b28e5d6..a0ab23d 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -515,6 +515,8 @@  Objective-C and Objective-C++ Dialects}.
 -mtls-dialect=desc  -mtls-dialect=traditional @gol
 -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
+-mrecip -no-mrecip @gol
+-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12426,6 +12428,24 @@  Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
 This erratum workaround is made at link time and this will only pass the
 corresponding flag to the linker.
 
+@item -mrecip
+@item -mno-recip
+@opindex mrecip
+@opindex mno-recip
+This option enables use of the
+reciprocal square root estimate instructions with additional
+Newton-Raphson steps to increase precision instead of doing a square root and
+divide for floating-point arguments.
+This is the default setting with @option{-ffast-math}.
+It can only be used together with @option{-ffast-math}.
+
+@item -mlow-precision-recip-sqrt
+@item -mno-low-precision-recip-sqrt
+@opindex -mlow-precision-recip-sqrt
+@opindex -mno-low-precision-recip-sqrt
+The square root estimate uses two steps instead of three for double-precision,
+and one step instead of two for single-precision. Thus reducing latency and precision.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture, optionally suffixed by one or
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
new file mode 100644
index 0000000..cd81d9e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
@@ -0,0 +1,63 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 --save-temps -fverbose-asm -ffast-math" } */
+
+#include <math.h>
+
+#define sqrt_float   __builtin_sqrtf
+#define sqrt_double  __builtin_sqrt
+
+#define TESTTYPE(TYPE)                                          \
+typedef struct {                                                \
+  TYPE a;                                                       \
+  TYPE b;                                                       \
+  TYPE c;                                                       \
+  TYPE d;                                                       \
+} s4_##TYPE;                                                    \
+                                                                \
+typedef struct {                                                \
+  TYPE a;                                                       \
+  TYPE b;                                                       \
+} s2_##TYPE;                                                    \
+                                                                \
+s4_##TYPE rsqrtv4_##TYPE (s4_##TYPE i)                          \
+{                                                               \
+  s4_##TYPE o;                                                  \
+  o.a = 1.0 / sqrt_##TYPE (i.a);                                \
+  o.b = 1.0 / sqrt_##TYPE (i.b);                                \
+  o.c = 1.0 / sqrt_##TYPE (i.c);                                \
+  o.d = 1.0 / sqrt_##TYPE (i.d);                                \
+  return o;                                                     \
+}                                                               \
+                                                                \
+s2_##TYPE rsqrtv2_##TYPE (s2_##TYPE i)                          \
+{                                                               \
+  s2_##TYPE o;                                                  \
+  o.a = 1.0 / sqrt_##TYPE (i.a);                                \
+  o.b = 1.0 / sqrt_##TYPE (i.b);                                \
+  return o;                                                     \
+}                                                               \
+                                                                \
+TYPE rsqrt_##TYPE (TYPE i)                                      \
+{                                                               \
+  return 1.0 / sqrt_##TYPE (i);                                 \
+}                                                               \
+                                                                \
+
+TESTTYPE(double)
+TESTTYPE(float)
+
+/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" 3 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */
+
+
+/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt.c b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
new file mode 100644
index 0000000..8e2ae10
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
@@ -0,0 +1,107 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps -fverbose-asm -ffast-math" } */
+
+#include <math.h>
+#include <stdio.h>
+
+#include <float.h>
+
+#define PI    3.141592653589793
+#define SQRT2 1.4142135623730951
+
+#define PI_4 0.7853981633974483
+#define SQRT1_2 0.7071067811865475
+
+/* 2^25+1, float has 24 significand bits
+ *       according to Single-precision floating-point format.  */
+#define TESTA8_FLT 33554433
+/* 2^54+1, double has 53 significand bits
+ *       according to Double-precision floating-point format.  */
+#define TESTA8_DBL 18014398509481985
+
+#define SD(a, b) t_double ((#a), (a), (b));
+#define SF(a, b) t_float ((#a), (a), (b));
+
+#define EPSILON_double __DBL_EPSILON__
+#define EPSILON_float __FLT_EPSILON__
+#define ABS_double __builtin_fabs
+#define ABS_float __builtin_fabsf
+#define SQRT_double __builtin_sqrt
+#define SQRT_float __builtin_sqrtf
+
+extern void abort (void);
+
+#define TESTTYPE(TYPE)                                                     \
+TYPE rsqrt_##TYPE (TYPE a)                                                 \
+{                                                                          \
+  return 1.0/SQRT_##TYPE(a);                                               \
+}                                                                          \
+                                                                           \
+int equals_##TYPE (TYPE a, TYPE b)                                         \
+{                                                                          \
+  return (a == b ||                                                        \
+          (isnan (a) && isnan (b)) ||                                      \
+          (ABS_##TYPE (a - b) < EPSILON_##TYPE));                          \
+}                                                                          \
+                                                                           \
+void t_##TYPE (const char *s, TYPE a, TYPE result)                         \
+{                                                                          \
+  TYPE r = rsqrt_##TYPE (a);                                               \
+  if (!equals_##TYPE (r, result))                                          \
+  {                                                                        \
+    abort ();                                                              \
+  }                                                                        \
+}                                                                          \
+
+//  printf ("Problem in %20s: %30.18A should be %30.18A\n", s, r, result); \
+
+TESTTYPE(double)
+TESTTYPE(float)
+
+int main ()
+{
+  SD(    1.0/256, 0X1.00000000000000P+4  );
+  SD(        1.0, 0X1.00000000000000P+0  );
+  SD(       -1.0,                     NAN);
+  SD(       11.0, 0X1.34BF63D1568260P-2  );
+  SD(        0.0,                INFINITY);
+  SD(   INFINITY, 0X0.00000000000000P+0  );
+  SD(        NAN,                     NAN);
+  SD(       -NAN,                    -NAN);
+  SD(    DBL_MAX, 0X1.00000000000010P-512);
+  SD(    DBL_MIN, 0X1.00000000000000P+511);
+  SD(         PI, 0X1.20DD750429B6D0P-1  );
+  SD(       PI_4, 0X1.20DD750429B6D0P+0  );
+  SD(      SQRT2, 0X1.AE89F995AD3AE0P-1  );
+  SD(    SQRT1_2, 0X1.306FE0A31B7150P+0  );
+  SD(        -PI,                     NAN);
+  SD(     -SQRT2,                     NAN);
+  SD( TESTA8_DBL, 0X1.00000000000000P-27 );
+
+  SF(    1.0/256, 0X1.00000000000000P+4  );
+  SF(        1.0, 0X1.00000000000000P+0  );
+  SF(       -1.0,                     NAN);
+  SF(       11.0, 0X1.34BF6400000000P-2  );
+  SF(        0.0,                INFINITY);
+  SF(   INFINITY, 0X0.00000000000000P+0  );
+  SF(        NAN,                     NAN);
+  SF(       -NAN,                    -NAN);
+  SF(    FLT_MAX, 0X1.00000200000000P-64 );
+  SF(    FLT_MIN, 0X1.00000000000000P+63 );
+  SF(         PI, 0X1.20DD7400000000P-1  );
+  SF(       PI_4, 0X1.20DD7400000000P+0  );
+  SF(      SQRT2, 0X1.AE89FA00000000P-1  );
+  SF(    SQRT1_2, 0X1.306FE000000000P+0  );
+  SF(        -PI,                     NAN);
+  SF(     -SQRT2,                     NAN);
+  SF( TESTA8_FLT, 0X1.6A09E600000000P-13 );
+
+//   With -ffast-math these return positive INF.
+//   SD(       -0.0,               -INFINITY);
+//   SF(       -0.0,               -INFINITY);
+//   The reason here is that -ffast-math flushes to zero.
+//   SD(DBL_MIN/256, 0X1.00000000000000P+515);
+//   SF(FLT_MIN/256, 0X1.00000000000000P+67 );
+
+  return 0;
+}