From 4f61f722f744339650a48aa034906dd685110ae2 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Tue, 8 Mar 2016 15:06:03 -0600
Subject: [PATCH] Emit square root using the Newton series
gcc/
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNE_APPROX_SQRT_{DF,SF}): New tuning macros.
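* config/aarch64/aarch64-protos.h
(AARCH64_EXTRA_TUNE_APPROX_SQRT): New macro.
(aarch64_emit_approx_sqrt): Declare new function.
* config/aarch64/aarch64.c
(exynosm1_tunings): Enable the approximate square root for SF.
(aarch64_emit_approx_rsqrt): Check the inner mode instead of listing modes.
(aarch64_emit_approx_sqrt): Define new function.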
* config/aarch64/aarch64.md
(sqrt*2): New expansion and insn definitions.
* config/aarch64/aarch64-simd.md (sqrt*2): Likewise.
* config/aarch64/aarch64.opt
(mlow-precision-recip-sqrt): Expand option description.
* doc/invoke.texi (mlow-precision-recip-sqrt): Likewise.
---
gcc/config/aarch64/aarch64-protos.h | 3 +++
gcc/config/aarch64/aarch64-simd.md | 25 ++++++++++++++++++++-
gcc/config/aarch64/aarch64-tuning-flags.def | 3 ++-
gcc/config/aarch64/aarch64.c | 35 ++++++++++++++++++++++++-----
gcc/config/aarch64/aarch64.md | 25 ++++++++++++++++++++-
gcc/config/aarch64/aarch64.opt | 4 ++--
gcc/doc/invoke.texi | 9 ++++----
7 files changed, 89 insertions(+), 15 deletions(-)
@@ -265,6 +265,8 @@ enum aarch64_extra_tuning_flags
#define AARCH64_EXTRA_TUNE_APPROX_RSQRT \
(AARCH64_EXTRA_TUNE_APPROX_RSQRT_DF | AARCH64_EXTRA_TUNE_APPROX_RSQRT_SF)
+#define AARCH64_EXTRA_TUNE_APPROX_SQRT \
+ (AARCH64_EXTRA_TUNE_APPROX_SQRT_DF | AARCH64_EXTRA_TUNE_APPROX_SQRT_SF)
extern struct tune_params aarch64_tune_params;
@@ -364,6 +366,7 @@ void aarch64_register_pragmas (void);
void aarch64_relayout_simd_types (void);
void aarch64_reset_previous_fndecl (void);
void aarch64_emit_approx_rsqrt (rtx, rtx);
+void aarch64_emit_approx_sqrt (rtx, rtx);
/* Initialize builtins for SIMD intrinsics. */
void init_aarch64_simd_builtins (void);
@@ -4307,7 +4307,30 @@
;; sqrt
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:VDQF 0 "register_operand")
+ (sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+ "TARGET_SIMD"
+{
+ machine_mode mode = GET_MODE_INNER (GET_MODE (operands[1]));
+ /* Approximate only for unsafe, finite, non-trapping math when tuned for it. */
+ if (flag_finite_math_only
+ && !flag_trapping_math
+ && flag_unsafe_math_optimizations
+ && !optimize_function_for_size_p (cfun)
+ && ((mode == SFmode
+ && (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_APPROX_SQRT_SF))
+ || (mode == DFmode
+ && (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_APPROX_SQRT_DF))))
+ {
+ aarch64_emit_approx_sqrt (operands[0], operands[1]);
+ DONE;
+ }
+})
+
+(define_insn "*sqrt<mode>2"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
"TARGET_SIMD"
@@ -31,4 +31,5 @@
AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT_DF)
AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrtf", APPROX_RSQRT_SF)
-
+AARCH64_EXTRA_TUNING_OPTION ("approx_sqrt", APPROX_SQRT_DF)
+AARCH64_EXTRA_TUNING_OPTION ("approx_sqrtf", APPROX_SQRT_SF)
@@ -538,7 +538,8 @@ static const struct tune_params exynosm1_tunings =
48, /* max_case_values. */
64, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_APPROX_SQRT_SF
+ | AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */
};
static const struct tune_params thunderx_tunings =
@@ -7537,9 +7538,8 @@ void
aarch64_emit_approx_rsqrt (rtx dst, rtx src)
{
machine_mode mode = GET_MODE (src);
- gcc_assert (
- mode == SFmode || mode == V2SFmode || mode == V4SFmode
- || mode == DFmode || mode == V2DFmode);
+ gcc_assert (GET_MODE_INNER (mode) == SFmode
+ || GET_MODE_INNER (mode) == DFmode);
rtx xsrc = gen_reg_rtx (mode);
emit_move_insn (xsrc, src);
@@ -7547,8 +7547,7 @@ aarch64_emit_approx_rsqrt (rtx dst, rtx src)
emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
- bool double_mode = (mode == DFmode || mode == V2DFmode);
-
+ bool double_mode = (GET_MODE_INNER (mode) == DFmode);
int iterations = double_mode ? 3 : 2;
/* Optionally iterate over the series one less time than otherwise. */
@@ -7571,6 +7570,30 @@ aarch64_emit_approx_rsqrt (rtx dst, rtx src)
emit_move_insn (dst, x0);
}
+/* Emit into DST the approximate square root of SRC as SRC * rsqrt (SRC). */
+
+void
+aarch64_emit_approx_sqrt (rtx dst, rtx src)
+{
+ machine_mode mode = GET_MODE (src);
+ gcc_assert (GET_MODE_INNER (mode) == SFmode
+ || GET_MODE_INNER (mode) == DFmode);
+
+ rtx xsrc = gen_reg_rtx (mode);
+ emit_move_insn (xsrc, src);
+
+ /* Calculate the approximate reciprocal square root first. NOTE(review):
+ for SRC == 0.0 this is presumably +Inf, so the product is NaN -- confirm. */
+ rtx xrsqrt = gen_reg_rtx (mode);
+ aarch64_emit_approx_rsqrt (xrsqrt, xsrc);
+
+ /* Multiply it by the original value to yield the square root. */
+ rtx xsqrt = gen_reg_rtx (mode);
+ emit_set_insn (xsqrt, gen_rtx_MULT (mode, xrsqrt, xsrc));
+
+ emit_move_insn (dst, xsqrt);
+}
+
/* Return the number of instructions that can be issued per cycle. */
static int
aarch64_sched_issue_rate (void)
@@ -4665,7 +4665,30 @@
[(set_attr "type" "ffarith<s>")]
)
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:GPF 0 "register_operand")
+        (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
+  "TARGET_FLOAT"
+{
+  machine_mode mode = GET_MODE_INNER (GET_MODE (operands[1]));
+
+  /* The expander is enabled under TARGET_FLOAT, the same condition as the
+     "*sqrt<mode>2" insn it falls through to, so that the exact square root
+     remains available without SIMD.  The approximation is still attempted
+     only under TARGET_SIMD, and only for unsafe, finite, non-trapping math,
+     when not optimizing for size and the tuning flag for the mode is set.  */
+  if (TARGET_SIMD
+      && flag_finite_math_only
+      && !flag_trapping_math
+      && flag_unsafe_math_optimizations
+      && !optimize_function_for_size_p (cfun)
+      && ((mode == SFmode
+           && (aarch64_tune_params.extra_tuning_flags
+               & AARCH64_EXTRA_TUNE_APPROX_SQRT_SF))
+          || (mode == DFmode
+              && (aarch64_tune_params.extra_tuning_flags
+                  & AARCH64_EXTRA_TUNE_APPROX_SQRT_DF))))
+    {
+      aarch64_emit_approx_sqrt (operands[0], operands[1]);
+      DONE;
+    }
+})
+
+(define_insn "*sqrt<mode>2"
[(set (match_operand:GPF 0 "register_operand" "=w")
(sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
"TARGET_FLOAT"
@@ -151,5 +151,5 @@ PC relative literal loads.
mlow-precision-recip-sqrt
Common Var(flag_mrecip_low_precision_sqrt) Optimization
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
+When calculating the approximate square root or its approximate reciprocal,
+use one less step than otherwise, thus reducing latency and precision.
@@ -12887,10 +12887,11 @@ corresponding flag to the linker.
@item -mno-low-precision-recip-sqrt
@opindex -mlow-precision-recip-sqrt
@opindex -mno-low-precision-recip-sqrt
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
-This is only relevant if @option{-ffast-math} enables the reciprocal square root
-approximation, which in turn depends on the target processor.
+When calculating the approximate square root or its approximate reciprocal,
+use one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables
+the approximate square root or its approximate reciprocal,
+which in turn depends on the target processor.
@item -march=@var{name}
@opindex march
--
2.6.3