From e598b0df2e43f2e64254a0e1ddec608fd78025b1 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Mon, 4 Apr 2016 14:02:24 -0500
Subject: [PATCH 3/3] [AArch64] Emit division using the Newton series
2016-04-04 Evandro Menezes <e.menezes@samsung.com>
Wilco Dijkstra <Wilco.Dijkstra@arm.com>
gcc/
* config/aarch64/aarch64-protos.h
(cpu_approx_modes): Add new member "division".
(aarch64_emit_approx_div): Declare new function.
* config/aarch64/aarch64.c
(generic_approx_modes): New member "division".
(exynosm1_approx_modes): Likewise.
(xgene1_approx_modes): Likewise.
(aarch64_emit_approx_div): Define new function.
* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
* config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
* doc/invoke.texi (-mlow-precision-div): Describe new option.
---
gcc/config/aarch64/aarch64-protos.h | 2 +
gcc/config/aarch64/aarch64-simd.md | 14 +++++-
gcc/config/aarch64/aarch64.c | 92 +++++++++++++++++++++++++++++++++++++
gcc/config/aarch64/aarch64.md | 19 ++++++--
gcc/config/aarch64/aarch64.opt | 5 ++
gcc/doc/invoke.texi | 10 ++++
6 files changed, 137 insertions(+), 5 deletions(-)
@@ -192,6 +192,7 @@ struct cpu_branch_cost
/* Allowed modes for approximations. */
struct cpu_approx_modes
{
+ const unsigned int division; /* Division. */
const unsigned int sqrt; /* Square root. */
const unsigned int recip_sqrt; /* Reciprocal square root. */
};
@@ -387,6 +388,7 @@ void aarch64_relayout_simd_types (void);
void aarch64_reset_previous_fndecl (void);
void aarch64_save_restore_target_globals (tree);
bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_approx_div (rtx, rtx, rtx);
/* Initialize builtins for SIMD intrinsics. */
void init_aarch64_simd_builtins (void);
@@ -1509,7 +1509,19 @@
[(set_attr "type" "neon_fp_mul_<Vetype><q>")]
)
-(define_insn "div<mode>3"
+;; Expander for floating-point vector division.  The operation is first
+;; offered to aarch64_emit_approx_div; when that function reports that it
+;; emitted a sequence the expansion is complete.  Otherwise operand 1 is
+;; forced into a register so that the result matches the "*div<mode>3"
+;; insn, which only accepts register operands.
+;; NOTE(review): operand 1 is presumably a general_operand so that a
+;; constant numerator reaches aarch64_emit_approx_div unchanged, where it
+;; is compared against CONST1_RTX -- confirm.
+(define_expand "div<mode>3"
+ [(set (match_operand:VDQF 0 "register_operand")
+ (div:VDQF (match_operand:VDQF 1 "general_operand")
+ (match_operand:VDQF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+ /* Try the approximate sequence first.  */
+ if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+ DONE;
+
+ /* No approximation was emitted: use the division insn.  */
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(div:VDQF (match_operand:VDQF 1 "register_operand" "w")
(match_operand:VDQF 2 "register_operand" "w")))]
@@ -397,6 +397,7 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
/* Generic approximation modes. */
static const cpu_approx_modes generic_approx_modes =
{
+ AARCH64_APPROX_NONE, /* division */
AARCH64_APPROX_NONE, /* sqrt */
AARCH64_APPROX_NONE /* recip_sqrt */
};
@@ -404,6 +405,7 @@ static const cpu_approx_modes generic_approx_modes =
/* Approximation modes for Exynos M1. */
static const cpu_approx_modes exynosm1_approx_modes =
{
+ AARCH64_APPROX_NONE, /* division */
AARCH64_APPROX_ALL, /* sqrt */
AARCH64_APPROX_ALL /* recip_sqrt */
};
@@ -411,6 +413,7 @@ static const cpu_approx_modes exynosm1_approx_modes =
/* Approximation modes for Xgene1. */
static const cpu_approx_modes xgene1_approx_modes =
{
+ AARCH64_APPROX_NONE, /* division */
AARCH64_APPROX_NONE, /* sqrt */
AARCH64_APPROX_ALL /* recip_sqrt */
};
@@ -7486,6 +7489,95 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
return true;
}
+/* Type of the generator functions for the reciprocal initial estimate
+   insns; aarch64_emit_approx_div calls them as (result, operand).  */
+
+typedef rtx (*recpe_type) (rtx, rtx);
+
+/* Return the generator of the reciprocal initial estimate insn for MODE.
+   MODE must be one of SF, DF, V2SF, V4SF or V2DF; anything else is
+   treated as unreachable.  */
+
+static recpe_type
+get_recpe_type (machine_mode mode)
+{
+  if (mode == SFmode)
+    return gen_aarch64_frecpesf;
+  if (mode == V2SFmode)
+    return gen_aarch64_frecpev2sf;
+  if (mode == V4SFmode)
+    return gen_aarch64_frecpev4sf;
+  if (mode == DFmode)
+    return gen_aarch64_frecpedf;
+  if (mode == V2DFmode)
+    return gen_aarch64_frecpev2df;
+
+  gcc_unreachable ();
+}
+
+/* Type of the generator functions for the reciprocal series step insns;
+   aarch64_emit_approx_div calls them as (result, estimate, operand).  */
+
+typedef rtx (*recps_type) (rtx, rtx, rtx);
+
+/* Return the generator of the reciprocal series step insn for MODE.
+   MODE must be one of SF, DF, V2SF, V4SF or V2DF; anything else is
+   treated as unreachable.  */
+
+static recps_type
+get_recps_type (machine_mode mode)
+{
+  if (mode == SFmode)
+    return gen_aarch64_frecpssf;
+  if (mode == V2SFmode)
+    return gen_aarch64_frecpsv2sf;
+  if (mode == V4SFmode)
+    return gen_aarch64_frecpsv4sf;
+  if (mode == DFmode)
+    return gen_aarch64_frecpsdf;
+  if (mode == V2DFmode)
+    return gen_aarch64_frecpsv2df;
+
+  gcc_unreachable ();
+}
+
+/* Try to expand the division of NUM by DEN into QUO as an approximation:
+   a reciprocal estimate of DEN, refined by a fixed number of series steps
+   and, unless NUM is 1.0, multiplied by NUM.  Return true if the sequence
+   was emitted; return false, having emitted nothing, if the approximation
+   is not allowed or not requested.  */
+
+bool
+aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
+{
+  machine_mode mode = GET_MODE (quo);
+
+  /* The approximation is only used with finite, non-trapping, unsafe math
+     and never when optimizing the function for size.  */
+  if (!flag_finite_math_only)
+    return false;
+  if (flag_trapping_math)
+    return false;
+  if (!flag_unsafe_math_optimizations)
+    return false;
+  if (optimize_function_for_size_p (cfun))
+    return false;
+
+  /* It must also be requested, either by -mlow-precision-div or by the
+     tuning parameters for this mode.  */
+  bool tuned_for_mode_p = (aarch64_tune_params.approx_modes->division
+                           & AARCH64_APPROX_MODE (mode)) != 0;
+  if (!flag_mlow_precision_div && !tuned_for_mode_p)
+    return false;
+
+  /* Start from the initial estimate of the reciprocal of DEN.  */
+  rtx recip = gen_reg_rtx (mode);
+  recpe_type gen_estimate = get_recpe_type (mode);
+  emit_insn (gen_estimate (recip, den));
+
+  /* Two series steps for SF and three for DF; one fewer of either when
+     lower precision was requested in exchange for speed.  */
+  int steps = 2;
+  if (GET_MODE_INNER (mode) == DFmode)
+    steps = 3;
+  if (flag_mlow_precision_div)
+    steps -= 1;
+
+  /* Refine the estimate.  The factor computed by the last step is not
+     applied here; it is folded into the final multiplication below.  */
+  rtx factor = gen_reg_rtx (mode);
+  for (int remaining = steps - 1; remaining >= 0; remaining--)
+    {
+      recps_type gen_step = get_recps_type (mode);
+      emit_insn (gen_step (factor, recip, den));
+
+      if (remaining > 0)
+        emit_set_insn (recip, gen_rtx_MULT (mode, recip, factor));
+    }
+
+  /* The reciprocal of DEN is already at hand, so the numerator is only
+     multiplied in when it is not 1.0.  */
+  if (num != CONST1_RTX (mode))
+    {
+      rtx num_reg = force_reg (mode, num);
+      emit_set_insn (recip, gen_rtx_MULT (mode, recip, num_reg));
+    }
+
+  /* Apply the last factor and store the quotient.  */
+  emit_set_insn (quo, gen_rtx_MULT (mode, recip, factor));
+  return true;
+}
+
/* Return the number of instructions that can be issued per cycle. */
static int
aarch64_sched_issue_rate (void)
@@ -4674,11 +4674,22 @@
[(set_attr "type" "fmul<s>")]
)
-(define_insn "div<mode>3"
+;; Expander for scalar floating-point division.  Its condition is
+;; TARGET_FLOAT, the same as that of the "*div<mode>3" insn it feeds, so
+;; that scalar division is still expanded when floating point is enabled
+;; but SIMD is not.  The approximate sequence is only attempted with
+;; TARGET_SIMD, the condition under which it was previously reachable.
+(define_expand "div<mode>3"
+ [(set (match_operand:GPF 0 "register_operand")
+       (div:GPF (match_operand:GPF 1 "general_operand")
+                (match_operand:GPF 2 "register_operand")))]
+ "TARGET_FLOAT"
+{
+  /* Try the approximate sequence first.  */
+  if (TARGET_SIMD
+      && aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+    DONE;
+
+  /* No approximation was emitted: use the division insn, which only
+     accepts register operands.  */
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
[(set (match_operand:GPF 0 "register_operand" "=w")
- (div:GPF
- (match_operand:GPF 1 "register_operand" "w")
- (match_operand:GPF 2 "register_operand" "w")))]
+ (div:GPF (match_operand:GPF 1 "register_operand" "w")
+ (match_operand:GPF 2 "register_operand" "w")))]
"TARGET_FLOAT"
"fdiv\\t%<s>0, %<s>1, %<s>2"
[(set_attr "type" "fdiv<s>")]
@@ -158,3 +158,8 @@ mlow-precision-sqrt
Common Var(flag_mlow_precision_sqrt) Optimization
When calculating the approximate square root,
use one less step than otherwise, thus reducing latency and precision.
+
+mlow-precision-div
+Common Var(flag_mlow_precision_div) Optimization
+When calculating the approximate division,
+use one less step than otherwise, thus reducing latency and precision.
@@ -577,6 +577,7 @@ Objective-C and Objective-C++ Dialects}.
-mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol
-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
-mlow-precision-sqrt -mno-low-precision-sqrt@gol
+-mlow-precision-div -mno-low-precision-div @gol
-march=@var{name} -mcpu=@var{name} -mtune=@var{name}}
@emph{Adapteva Epiphany Options}
@@ -13010,6 +13011,15 @@ uses one less step than otherwise, thus reducing latency and precision.
This is only relevant if @option{-ffast-math} enables the square root
approximation.
+@item -mlow-precision-div
+@itemx -mno-low-precision-div
+@opindex mlow-precision-div
+@opindex mno-low-precision-div
+When calculating the division approximation,
+uses one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables the division
+approximation.
+
@item -march=@var{name}
@opindex march
Specify the name of the target architecture and, optionally, one or
--
2.6.3