From 0bdd18af83a82377dd6b954c4e64904f6022a2b2 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Mon, 4 Apr 2016 14:02:24 -0500
Subject: [PATCH 3/3] [AArch64] Emit division using the Newton series
2016-04-04 Evandro Menezes <e.menezes@samsung.com>
Wilco Dijkstra <Wilco.Dijkstra@arm.com>
gcc/
* config/aarch64/aarch64-protos.h
(tune_params): Add new member "approx_div_modes".
(aarch64_emit_approx_div): Declare new function.
* config/aarch64/aarch64.c
(generic_tunings): New member "approx_div_modes".
(cortexa35_tunings): Likewise.
(cortexa53_tunings): Likewise.
(cortexa57_tunings): Likewise.
(cortexa72_tunings): Likewise.
(exynosm1_tunings): Likewise.
(thunderx_tunings): Likewise.
(xgene1_tunings): Likewise.
(aarch64_emit_approx_div): Define new function.
* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
* config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
* doc/invoke.texi (-mlow-precision-div): Describe new option.
---
gcc/config/aarch64/aarch64-protos.h | 2 +
gcc/config/aarch64/aarch64-simd.md | 14 +++++-
gcc/config/aarch64/aarch64.c | 85 +++++++++++++++++++++++++++++++++++++
gcc/config/aarch64/aarch64.md | 19 +++++++--
gcc/config/aarch64/aarch64.opt | 5 +++
gcc/doc/invoke.texi | 10 +++++
6 files changed, 130 insertions(+), 5 deletions(-)
@@ -244,6 +244,7 @@ struct tune_params
} autoprefetcher_model;
unsigned int extra_tuning_flags;
+ unsigned int approx_div_modes;
unsigned int approx_sqrt_modes;
unsigned int approx_rsqrt_modes;
};
@@ -398,6 +399,7 @@ void aarch64_relayout_simd_types (void);
void aarch64_reset_previous_fndecl (void);
void aarch64_save_restore_target_globals (tree);
bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_approx_div (rtx, rtx, rtx);
/* Initialize builtins for SIMD intrinsics. */
void init_aarch64_simd_builtins (void);
@@ -1509,7 +1509,19 @@
[(set_attr "type" "neon_fp_mul_<Vetype><q>")]
)
-(define_insn "div<mode>3"
+;; Expand a vector floating-point division.  The preparation statements
+;; first offer the operation to aarch64_emit_approx_div; when that returns
+;; true, the sequence it emitted stands in for the division and the
+;; template below is not used.  Otherwise the dividend, accepted here as a
+;; general_operand, is forced into a register so that the generated
+;; (div ...) has register operands, as the "*div<mode>3" insn requires.
+(define_expand "div<mode>3"
+ [(set (match_operand:VDQF 0 "register_operand")
+ (div:VDQF (match_operand:VDQF 1 "general_operand")
+ (match_operand:VDQF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+ /* Operand 0 is the quotient, operand 1 the dividend, operand 2 the
+ divisor.  */
+ if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+ DONE;
+
+ /* No approximation was emitted: fall back to the plain division.  */
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(div:VDQF (match_operand:VDQF 1 "register_operand" "w")
(match_operand:VDQF 2 "register_operand" "w")))]
@@ -417,6 +417,7 @@ static const struct tune_params generic_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -444,6 +445,7 @@ static const struct tune_params cortexa35_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -471,6 +473,7 @@ static const struct tune_params cortexa53_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -498,6 +501,7 @@ static const struct tune_params cortexa57_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -525,6 +529,7 @@ static const struct tune_params cortexa72_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -551,6 +556,7 @@ static const struct tune_params exynosm1_tunings =
64, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_ALL), /* approx_sqrt_modes. */
(AARCH64_APPROX_ALL) /* approx_rsqrt_modes. */
};
@@ -577,6 +583,7 @@ static const struct tune_params thunderx_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -603,6 +610,7 @@ static const struct tune_params xgene1_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_ALL) /* approx_rsqrt_modes. */
};
@@ -7604,6 +7612,83 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
return true;
}
+/* Emit an instruction sequence that approximates the division of NUM by
+   DIV and stores the result in QUO.  The sequence starts from a reciprocal
+   estimate of DIV (frecpe), refines it with reciprocal-step instructions
+   (frecps) and finally multiplies by NUM.
+
+   Nothing is emitted and false is returned unless -ffinite-math-only,
+   -fno-trapping-math and -funsafe-math-optimizations are all in effect,
+   the current function is not optimized for size, and the approximation
+   is requested either by -mlow-precision-div or by the tuning parameters
+   for the mode of QUO.  Otherwise return true.
+
+   Only SFmode, DFmode, V2SFmode, V4SFmode and V2DFmode are handled.  */
+
+bool
+aarch64_emit_approx_div (rtx quo, rtx num, rtx div)
+{
+  machine_mode mode = GET_MODE (quo);
+
+  /* The approximation is only valid under the relaxed FP semantics.  */
+  if (!flag_finite_math_only
+      || flag_trapping_math
+      || !flag_unsafe_math_optimizations)
+    return false;
+
+  /* The sequence is longer than a single division.  */
+  if (optimize_function_for_size_p (cfun))
+    return false;
+
+  /* It must be requested, either explicitly or by the tuning for MODE.  */
+  if (!flag_mlow_precision_div
+      && !(aarch64_tune_params.approx_div_modes & AARCH64_APPROX_MODE (mode)))
+    return false;
+
+  /* Start from the estimate of the reciprocal of DIV.  */
+  rtx recip = gen_reg_rtx (mode);
+  switch (mode)
+    {
+    case SFmode:
+      emit_insn (gen_aarch64_frecpesf (recip, div));
+      break;
+    case V2SFmode:
+      emit_insn (gen_aarch64_frecpev2sf (recip, div));
+      break;
+    case V4SFmode:
+      emit_insn (gen_aarch64_frecpev4sf (recip, div));
+      break;
+    case DFmode:
+      emit_insn (gen_aarch64_frecpedf (recip, div));
+      break;
+    case V2DFmode:
+      emit_insn (gen_aarch64_frecpev2df (recip, div));
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Two refinement steps for single precision, three for double; one
+     fewer when -mlow-precision-div trades accuracy for speed.  */
+  int steps = GET_MODE_INNER (mode) == DFmode ? 3 : 2;
+  if (flag_mlow_precision_div)
+    steps -= 1;
+
+  /* Refine the estimate.  */
+  rtx corr = gen_reg_rtx (mode);
+  for (int step = 1; step <= steps; step++)
+    {
+      switch (mode)
+        {
+        case SFmode:
+          emit_insn (gen_aarch64_frecpssf (corr, recip, div));
+          break;
+        case V2SFmode:
+          emit_insn (gen_aarch64_frecpsv2sf (corr, recip, div));
+          break;
+        case V4SFmode:
+          emit_insn (gen_aarch64_frecpsv4sf (corr, recip, div));
+          break;
+        case DFmode:
+          emit_insn (gen_aarch64_frecpsdf (corr, recip, div));
+          break;
+        case V2DFmode:
+          emit_insn (gen_aarch64_frecpsv2df (corr, recip, div));
+          break;
+        default:
+          gcc_unreachable ();
+        }
+
+      /* The correction of the last step is applied by the final
+         multiplication below rather than here.  */
+      if (step < steps)
+        emit_set_insn (recip, gen_rtx_MULT (mode, recip, corr));
+    }
+
+  /* Scale by the dividend, unless it is the constant 1, in which case
+     only the reciprocal itself is wanted.  */
+  if (num != CONST1_RTX (mode))
+    {
+      rtx dividend = force_reg (mode, num);
+      emit_set_insn (recip, gen_rtx_MULT (mode, recip, dividend));
+    }
+
+  /* Apply the last correction and deliver the result.  */
+  emit_set_insn (quo, gen_rtx_MULT (mode, recip, corr));
+  return true;
+}
+
/* Return the number of instructions that can be issued per cycle. */
static int
aarch64_sched_issue_rate (void)
@@ -4665,11 +4665,22 @@
[(set_attr "type" "fmul<s>")]
)
-(define_insn "div<mode>3"
+;; Expand a scalar floating-point division.  The "*div<mode>3" insn that
+;; implements the plain FDIV is conditioned on TARGET_FLOAT, so this
+;; expander must be available under the same condition; otherwise the
+;; named division pattern disappears for targets that have FP but not
+;; SIMD.  The preparation statements first try the approximation; when it
+;; is not emitted, the dividend (accepted as a general_operand) is forced
+;; into a register for the insn.
+(define_expand "div<mode>3"
+  [(set (match_operand:GPF 0 "register_operand")
+        (div:GPF (match_operand:GPF 1 "general_operand")
+                 (match_operand:GPF 2 "register_operand")))]
+  "TARGET_FLOAT"
+{
+  /* Only attempt the approximation under TARGET_SIMD.
+     NOTE(review): presumably the frecpe/frecps patterns emitted by
+     aarch64_emit_approx_div are SIMD-only -- confirm.  */
+  if (TARGET_SIMD
+      && aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+    DONE;
+
+  /* No approximation was emitted: fall back to the plain division.  */
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
[(set (match_operand:GPF 0 "register_operand" "=w")
- (div:GPF
- (match_operand:GPF 1 "register_operand" "w")
- (match_operand:GPF 2 "register_operand" "w")))]
+ (div:GPF (match_operand:GPF 1 "register_operand" "w")
+ (match_operand:GPF 2 "register_operand" "w")))]
"TARGET_FLOAT"
"fdiv\\t%<s>0, %<s>1, %<s>2"
[(set_attr "type" "fdiv<s>")]
@@ -158,3 +158,8 @@ mlow-precision-sqrt
Common Var(flag_mlow_precision_sqrt) Optimization
When calculating the approximate square root,
use one less step than otherwise, thus reducing latency and precision.
+
+mlow-precision-div
+Common Var(flag_mlow_precision_div) Optimization
+Enable the approximate division and, when calculating it,
+use one less step than otherwise, thus reducing latency and precision.
@@ -575,6 +575,7 @@ Objective-C and Objective-C++ Dialects}.
-mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol
-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
-mlow-precision-sqrt -mno-low-precision-sqrt@gol
+-mlow-precision-div -mno-low-precision-div @gol
-march=@var{name} -mcpu=@var{name} -mtune=@var{name}}
@emph{Adapteva Epiphany Options}
@@ -12951,6 +12952,15 @@ uses one less step than otherwise, thus reducing latency and precision.
This is only relevant if @option{-ffast-math} enables the square root
approximation.
+@item -mlow-precision-div
+@itemx -mno-low-precision-div
+@opindex mlow-precision-div
+@opindex mno-low-precision-div
+Enable the division approximation even when the selected CPU tuning does
+not, and make it use one less step than otherwise, thus reducing latency
+and precision.  This only has an effect if @option{-ffast-math} (or the
+equivalent individual flags) is in effect and the code is not optimized
+for size.
+
+
@item -march=@var{name}
@opindex march
Specify the name of the target architecture and, optionally, one or
--
2.6.3