diff mbox

[AArch64] Add more precision choices for the reciprocal square root approximation

Message ID 56FECE90.9@samsung.com
State New
Headers show

Commit Message

Evandro Menezes April 1, 2016, 7:40 p.m. UTC
[AArch64] Add more choices for the reciprocal square root
    approximation

         Allow a target to prefer such operation depending on the
    operation mode.

         gcc/
             * config/aarch64/aarch64-protos.h
             (AARCH64_APPROX_MODE): New macro.
             (AARCH64_APPROX_{NONE,SP,DP,DFORM,QFORM,ALL}: Likewise.
             (tune_params): New member "approx_rsqrt_modes".
             * config/aarch64/aarch64-tuning-flags.def
             (AARCH64_EXTRA_TUNE_APPROX_RSQRT): Remove macro.
             * config/aarch64/aarch64.c
             (generic_tunings): New member "approx_rsqrt_modes".
             (cortexa35_tunings): Likewise.
             (cortexa53_tunings): Likewise.
             (cortexa57_tunings): Likewise.
             (cortexa72_tunings): Likewise.
             (exynosm1_tunings): Likewise.
             (thunderx_tunings): Likewise.
             (xgene1_tunings): Likewise.
             (use_rsqrt_p): New argument for the mode and use new member
             "approx_rsqrt_modes" from "tune_params".
             (aarch64_builtin_reciprocal): Devise mode from builtin.
             (aarch64_optab_supported_p): New argument for the mode.

This patch allows a target to choose the mode of this operation when it 
is beneficial to use the approximate version.

I hope that this gets in the ballpark of what's been discussed previously.

Thank you,

Comments

Wilco Dijkstra April 1, 2016, 11:08 p.m. UTC | #1
Evandro Menezes wrote:
>
> I hope that this gets in the ballpark of what's been discussed previously.

Yes that's very close to what I had in mind. A minor issue is that the vector 
modes cannot work as they start at MAX_MODE_FLOAT (which is > 32):

+/* Control approximate alternatives to certain FP operators.  */
+#define AARCH64_APPROX_MODE(MODE) \
+  ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \
+   ? (1 << ((MODE) - MIN_MODE_FLOAT)) \
+   : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \
+     ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT + MAX_MODE_FLOAT + 1)) \
+     : (0))

That should be: 

+     ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT + MAX_MODE_FLOAT - MIN_MODE_FLOAT + 1)) \

It would be worth testing all the obvious cases to be sure they work.

Also I don't think it is a good idea to enable all modes on Exynos-M1 and XGene-1 -
I haven't seen any evidence that shows it gives a speedup on real code for all modes
(or at least on a good micro benchmark like the unit vector test I suggested - a simple
throughput test does not count!).

The issue is it hides performance gains from an improved divider/sqrt on new revisions
or microarchitectures. That means you should only enable cases where there is evidence
of a major speedup that cannot be matched by a future improved divider/sqrt.

Wilco
diff mbox

Patch

From 17ac33719bae8966a481cc833c9ac062f7fb742f Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Thu, 3 Mar 2016 18:13:46 -0600
Subject: [PATCH] [AArch64] Add more choices for the reciprocal square root
 approximation

Allow a target to prefer such operation depending on the operation mode.

gcc/
	* config/aarch64/aarch64-protos.h
	(AARCH64_APPROX_MODE): New macro.
	(AARCH64_APPROX_{NONE,SP,DP,DFORM,QFORM,ALL}: Likewise.
	(tune_params): New member "approx_rsqrt_modes".
	* config/aarch64/aarch64-tuning-flags.def
	(AARCH64_EXTRA_TUNE_APPROX_RSQRT): Remove macro.
	* config/aarch64/aarch64.c
	(generic_tunings): New member "approx_rsqrt_modes".
	(cortexa35_tunings): Likewise.
	(cortexa53_tunings): Likewise.
	(cortexa57_tunings): Likewise.
	(cortexa72_tunings): Likewise.
	(exynosm1_tunings): Likewise.
	(thunderx_tunings): Likewise.
	(xgene1_tunings): Likewise.
	(use_rsqrt_p): New argument for the mode and use new member
	"approx_rsqrt_modes" from "tune_params".
	(aarch64_builtin_reciprocal): Devise mode from builtin.
	(aarch64_optab_supported_p): New argument for the mode.
---
 gcc/config/aarch64/aarch64-protos.h         | 21 ++++++++++++++++
 gcc/config/aarch64/aarch64-tuning-flags.def |  2 --
 gcc/config/aarch64/aarch64.c                | 39 ++++++++++++++++++-----------
 3 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 58c9d0d..46f4f18 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -178,6 +178,26 @@  struct cpu_branch_cost
   const int unpredictable;  /* Unpredictable branch or optimizing for speed.  */
 };
 
+/* Control approximate alternatives to certain FP operators.  */
+#define AARCH64_APPROX_MODE(MODE) \
+  ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \
+   ? (1 << ((MODE) - MIN_MODE_FLOAT)) \
+   : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \
+     ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT + MAX_MODE_FLOAT + 1)) \
+     : (0))
+#define AARCH64_APPROX_NONE (0)
+#define AARCH64_APPROX_SP (AARCH64_APPROX_MODE (SFmode) \
+			   | AARCH64_APPROX_MODE (V2SFmode) \
+			   | AARCH64_APPROX_MODE (V4SFmode))
+#define AARCH64_APPROX_DP (AARCH64_APPROX_MODE (DFmode) \
+			   | AARCH64_APPROX_MODE (V2DFmode))
+#define AARCH64_APPROX_DFORM (AARCH64_APPROX_MODE (SFmode) \
+			      | AARCH64_APPROX_MODE (DFmode) \
+			      | AARCH64_APPROX_MODE (V2SFmode))
+#define AARCH64_APPROX_QFORM (AARCH64_APPROX_MODE (V4SFmode) \
+			      | AARCH64_APPROX_MODE (V2DFmode))
+#define AARCH64_APPROX_ALL (-1)
+
 struct tune_params
 {
   const struct cpu_cost_table *insn_extra_cost;
@@ -218,6 +238,7 @@  struct tune_params
   } autoprefetcher_model;
 
   unsigned int extra_tuning_flags;
+  unsigned int approx_rsqrt_modes;
 };
 
 #define AARCH64_FUSION_PAIR(x, name) \
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..048c2a3 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,5 +29,3 @@ 
      AARCH64_TUNE_ to give an enum name. */
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
-AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b7086dd..b0ee11e 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -38,6 +38,7 @@ 
 #include "recog.h"
 #include "diagnostic.h"
 #include "insn-attr.h"
+#include "insn-modes.h"
 #include "alias.h"
 #include "fold-const.h"
 #include "stor-layout.h"
@@ -414,7 +415,8 @@  static const struct tune_params generic_tunings =
   0,	/* max_case_values.  */
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
 
 static const struct tune_params cortexa35_tunings =
@@ -439,7 +441,8 @@  static const struct tune_params cortexa35_tunings =
   0,	/* max_case_values.  */
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
 
 static const struct tune_params cortexa53_tunings =
@@ -464,7 +467,8 @@  static const struct tune_params cortexa53_tunings =
   0,	/* max_case_values.  */
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
 
 static const struct tune_params cortexa57_tunings =
@@ -489,7 +493,8 @@  static const struct tune_params cortexa57_tunings =
   0,	/* max_case_values.  */
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
 
 static const struct tune_params cortexa72_tunings =
@@ -514,7 +519,8 @@  static const struct tune_params cortexa72_tunings =
   0,	/* max_case_values.  */
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
 
 static const struct tune_params exynosm1_tunings =
@@ -538,7 +544,8 @@  static const struct tune_params exynosm1_tunings =
   48,	/* max_case_values.  */
   64,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
+  (AARCH64_APPROX_ALL) /* approx_rsqrt_modes.  */
 };
 
 static const struct tune_params thunderx_tunings =
@@ -562,7 +569,8 @@  static const struct tune_params thunderx_tunings =
   0,	/* max_case_values.  */
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_NONE)	/* approx_rsqrt_modes.  */
 };
 
 static const struct tune_params xgene1_tunings =
@@ -586,7 +594,8 @@  static const struct tune_params xgene1_tunings =
   0,	/* max_case_values.  */
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_APPROX_RSQRT)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  (AARCH64_APPROX_ALL)	/* approx_rsqrt_modes.  */
 };
 
 /* Support for fine-grained override of the tuning structures.  */
@@ -7452,12 +7461,12 @@  aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
    to optimize 1.0/sqrt.  */
 
 static bool
-use_rsqrt_p (void)
+use_rsqrt_p (machine_mode mode)
 {
   return (!flag_trapping_math
 	  && flag_unsafe_math_optimizations
-	  && ((aarch64_tune_params.extra_tuning_flags
-	       & AARCH64_EXTRA_TUNE_APPROX_RSQRT)
+	  && ((aarch64_tune_params.approx_rsqrt_modes
+	       & AARCH64_APPROX_MODE (mode))
 	      || flag_mrecip_low_precision_sqrt));
 }
 
@@ -7467,7 +7476,9 @@  use_rsqrt_p (void)
 static tree
 aarch64_builtin_reciprocal (tree fndecl)
 {
-  if (!use_rsqrt_p ())
+  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
+
+  if (!use_rsqrt_p (mode))
     return NULL_TREE;
   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
 }
@@ -13964,13 +13975,13 @@  aarch64_promoted_type (const_tree t)
 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
 
 static bool
-aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
 			   optimization_type opt_type)
 {
   switch (op)
     {
     case rsqrt_optab:
-      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
+      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
 
     default:
       return true;
-- 
1.9.1