diff mbox

[1/3,AArch64] Add more choices for the reciprocal square root approximation

Message ID 5751FB88.4050906@samsung.com
State New
Headers show

Commit Message

Evandro Menezes June 3, 2016, 9:50 p.m. UTC
On 06/01/16 03:35, James Greenhalgh wrote:
> On Fri, May 27, 2016 at 05:57:23PM -0500, Evandro Menezes wrote:
>>  From 86d7690632d03ec85fd69bfaef8e89c0542518ad Mon Sep 17 00:00:00 2001
>> From: Evandro Menezes <e.menezes@samsung.com>
>> Date: Thu, 3 Mar 2016 18:13:46 -0600
>> Subject: [PATCH 1/3] [AArch64] Add more choices for the reciprocal square root
>>   approximation
>>
>> Allow a target to prefer such operation depending on the operation mode.
>>
>> 2016-03-03  Evandro Menezes  <e.menezes@samsung.com>
>>
>> gcc/
>> 	* config/aarch64/aarch64-protos.h
>> 	(AARCH64_APPROX_MODE): New macro.
>> 	(AARCH64_APPROX_{NONE,ALL}): Likewise.
>> 	(cpu_approx_modes): New structure.
>> 	(tune_params): New member "approx_modes".
>> 	* config/aarch64/aarch64-tuning-flags.def
>> 	(AARCH64_EXTRA_TUNE_APPROX_RSQRT): Remove macro.
>> 	* config/aarch64/aarch64.c
>> 	({generic,exynosm1,xgene1}_approx_modes): New core
>> 	"cpu_approx_modes" structures.
>> 	(generic_tunings): New member "approx_modes".
>> 	(cortexa35_tunings): Likewise.
>> 	(cortexa53_tunings): Likewise.
>> 	(cortexa57_tunings): Likewise.
>> 	(cortexa72_tunings): Likewise.
>> 	(exynosm1_tunings): Likewise.
>> 	(thunderx_tunings): Likewise.
>> 	(xgene1_tunings): Likewise.
>> 	(use_rsqrt_p): New argument for the mode and use new member from
>> 	"tune_params".
>> 	(aarch64_builtin_reciprocal): Devise mode from builtin.
>> 	(aarch64_optab_supported_p): New argument for the mode.
>> 	* doc/invoke.texi (-mlow-precision-recip-sqrt): Reword description.
> We're almost there, just a couple of style comments left on this one
> I think. Thanks for sticking with it so far.
>
>> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
>> index f22a31c..6156281 100644
>> --- a/gcc/config/aarch64/aarch64-protos.h
>> +++ b/gcc/config/aarch64/aarch64-protos.h
>> @@ -178,6 +178,23 @@ struct cpu_branch_cost
>>     const int unpredictable;  /* Unpredictable branch or optimizing for speed.  */
>>   };
>>   
>> +/* Control approximate alternatives to certain FP operators.  */
>> +#define AARCH64_APPROX_MODE(MODE) \
>> +  ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \
>> +   ? (1 << ((MODE) - MIN_MODE_FLOAT)) \
>> +   : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \
>> +     ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT \
>> +	      + MAX_MODE_FLOAT - MIN_MODE_FLOAT + 1)) \
>> +     : (0))
>> +#define AARCH64_APPROX_NONE (0)
>> +#define AARCH64_APPROX_ALL (-1)
>> +
>> +/* Allowed modes for approximations.  */
>> +struct cpu_approx_modes
>> +{
>> +  const unsigned int recip_sqrt; /* Reciprocal square root.  */
>> +};
>> +
>>   struct tune_params
>>   {
>>     const struct cpu_cost_table *insn_extra_cost;
>> @@ -218,6 +235,8 @@ struct tune_params
>>     } autoprefetcher_model;
>>   
>>     unsigned int extra_tuning_flags;
>> +
>> +  const struct cpu_approx_modes *approx_modes;
> Sorry to be irritating, but would you mind putting this up beside the
> other "struct" members of tune_params. So directly under the branch_costs
> member?

OK

>>   };
>>   
>>   #define AARCH64_FUSION_PAIR(x, name) \
>
>> diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
>> index 7e45a0c..048c2a3 100644
>> --- a/gcc/config/aarch64/aarch64-tuning-flags.def
>> +++ b/gcc/config/aarch64/aarch64-tuning-flags.def
>> @@ -29,5 +29,3 @@
>>        AARCH64_TUNE_ to give an enum name. */
>>   
>>   AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
>> -AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
>> -
>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>> index 9995494..e532cfc 100644
>> --- a/gcc/config/aarch64/aarch64.c
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -38,6 +38,7 @@
>>   #include "recog.h"
>>   #include "diagnostic.h"
>>   #include "insn-attr.h"
>> +#include "insn-modes.h"
> Do we need this include? My build test of just this patch suggests not (maybe
> you need it later in the series?). This can probably be dropped.

OK

>>   #include "alias.h"
>>   #include "fold-const.h"
>>   #include "stor-layout.h"
>> @@ -393,6 +394,24 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
>>     3   /* Unpredictable.  */
>>   };
>>   
>> +/* Generic approximation modes.  */
>> +static const cpu_approx_modes generic_approx_modes =
>> +{
>> +  AARCH64_APPROX_NONE	/* recip_sqrt  */
>> +};
>> +
>> +/* Approximation modes for Exynos M1.  */
>> +static const cpu_approx_modes exynosm1_approx_modes =
>> +{
>> +  AARCH64_APPROX_ALL	/* recip_sqrt  */
>> +};
>> +
>> +/* Approximation modes for Xgene1.  */
> This should be "X-Gene 1" looking at how Applied Micro style it in their
> marketing materials.

OK

>> +static const cpu_approx_modes xgene1_approx_modes =
>> +{
>> +  AARCH64_APPROX_ALL	/* recip_sqrt  */
>> +};
>> +
>>   static bool
>> -use_rsqrt_p (void)
>> +use_rsqrt_p (machine_mode mode)
>>   {
>>     return (!flag_trapping_math
>>   	  && flag_unsafe_math_optimizations
>> -	  && ((aarch64_tune_params.extra_tuning_flags
>> -	       & AARCH64_EXTRA_TUNE_APPROX_RSQRT)
>> +	  && ((aarch64_tune_params.approx_modes->recip_sqrt
>> +	       & AARCH64_APPROX_MODE (mode))
>>   	      || flag_mrecip_low_precision_sqrt));
>>   }
>>   
>> @@ -7467,7 +7494,9 @@ use_rsqrt_p (void)
>>   static tree
>>   aarch64_builtin_reciprocal (tree fndecl)
>>   {
>> -  if (!use_rsqrt_p ())
>> +  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
>> +
>> +  if (!use_rsqrt_p (mode))
>>       return NULL_TREE;
>>     return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
>>   }
>> @@ -13889,13 +13918,13 @@ aarch64_promoted_type (const_tree t)
>>   /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
>>   
>>   static bool
>> -aarch64_optab_supported_p (int op, machine_mode, machine_mode,
>> +aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
>>   			   optimization_type opt_type)
>>   {
>>     switch (op)
>>       {
>>       case rsqrt_optab:
>> -      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
>> +      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
>>   
>>       default:
>>         return true;
>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>> index f1ac257..4340b08 100644
>> --- a/gcc/doc/invoke.texi
>> +++ b/gcc/doc/invoke.texi
>> @@ -12939,7 +12939,7 @@ corresponding flag to the linker.
>>   When calculating the reciprocal square root approximation,
>>   uses one less step than otherwise, thus reducing latency and precision.
>>   This is only relevant if @option{-ffast-math} enables the reciprocal square root
>> -approximation, which in turn depends on the target processor.
>> +approximation.
> This hunk no longer applies on trunk over Wilco's changes. Can you check
> whether you're happy with the wording that is on trunk now, and drop or
> update this hunk as appropriate.

OK

Thank you,

Comments

James Greenhalgh June 13, 2016, 10 a.m. UTC | #1
On Fri, Jun 03, 2016 at 04:50:00PM -0500, Evandro Menezes wrote:
> From 763562f829d4fec54d21555b2bfd6478d449294f Mon Sep 17 00:00:00 2001
> From: Evandro Menezes <e.menezes@samsung.com>
> Date: Thu, 3 Mar 2016 18:13:46 -0600
> Subject: [PATCH 1/3] [AArch64] Add more choices for the reciprocal square root
>  approximation
> 
> Allow a target to prefer such operation depending on the operation mode.

This is OK for trunk.

Thanks for the patch, and your work getting it ready for trunk.

Thanks,
James

> 
> 2016-03-03  Evandro Menezes  <e.menezes@samsung.com>
> 
> gcc/
> 	* config/aarch64/aarch64-protos.h
> 	(AARCH64_APPROX_MODE): New macro.
> 	(AARCH64_APPROX_{NONE,ALL}): Likewise.
> 	(cpu_approx_modes): New structure.
> 	(tune_params): New member "approx_modes".
> 	* config/aarch64/aarch64-tuning-flags.def
> 	(AARCH64_EXTRA_TUNE_APPROX_RSQRT): Remove macro.
> 	* config/aarch64/aarch64.c
> 	({generic,exynosm1,xgene1}_approx_modes): New core
> 	"cpu_approx_modes" structures.
> 	(generic_tunings): New member "approx_modes".
> 	(cortexa35_tunings): Likewise.
> 	(cortexa53_tunings): Likewise.
> 	(cortexa57_tunings): Likewise.
> 	(cortexa72_tunings): Likewise.
> 	(exynosm1_tunings): Likewise.
> 	(thunderx_tunings): Likewise.
> 	(xgene1_tunings): Likewise.
> 	(use_rsqrt_p): New argument for the mode and use new member from
> 	"tune_params".
> 	(aarch64_builtin_reciprocal): Devise mode from builtin.
> 	(aarch64_optab_supported_p): New argument for the mode.
> 	* doc/invoke.texi (-mlow-precision-recip-sqrt): Reword description.
diff mbox

Patch

From 763562f829d4fec54d21555b2bfd6478d449294f Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Thu, 3 Mar 2016 18:13:46 -0600
Subject: [PATCH 1/3] [AArch64] Add more choices for the reciprocal square root
 approximation

Allow a target to prefer such operation depending on the operation mode.

2016-03-03  Evandro Menezes  <e.menezes@samsung.com>

gcc/
	* config/aarch64/aarch64-protos.h
	(AARCH64_APPROX_MODE): New macro.
	(AARCH64_APPROX_{NONE,ALL}): Likewise.
	(cpu_approx_modes): New structure.
	(tune_params): New member "approx_modes".
	* config/aarch64/aarch64-tuning-flags.def
	(AARCH64_EXTRA_TUNE_APPROX_RSQRT): Remove macro.
	* config/aarch64/aarch64.c
	({generic,exynosm1,xgene1}_approx_modes): New core
	"cpu_approx_modes" structures.
	(generic_tunings): New member "approx_modes".
	(cortexa35_tunings): Likewise.
	(cortexa53_tunings): Likewise.
	(cortexa57_tunings): Likewise.
	(cortexa72_tunings): Likewise.
	(exynosm1_tunings): Likewise.
	(thunderx_tunings): Likewise.
	(xgene1_tunings): Likewise.
	(use_rsqrt_p): New argument for the mode and use new member from
	"tune_params".
	(aarch64_builtin_reciprocal): Devise mode from builtin.
	(aarch64_optab_supported_p): New argument for the mode.
	* doc/invoke.texi (-mlow-precision-recip-sqrt): Reword description.
---
 gcc/config/aarch64/aarch64-protos.h         | 18 ++++++++++++
 gcc/config/aarch64/aarch64-tuning-flags.def |  2 --
 gcc/config/aarch64/aarch64.c                | 44 +++++++++++++++++++++++------
 gcc/doc/invoke.texi                         |  2 +-
 4 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ab9b37a..1c56a1d 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -178,6 +178,23 @@  struct cpu_branch_cost
   const int unpredictable;  /* Unpredictable branch or optimizing for speed.  */
 };
 
+/* Control approximate alternatives to certain FP operators.  */
+#define AARCH64_APPROX_MODE(MODE) \
+  ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \
+   ? (1 << ((MODE) - MIN_MODE_FLOAT)) \
+   : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \
+     ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT \
+	      + MAX_MODE_FLOAT - MIN_MODE_FLOAT + 1)) \
+     : (0))
+#define AARCH64_APPROX_NONE (0)
+#define AARCH64_APPROX_ALL (-1)
+
+/* Allowed modes for approximations.  */
+struct cpu_approx_modes
+{
+  const unsigned int recip_sqrt; /* Reciprocal square root.  */
+};
+
 struct tune_params
 {
   const struct cpu_cost_table *insn_extra_cost;
@@ -185,6 +202,7 @@  struct tune_params
   const struct cpu_regmove_cost *regmove_cost;
   const struct cpu_vector_cost *vec_costs;
   const struct cpu_branch_cost *branch_costs;
+  const struct cpu_approx_modes *approx_modes;
   int memmov_cost;
   int issue_rate;
   unsigned int fusible_ops;
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..048c2a3 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,5 +29,3 @@ 
      AARCH64_TUNE_ to give an enum name. */
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
-AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ad07fe1..f7a3bd6 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -393,6 +393,24 @@  static const struct cpu_branch_cost cortexa57_branch_cost =
   3   /* Unpredictable.  */
 };
 
+/* Generic approximation modes.  */
+static const cpu_approx_modes generic_approx_modes =
+{
+  AARCH64_APPROX_NONE	/* recip_sqrt  */
+};
+
+/* Approximation modes for Exynos M1.  */
+static const cpu_approx_modes exynosm1_approx_modes =
+{
+  AARCH64_APPROX_ALL	/* recip_sqrt  */
+};
+
+/* Approximation modes for X-Gene 1.  */
+static const cpu_approx_modes xgene1_approx_modes =
+{
+  AARCH64_APPROX_ALL	/* recip_sqrt  */
+};
+
 static const struct tune_params generic_tunings =
 {
   &cortexa57_extra_costs,
@@ -400,6 +418,7 @@  static const struct tune_params generic_tunings =
   &generic_regmove_cost,
   &generic_vector_cost,
   &generic_branch_cost,
+  &generic_approx_modes,
   4, /* memmov_cost  */
   2, /* issue_rate  */
   AARCH64_FUSE_NOTHING, /* fusible_ops  */
@@ -424,6 +443,7 @@  static const struct tune_params cortexa35_tunings =
   &cortexa53_regmove_cost,
   &generic_vector_cost,
   &generic_branch_cost,
+  &generic_approx_modes,
   4, /* memmov_cost  */
   1, /* issue_rate  */
   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -449,6 +469,7 @@  static const struct tune_params cortexa53_tunings =
   &cortexa53_regmove_cost,
   &generic_vector_cost,
   &generic_branch_cost,
+  &generic_approx_modes,
   4, /* memmov_cost  */
   2, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -474,6 +495,7 @@  static const struct tune_params cortexa57_tunings =
   &cortexa57_regmove_cost,
   &cortexa57_vector_cost,
   &cortexa57_branch_cost,
+  &generic_approx_modes,
   4, /* memmov_cost  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -499,6 +521,7 @@  static const struct tune_params cortexa72_tunings =
   &cortexa57_regmove_cost,
   &cortexa57_vector_cost,
   &generic_branch_cost,
+  &generic_approx_modes,
   4, /* memmov_cost  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -524,6 +547,7 @@  static const struct tune_params exynosm1_tunings =
   &exynosm1_regmove_cost,
   &exynosm1_vector_cost,
   &generic_branch_cost,
+  &exynosm1_approx_modes,
   4,	/* memmov_cost  */
   3,	/* issue_rate  */
   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
@@ -538,7 +562,7 @@  static const struct tune_params exynosm1_tunings =
   48,	/* max_case_values.  */
   64,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
 };
 
 static const struct tune_params thunderx_tunings =
@@ -548,6 +572,7 @@  static const struct tune_params thunderx_tunings =
   &thunderx_regmove_cost,
   &generic_vector_cost,
   &generic_branch_cost,
+  &generic_approx_modes,
   6, /* memmov_cost  */
   2, /* issue_rate  */
   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
@@ -572,6 +597,7 @@  static const struct tune_params xgene1_tunings =
   &xgene1_regmove_cost,
   &xgene1_vector_cost,
   &generic_branch_cost,
+  &xgene1_approx_modes,
   6, /* memmov_cost  */
   4, /* issue_rate  */
   AARCH64_FUSE_NOTHING, /* fusible_ops  */
@@ -586,7 +612,7 @@  static const struct tune_params xgene1_tunings =
   0,	/* max_case_values.  */
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_APPROX_RSQRT)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
 
 /* Support for fine-grained override of the tuning structures.  */
@@ -7319,12 +7345,12 @@  aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
    to optimize 1.0/sqrt.  */
 
 static bool
-use_rsqrt_p (void)
+use_rsqrt_p (machine_mode mode)
 {
   return (!flag_trapping_math
 	  && flag_unsafe_math_optimizations
-	  && ((aarch64_tune_params.extra_tuning_flags
-	       & AARCH64_EXTRA_TUNE_APPROX_RSQRT)
+	  && ((aarch64_tune_params.approx_modes->recip_sqrt
+	       & AARCH64_APPROX_MODE (mode))
 	      || flag_mrecip_low_precision_sqrt));
 }
 
@@ -7334,7 +7360,9 @@  use_rsqrt_p (void)
 static tree
 aarch64_builtin_reciprocal (tree fndecl)
 {
-  if (!use_rsqrt_p ())
+  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
+
+  if (!use_rsqrt_p (mode))
     return NULL_TREE;
   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
 }
@@ -13730,13 +13758,13 @@  aarch64_promoted_type (const_tree t)
 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
 
 static bool
-aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
 			   optimization_type opt_type)
 {
   switch (op)
     {
     case rsqrt_optab:
-      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
+      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
 
     default:
       return true;
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 89bd983..d5391c1 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -13014,7 +13014,7 @@  corresponding flag to the linker.
 @item -mno-low-precision-recip-sqrt
 @opindex mlow-precision-recip-sqrt
 @opindex mno-low-precision-recip-sqrt
-Enable or disable reciprocal square root approximation.
+Enable or disable the reciprocal square root approximation.
 This option only has an effect if @option{-ffast-math} or
 @option{-funsafe-math-optimizations} is used as well.  Enabling this reduces
 precision of reciprocal square root results to about 16 bits for
-- 
2.6.3