diff mbox

[AArch64] Emit square root using the Newton series

Message ID 56F43BF2.7020407@samsung.com
State New
Headers show

Commit Message

Evandro Menezes March 24, 2016, 7:11 p.m. UTC
On 03/17/16 17:46, Evandro Menezes wrote:
> This patch refactors the function to emit the reciprocal square root 
> approximation to also emit the square root approximation.

    2016-03-23  Evandro Menezes <e.menezes@samsung.com>
                 Wilco Dijkstra  <wilco.dijkstra@arm.com>

    gcc/
             * config/aarch64/aarch64-tuning-flags.def
             (AARCH64_EXTRA_TUNE_APPROX_SQRT_{SF,DF}): New tuning macros.
             * config/aarch64/aarch64-protos.h
             (aarch64_emit_approx_rsqrt): Replace with
    "aarch64_emit_approx_sqrt".
             (AARCH64_EXTRA_TUNE_APPROX_SQRT): New macro.
             * config/aarch64/aarch64.c
             (exynosm1_tunings): Use the new macro.
             (aarch64_emit_approx_sqrt): Define new function.
             (aarch64_override_options_after_change_1): Handle new option.
             * config/aarch64/aarch64.md
             (rsqrt<mode>2): Use new function instead.
             (sqrt<mode>2): New expansion and insn definitions.
             * config/aarch64/aarch64-simd.md: Likewise.
             * config/aarch64/aarch64.opt
             (mlow-precision-sqrt): Add new option description.
             * doc/invoke.texi (mlow-precision-sqrt): Likewise.

This version of the patch cleans up the changes to the MD files and 
fixes some bugs introduced in it since the first proposal.

Thanks for your feedback,
diff mbox

Patch

From 712e330bf651393bb788e85ebe7b3d9a37f54ae7 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Thu, 17 Mar 2016 17:39:55 -0500
Subject: [PATCH] [AArch64] Emit square root using the Newton series

2016-03-23  Evandro Menezes  <e.menezes@samsung.com>
            Wilco Dijkstra  <wilco.dijkstra@arm.com>

gcc/
	* config/aarch64/aarch64-tuning-flags.def
	(AARCH64_EXTRA_TUNE_APPROX_SQRT_{SF,DF}): New tuning macros.
	* config/aarch64/aarch64-protos.h
	(aarch64_emit_approx_rsqrt): Replace with "aarch64_emit_approx_sqrt".
	(AARCH64_EXTRA_TUNE_APPROX_SQRT): New macro.
	* config/aarch64/aarch64.c
	(exynosm1_tunings): Use the new macro.
	(aarch64_emit_approx_sqrt): Define new function.
	(aarch64_override_options_after_change_1): Handle new option.
	* config/aarch64/aarch64.md
	(rsqrt<mode>2): Use new function instead.
	(sqrt<mode>2): New expansion and insn definitions.
	* config/aarch64/aarch64-simd.md: Likewise.
	* config/aarch64/aarch64.opt
	(mlow-precision-sqrt): Add new option description.
	* doc/invoke.texi (mlow-precision-sqrt): Likewise.
---
 gcc/config/aarch64/aarch64-protos.h         |   5 +-
 gcc/config/aarch64/aarch64-simd.md          |  13 ++-
 gcc/config/aarch64/aarch64-tuning-flags.def |   3 +-
 gcc/config/aarch64/aarch64.c                | 129 ++++++++++++++++++++++------
 gcc/config/aarch64/aarch64.md               |  11 ++-
 gcc/config/aarch64/aarch64.opt              |   9 +-
 gcc/doc/invoke.texi                         |  10 +++
 7 files changed, 147 insertions(+), 33 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index dced209..24c2125 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -263,6 +263,9 @@  enum aarch64_extra_tuning_flags
 };
 #undef AARCH64_EXTRA_TUNING_OPTION
 
+#define AARCH64_EXTRA_TUNE_APPROX_SQRT \
+  (AARCH64_EXTRA_TUNE_APPROX_SQRT_DF | AARCH64_EXTRA_TUNE_APPROX_SQRT_SF)
+
 extern struct tune_params aarch64_tune_params;
 
 HOST_WIDE_INT aarch64_initial_elimination_offset (unsigned, unsigned);
@@ -361,7 +364,7 @@  void aarch64_register_pragmas (void);
 void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 void aarch64_save_restore_target_globals (tree);
-void aarch64_emit_approx_rsqrt (rtx, rtx);
+bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce..47ccb18 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -405,7 +405,7 @@ 
 		     UNSPEC_RSQRT))]
   "TARGET_SIMD"
 {
-  aarch64_emit_approx_rsqrt (operands[0], operands[1]);
+  aarch64_emit_approx_sqrt (operands[0], operands[1], true);
   DONE;
 })
 
@@ -4307,7 +4307,16 @@ 
 
 ;; sqrt
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VDQF 0 "register_operand")
+	(sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+  "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+    DONE;
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:VDQF 0 "register_operand" "=w")
         (sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
   "TARGET_SIMD"
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..725a79c 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -30,4 +30,5 @@ 
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
 AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
+AARCH64_EXTRA_TUNING_OPTION ("approx_sqrt", APPROX_SQRT_DF)
+AARCH64_EXTRA_TUNING_OPTION ("approx_sqrtf", APPROX_SQRT_SF)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ed0daa5..b155b74 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -38,6 +38,7 @@ 
 #include "recog.h"
 #include "diagnostic.h"
 #include "insn-attr.h"
+#include "insn-flags.h"
 #include "alias.h"
 #include "fold-const.h"
 #include "stor-layout.h"
@@ -538,7 +539,8 @@  static const struct tune_params exynosm1_tunings =
   48,	/* max_case_values.  */
   64,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_APPROX_RSQRT
+   | AARCH64_EXTRA_TUNE_APPROX_SQRT) /* tune_flags.  */
 };
 
 static const struct tune_params thunderx_tunings =
@@ -7498,46 +7500,115 @@  get_rsqrts_type (machine_mode mode)
   }
 }
 
-/* Emit instruction sequence to compute the reciprocal square root using the
-   Newton-Raphson series.  Iterate over the series twice for SF
-   and thrice for DF.  */
+/* Emit instruction sequence to compute either the approximate square root
+   or its approximate reciprocal.  */
 
-void
-aarch64_emit_approx_rsqrt (rtx dst, rtx src)
+bool
+aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
 {
-  machine_mode mode = GET_MODE (src);
-  gcc_assert (
-    mode == SFmode || mode == V2SFmode || mode == V4SFmode
-	|| mode == DFmode || mode == V2DFmode);
+  machine_mode mode = GET_MODE (dst);
+  machine_mode mmsk = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
+				       GET_MODE_NUNITS (mode));
+  bool scalar = !VECTOR_MODE_P (mode);
+
+  if (!flag_finite_math_only
+      || flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || optimize_function_for_size_p (cfun)
+      || !((recp && (flag_mrecip_low_precision_sqrt
+		     || (aarch64_tune_params.extra_tuning_flags
+			 & AARCH64_EXTRA_TUNE_APPROX_RSQRT)))
+	   || (!recp && (flag_mlow_precision_sqrt
+			 || (GET_MODE_INNER (mode) == SFmode
+			     && (aarch64_tune_params.extra_tuning_flags
+				 & AARCH64_EXTRA_TUNE_APPROX_SQRT_SF))
+			 || (GET_MODE_INNER (mode) == DFmode
+			     && (aarch64_tune_params.extra_tuning_flags
+				 & AARCH64_EXTRA_TUNE_APPROX_SQRT_DF))))))
+    return false;
 
-  rtx xsrc = gen_reg_rtx (mode);
-  emit_move_insn (xsrc, src);
-  rtx x0 = gen_reg_rtx (mode);
+  rtx xne, xmsk;
+  if (!recp)
+    {
+      /* When calculating the approximate square root...  */
+      if (scalar)
+	{
+	  /* Compare argument with 0.0 and set the CC.  */
+	  rtx xcc = aarch64_gen_compare_reg (NE, src, CONST0_RTX (mode));
+	  xne = gen_rtx_NE (VOIDmode, xcc, const0_rtx);
+	}
+      else
+	{
+	  /* Compare the argument with 0.0 and set a vector mask.  */
+	  xmsk = gen_reg_rtx (mmsk);
+	  switch (mode)
+	  {
+	    case V2SFmode:
+	      emit_insn (gen_aarch64_cmeqv2sf (xmsk, src, CONST0_RTX (mode)));
+	      break;
+
+	    case V4SFmode:
+	      emit_insn (gen_aarch64_cmeqv4sf (xmsk, src, CONST0_RTX (mode)));
+	      break;
+
+	    case V2DFmode:
+	      emit_insn (gen_aarch64_cmeqv2df (xmsk, src, CONST0_RTX (mode)));
+	      break;
 
-  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+	    default:
+	      gcc_unreachable ();
+	  }
+	}
+    }
 
-  bool double_mode = (mode == DFmode || mode == V2DFmode);
+  /* Estimate the approximate reciprocal square root.  */
+  rtx xdst = gen_reg_rtx (mode);
+  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
 
-  int iterations = double_mode ? 3 : 2;
+  /* Iterate over the series twice for SF and thrice for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
 
-  /* Optionally iterate over the series one less time than otherwise.  */
-  if (flag_mrecip_low_precision_sqrt)
+  /* Optionally iterate over the series once less for faster performance
+     while sacrificing the accuracy.  */
+  if ((recp && flag_mrecip_low_precision_sqrt)
+      || (!recp && flag_mlow_precision_sqrt))
     iterations--;
 
-  for (int i = 0; i < iterations; ++i)
+  /* Iterate over the series to calculate the approximate reciprocal square root.  */
+  while (iterations--)
     {
-      rtx x1 = gen_reg_rtx (mode);
       rtx x2 = gen_reg_rtx (mode);
-      rtx x3 = gen_reg_rtx (mode);
-      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
 
-      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+      rtx x1 = gen_reg_rtx (mode);
+      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
 
-      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
-      x0 = x1;
+      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
     }
 
-  emit_move_insn (dst, x0);
+  if (!recp)
+    {
+      /* Qualify the approximate reciprocal square root when the argument is 0.0.  */
+      if (scalar)
+	/* Conditionally set the result to 0.0.  */
+	emit_set_insn (xdst, gen_rtx_IF_THEN_ELSE (mode, xne, xdst, src));
+      else
+	{
+	  /* Mask off any resulting vector element to 0.0.  */
+	  rtx xtmp = gen_reg_rtx (mmsk);
+	  emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
+					    gen_rtx_SUBREG (mmsk, xdst, 0)));
+	  emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
+	}
+
+      /* Calculate the approximate square root.  */
+      emit_set_insn (dst, gen_rtx_MULT (mode, xdst, src));
+    }
+  else
+    /* Return the calculated approximate reciprocal square root.  */
+    emit_move_insn (dst, xdst);
+
+  return true;
 }
 
 /* Return the number of instructions that can be issued per cycle.  */
@@ -8144,6 +8215,12 @@  aarch64_override_options_after_change_1 (struct gcc_options *opts)
       && (aarch64_cmodel == AARCH64_CMODEL_TINY
 	  || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
     aarch64_nopcrelative_literal_loads = false;
+
+  /* When enabling the lower precision Newton series for the square root, also
+     enable it for the reciprocal square root, since the later is an
+     intermediary step for the latter.  */
+  if (flag_mlow_precision_sqrt)
+    flag_mrecip_low_precision_sqrt = true;
 }
 
 /* 'Unpack' up the internal tuning structs and update the options
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 68676c9..43fa318 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4665,7 +4665,16 @@ 
   [(set_attr "type" "ffarith<s>")]
 )
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:GPF 0 "register_operand")
+        (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
+  "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+    DONE;
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:GPF 0 "register_operand" "=w")
         (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
   "TARGET_FLOAT"
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index c637ff4..ffd5540 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -151,5 +151,10 @@  PC relative literal loads.
 
 mlow-precision-recip-sqrt
 Common Var(flag_mrecip_low_precision_sqrt) Optimization
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
+When calculating the approximate reciprocal square root,
+use one less step than otherwise, thus reducing latency and precision.
+
+mlow-precision-sqrt
+Common Var(flag_mlow_precision_sqrt) Optimization
+When calculating the approximate square root,
+use one less step than otherwise, thus reducing latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 99ac11b..433a9f2 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -573,6 +573,7 @@  Objective-C and Objective-C++ Dialects}.
 -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
 -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
+-mlow-precision-sqrt -mno-low-precision-sqrt@gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12908,6 +12909,15 @@  uses one less step than otherwise, thus reducing latency and precision.
 This is only relevant if @option{-ffast-math} enables the reciprocal square root
 approximation, which in turn depends on the target processor.
 
+@item -mlow-precision-sqrt
+@item -mno-low-precision-sqrt
+@opindex -mlow-precision-sqrt
+@opindex -mno-low-precision-sqrt
+When calculating the square root approximation,
+uses one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables the square root
+approximation, which in turn depends on the target processor.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture and, optionally, one or
-- 
1.9.1