diff mbox

[2/3,AArch64] Emit square root using the Newton series

Message ID 5748D0D6.80902@samsung.com
State New
Headers show

Commit Message

Evandro Menezes May 27, 2016, 10:57 p.m. UTC
On 05/25/16 10:52, James Greenhalgh wrote:
> On Wed, Apr 27, 2016 at 04:15:45PM -0500, Evandro Menezes wrote:
>>     gcc/
>>          * config/aarch64/aarch64-protos.h
>>          (aarch64_emit_approx_rsqrt): Replace with new function
>>          "aarch64_emit_approx_sqrt".
>>          (tune_params): New member "approx_sqrt_modes".
>>          * config/aarch64/aarch64.c
>>          (generic_tunings): New member "approx_rsqrt_modes".
>>          (cortexa35_tunings): Likewise.
>>          (cortexa53_tunings): Likewise.
>>          (cortexa57_tunings): Likewise.
>>          (cortexa72_tunings): Likewise.
>>          (exynosm1_tunings): Likewise.
>>          (thunderx_tunings): Likewise.
>>          (xgene1_tunings): Likewise.
>>          (aarch64_emit_approx_rsqrt): Replace with new function
>>          "aarch64_emit_approx_sqrt".
>>          (aarch64_override_options_after_change_1): Handle new option.
>>          * config/aarch64/aarch64-simd.md
>>          (rsqrt<mode>2): Use new function instead.
>>          (sqrt<mode>2): New expansion and insn definitions.
>>          * config/aarch64/aarch64.md: Likewise.
>>          * config/aarch64/aarch64.opt
>>          (mlow-precision-sqrt): Add new option description.
>>          * doc/invoke.texi (mlow-precision-sqrt): Likewise.
>>
>>
>   
>> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
>> index 50f1d24..437f6af 100644
>> --- a/gcc/config/aarch64/aarch64-protos.h
>> +++ b/gcc/config/aarch64/aarch64-protos.h
>> @@ -244,6 +244,7 @@ struct tune_params
>>     } autoprefetcher_model;
>>   
>>     unsigned int extra_tuning_flags;
>> +  unsigned int approx_sqrt_modes;
>>     unsigned int approx_rsqrt_modes;
>>   };
> This should go in struct recommended in 1/3 too.

OK

>>   
>> @@ -396,7 +397,7 @@ void aarch64_register_pragmas (void);
>>   void aarch64_relayout_simd_types (void);
>>   void aarch64_reset_previous_fndecl (void);
>>   void aarch64_save_restore_target_globals (tree);
>> -void aarch64_emit_approx_rsqrt (rtx, rtx);
>> +bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
>>   
>>   /* Initialize builtins for SIMD intrinsics.  */
>>   void init_aarch64_simd_builtins (void);
>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>> index 68381bf..589871b 100644
>> --- a/gcc/config/aarch64/aarch64.c
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -38,6 +38,7 @@
>>   #include "recog.h"
>>   #include "diagnostic.h"
>>   #include "insn-attr.h"
>> +#include "insn-flags.h"
> Can you remember what you needed this include for? I couldn't spot it below
> and removing the include didn't seem to cause any trouble to the build.
> If it isn't needed, drop it.

I don't recall.  Dropped.

>>   #include "insn-modes.h"
>>   #include "alias.h"
>>   #include "fold-const.h"
>> @@ -7521,46 +7530,78 @@ get_rsqrts_type (machine_mode mode)
>>     }
>>   }
>>   
>> -/* Emit instruction sequence to compute the reciprocal square root using the
>> -   Newton-Raphson series.  Iterate over the series twice for SF
>> -   and thrice for DF.  */
>> +/* Emit instruction sequence to compute either the approximate square root
>> +   or its approximate reciprocal.  */
> As you are updating this function a new parameter and a return value, please
> comment it appropriately. Describing the purpose of the parameter and the
> meaning of the return value.

OK

>> -void
>> -aarch64_emit_approx_rsqrt (rtx dst, rtx src)
>> +bool
>> +aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
>>   {
>> -  machine_mode mode = GET_MODE (src);
>> -  gcc_assert (
>> -    mode == SFmode || mode == V2SFmode || mode == V4SFmode
>> -	|| mode == DFmode || mode == V2DFmode);
>> -
>> -  rtx xsrc = gen_reg_rtx (mode);
>> -  emit_move_insn (xsrc, src);
>> -  rtx x0 = gen_reg_rtx (mode);
>> +  machine_mode mode = GET_MODE (dst);
>> +  machine_mode mmsk = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
>> +				       GET_MODE_NUNITS (mode));
>> +
>> +  if (!flag_finite_math_only
>> +      || flag_trapping_math
>> +      || !flag_unsafe_math_optimizations
>> +      || optimize_function_for_size_p (cfun)
>> +      || !((recp && (flag_mrecip_low_precision_sqrt
>> +		     || (aarch64_tune_params.approx_rsqrt_modes
>> +			 & AARCH64_APPROX_MODE (mode))))
>> +	   || (!recp && (flag_mlow_precision_sqrt
>> +			 || (aarch64_tune_params.approx_sqrt_modes
>> +			     & AARCH64_APPROX_MODE (mode))))))
>> +    return false;
> Can you pull out these sub-clauses and assign them somewhere for clarity.
> These expressions are a bit too big to grok at a distance. I'd rather be
> reading:
>
> bool use_low_precision_rsqrt_p
>    = recp
>       && (flag_mrecip_low_precision_sqrt
> 	 || (aarch64_tune_params.approx_rsqrt_modes
> 	     & AARCH64_APPROX_MODE (mode)))
>     <...>
>
>    !((use_low_precision_sqrt_p)
>       || (use_low_precision_rsqrt_p))

OK

>> -  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
>> +  rtx xmsk = gen_reg_rtx (mmsk);
>> +  if (!recp)
>> +    /* When calculating the approximate square root, compare the argument with
>> +       0.0 and create a mask.  */
>> +    emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
>> +							  CONST0_RTX (mode)))));
>>   
>> -  bool double_mode = (mode == DFmode || mode == V2DFmode);
>> +  /* Estimate the approximate reciprocal square root.  */
>> +  rtx xdst = gen_reg_rtx (mode);
>> +  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
>>   
>> -  int iterations = double_mode ? 3 : 2;
>> +  /* Iterate over the series twice for SF and thrice for DF.  */
>> +  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
>>   
>> -  /* Optionally iterate over the series one less time than otherwise.  */
>> -  if (flag_mrecip_low_precision_sqrt)
>> +  /* Optionally iterate over the series once less for faster performance
>> +     while sacrificing the accuracy.  */
>> +  if ((recp && flag_mrecip_low_precision_sqrt)
>> +      || (!recp && flag_mlow_precision_sqrt))
>>       iterations--;
>>   
>> -  for (int i = 0; i < iterations; ++i)
>> +  /* Iterate over the series to calculate the approximate reciprocal square root.  */
>> +  rtx x1 = gen_reg_rtx (mode);
> Why do you generate a register for x1 outside the loop but x2 inside it?
> Is it just because you want x1 to survive this scope? It seems you could
> either reuse a single x2 register too, or repeatedly generate a new x1
> inside the loop too.

Yes, x1 is needed from here through the last step before the function 
returns.

>> +  while (iterations--)
>>       {
>> -      rtx x1 = gen_reg_rtx (mode);
>>         rtx x2 = gen_reg_rtx (mode);
>> -      rtx x3 = gen_reg_rtx (mode);
>> -      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
>> +      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
>> +
>> +      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
>>   
>> -      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
>> +      if (iterations > 0)
>> +	emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
>> +    }
>> +
>> +  if (!recp)
>> +    {
>> +      /* Qualify the approximate reciprocal square root when the argument is
>> +	 0.0 by squashing the intermediary result to 0.0.  */
>> +      rtx xtmp = gen_reg_rtx (mmsk);
>> +      emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
>> +					      gen_rtx_SUBREG (mmsk, xdst, 0)));
>> +      emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
>>   
>> -      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
>> -      x0 = x1;
>> +      /* Calculate the approximate square root.  */
>> +      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
>>       }
>>   
>> -  emit_move_insn (dst, x0);
>> +  /* Return the approximation.  */
>> +  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
>> +
>> +  return true;
>>   }
>>   
>>   /* Return the number of instructions that can be issued per cycle.  */
>> @@ -8090,6 +8131,12 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts)
>>         && (aarch64_cmodel == AARCH64_CMODEL_TINY
>>   	  || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
>>       aarch64_nopcrelative_literal_loads = false;
>> +
>> +  /* When enabling the lower precision Newton series for the square root, also
>> +     enable it for the reciprocal square root, since the later is an
>> +     intermediary step for the latter.  */
>> +  if (flag_mlow_precision_sqrt)
>> +    flag_mrecip_low_precision_sqrt = true;
>>   }
>>   
>>   /* 'Unpack' up the internal tuning structs and update the options
>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>> index 9b282f1..aab3e00 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -4683,7 +4683,16 @@
>>     [(set_attr "type" "ffarith<s>")]
>>   )
>>   
>> -(define_insn "sqrt<mode>2"
>> +(define_expand "sqrt<mode>2"
>> +  [(set (match_operand:GPF 0 "register_operand")
>> +        (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
>> +  "TARGET_SIMD"
>> +{
>> +  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
>> +    DONE;
>> +})
>> +
>> +(define_insn "*sqrt<mode>2"
>>     [(set (match_operand:GPF 0 "register_operand" "=w")
>>           (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
>>     "TARGET_FLOAT"
>> diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
>> index c637ff4..ffd5540 100644
>> --- a/gcc/config/aarch64/aarch64.opt
>> +++ b/gcc/config/aarch64/aarch64.opt
>> @@ -151,5 +151,10 @@ PC relative literal loads.
>>   
>>   mlow-precision-recip-sqrt
>>   Common Var(flag_mrecip_low_precision_sqrt) Optimization
>> -When calculating the reciprocal square root approximation,
>> -uses one less step than otherwise, thus reducing latency and precision.
>> +When calculating the approximate reciprocal square root,
>> +use one less step than otherwise, thus reducing latency and precision.
>> +
>> +mlow-precision-sqrt
>> +Common Var(flag_mlow_precision_sqrt) Optimization
>> +When calculating the approximate square root,
>> +use one less step than otherwise, thus reducing latency and precision.
> Do we really want separate options for each of these optimizations? Could
> they all be wrapped under one -mlow-precision-fp or similar? I can see
> benefits to either approach, but let me know your thoughts.

Since the results of these approximations are not accurate, I prefer to 
let the user decide which of them may be relaxed.  Again, I lean towards 
the way done for RS6000.

Thank you,

Comments

James Greenhalgh June 1, 2016, 9 a.m. UTC | #1
On Fri, May 27, 2016 at 05:57:26PM -0500, Evandro Menezes wrote:
> 2016-04-04  Evandro Menezes  <e.menezes@samsung.com>
>             Wilco Dijkstra  <wilco.dijkstra@arm.com>
> 
> gcc/
> 	* config/aarch64/aarch64-protos.h
> 	(aarch64_emit_approx_rsqrt): Replace with new function
> 	"aarch64_emit_approx_sqrt".
> 	(cpu_approx_modes): New member "sqrt".
> 	* config/aarch64/aarch64.c
> 	(generic_approx_modes): New member "sqrt".
> 	(exynosm1_approx_modes): Likewise.
> 	(xgene1_approx_modes): Likewise.
> 	(aarch64_emit_approx_rsqrt): Replace with new function
> 	"aarch64_emit_approx_sqrt".
> 	(aarch64_override_options_after_change_1): Handle new option.
> 	* config/aarch64/aarch64-simd.md
> 	(rsqrt<mode>2): Use new function instead.
> 	(sqrt<mode>2): New expansion and insn definitions.
> 	* config/aarch64/aarch64.md: Likewise.
> 	* config/aarch64/aarch64.opt
> 	(mlow-precision-sqrt): Add new option description.
> 	* doc/invoke.texi (mlow-precision-sqrt): Likewise.

> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 6156281..2f407fd 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -192,7 +192,8 @@ struct cpu_branch_cost
>  /* Allowed modes for approximations.  */
>  struct cpu_approx_modes
>  {
> -  const unsigned int recip_sqrt; /* Reciprocal square root.  */
> +  const unsigned int sqrt;		/* Square root.  */
> +  const unsigned int recip_sqrt;	/* Reciprocal square root.  */
>  };
>  
>  struct tune_params
> @@ -388,7 +389,7 @@ void aarch64_register_pragmas (void);
>  void aarch64_relayout_simd_types (void);
>  void aarch64_reset_previous_fndecl (void);
>  void aarch64_save_restore_target_globals (tree);
> -void aarch64_emit_approx_rsqrt (rtx, rtx);
> +bool aarch64_emit_approx_sqrt (rtx, rtx, bool);

There's a goal to try to keep these in alphabetical order (first by return
type, then by name). This should go up by the other "bool" return types.

> +/* Emit instruction sequence to compute either the approximate square root
> +   or its approximate reciprocal, depending on the flag RECP, and return
> +   whether the sequence was emitted or not.  */
>  
> -void
> -aarch64_emit_approx_rsqrt (rtx dst, rtx src)
> +bool
> +aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
>  {
> -  machine_mode mode = GET_MODE (src);
> -  gcc_assert (
> -    mode == SFmode || mode == V2SFmode || mode == V4SFmode
> -	|| mode == DFmode || mode == V2DFmode);
> -
> -  rtx xsrc = gen_reg_rtx (mode);
> -  emit_move_insn (xsrc, src);
> -  rtx x0 = gen_reg_rtx (mode);
> +  machine_mode mode = GET_MODE (dst);
> +  machine_mode mmsk = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
> +				       GET_MODE_NUNITS (mode));

Long line. You can run your patch through contrib/check-gnu-style.sh to find
these and other GNU style issues.

> +  bool use_approx_sqrt_p = (!recp
> +			    && (flag_mlow_precision_sqrt
> +			        || (aarch64_tune_params.approx_modes->sqrt
> +				    & AARCH64_APPROX_MODE (mode))));
> +  bool use_approx_rsqrt_p = (recp
> +			     && (flag_mrecip_low_precision_sqrt
> +				 || (aarch64_tune_params.approx_modes->recip_sqrt

Long line.

> +				     & AARCH64_APPROX_MODE (mode))));
> +
> +  if (!flag_finite_math_only
> +      || flag_trapping_math
> +      || !flag_unsafe_math_optimizations



> +      || optimize_function_for_size_p (cfun)
> +      || !(use_approx_sqrt_p || use_approx_rsqrt_p))

Swap these two cases to avoid the slightly more expensive call if we fail
the cheap flags check.

> +    return false;
>  
> -  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
> +  rtx xmsk = gen_reg_rtx (mmsk);
> +  if (!recp)
> +    /* When calculating the approximate square root, compare the argument with
> +       0.0 and create a mask.  */
> +    emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
> +							  CONST0_RTX (mode)))));

I guess you've done it this way rather than calling gen_aarch64_cmeq<mode>
directly to avoid having a switch on mode? I wonder whether it is worth just
writing that helper function to make it explicit what instruction we want
to match?

>  
> -  bool double_mode = (mode == DFmode || mode == V2DFmode);
> +  /* Estimate the approximate reciprocal square root.  */
> +  rtx xdst = gen_reg_rtx (mode);
> +  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
>  
> -  int iterations = double_mode ? 3 : 2;
> +  /* Iterate over the series twice for SF and thrice for DF.  */
> +  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
>  
> -  /* Optionally iterate over the series one less time than otherwise.  */
> -  if (flag_mrecip_low_precision_sqrt)
> +  /* Optionally iterate over the series once less for faster performance
> +     while sacrificing the accuracy.  */
> +  if ((recp && flag_mrecip_low_precision_sqrt)
> +      || (!recp && flag_mlow_precision_sqrt))
>      iterations--;
>  
> -  for (int i = 0; i < iterations; ++i)
> +  /* Iterate over the series to calculate the approximate reciprocal square root.  */

Long line.

> +  rtx x1 = gen_reg_rtx (mode);
> +  while (iterations--)
>      {
> -      rtx x1 = gen_reg_rtx (mode);
>        rtx x2 = gen_reg_rtx (mode);
> -      rtx x3 = gen_reg_rtx (mode);
> -      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
> +      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
>  
> -      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
> +      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
>  
> -      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
> -      x0 = x1;
> +      if (iterations > 0)
> +	emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
>      }
>  
> -  emit_move_insn (dst, x0);
> +  if (!recp)
> +    {
> +      /* Qualify the approximate reciprocal square root when the argument is
> +	 0.0 by squashing the intermediary result to 0.0.  */
> +      rtx xtmp = gen_reg_rtx (mmsk);
> +      emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
> +					      gen_rtx_SUBREG (mmsk, xdst, 0)));
> +      emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
> +
> +      /* Calculate the approximate square root.  */
> +      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
> +    }
> +
> +  /* Finalize the approximation.  */
> +  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
> +
> +  return true;
>  }
>  
>  /* Return the number of instructions that can be issued per cycle.  */
> @@ -8108,6 +8146,12 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts)
>        && (aarch64_cmodel == AARCH64_CMODEL_TINY
>  	  || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
>      aarch64_nopcrelative_literal_loads = false;
> +
> +  /* When enabling the lower precision Newton series for the square root, also
> +     enable it for the reciprocal square root, since the later is an
> +     intermediary step for the latter.  */

This doesn't make sense to me. I think you mean "since the former is an
intermediary step ...".

> +  if (flag_mlow_precision_sqrt)
> +    flag_mrecip_low_precision_sqrt = true;
>  }
>  
>  /* 'Unpack' up the internal tuning structs and update the options
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 9b282f1..aab3e00 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -4683,7 +4683,16 @@
>    [(set_attr "type" "ffarith<s>")]
>  )
>  
> -(define_insn "sqrt<mode>2"
> +(define_expand "sqrt<mode>2"
> +  [(set (match_operand:GPF 0 "register_operand")
> +        (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
> +  "TARGET_SIMD"

This should be TARGET_FLOAT.

> +{
> +  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
> +    DONE;
> +})
> +
> +(define_insn "*sqrt<mode>2"
>    [(set (match_operand:GPF 0 "register_operand" "=w")
>          (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
>    "TARGET_FLOAT"
> diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
> index c637ff4..ffd5540 100644
> --- a/gcc/config/aarch64/aarch64.opt
> +++ b/gcc/config/aarch64/aarch64.opt
> @@ -151,5 +151,10 @@ PC relative literal loads.
>  
>  mlow-precision-recip-sqrt
>  Common Var(flag_mrecip_low_precision_sqrt) Optimization
> -When calculating the reciprocal square root approximation,
> -uses one less step than otherwise, thus reducing latency and precision.
> +When calculating the approximate reciprocal square root,
> +use one less step than otherwise, thus reducing latency and precision.
> +
> +mlow-precision-sqrt
> +Common Var(flag_mlow_precision_sqrt) Optimization
> +When calculating the approximate square root,
> +use one less step than otherwise, thus reducing latency and precision.
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 4340b08..76b7a5c 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -574,6 +574,7 @@ Objective-C and Objective-C++ Dialects}.
>  -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
>  -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
>  -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
> +-mlow-precision-sqrt -mno-low-precision-sqrt@gol
>  -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
>  
>  @emph{Adapteva Epiphany Options}
> @@ -12941,6 +12942,15 @@ uses one less step than otherwise, thus reducing latency and precision.
>  This is only relevant if @option{-ffast-math} enables the reciprocal square root
>  approximation.
>  
> +@item -mlow-precision-sqrt
> +@item -mno-low-precision-sqrt
> +@opindex -mlow-precision-sqrt
> +@opindex -mno-low-precision-sqrt
> +When calculating the square root approximation,
> +uses one less step than otherwise, thus reducing latency and precision.

Looks like you've reintroduced the "uses" typo you fixed in the aarch64.opt
hunk above. Regardless, can you look at Wilco's wording for
-mlow-precision-recip-sqrt on trunk and decide whether that would be more
appropriate?

> +This is only relevant if @option{-ffast-math} enables the square root
> +approximation.
> +
>  @item -march=@var{name}
>  @opindex march
>  Specify the name of the target architecture and, optionally, one or

Thanks,
James
diff mbox

Patch

From 42e1a3d1289ba3951d3b9a7fd523422b6c0e56c8 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.menezes@samsung.com>
Date: Mon, 4 Apr 2016 11:23:29 -0500
Subject: [PATCH 2/3] [AArch64] Emit square root using the Newton series

2016-04-04  Evandro Menezes  <e.menezes@samsung.com>
            Wilco Dijkstra  <wilco.dijkstra@arm.com>

gcc/
	* config/aarch64/aarch64-protos.h
	(aarch64_emit_approx_rsqrt): Replace with new function
	"aarch64_emit_approx_sqrt".
	(cpu_approx_modes): New member "sqrt".
	* config/aarch64/aarch64.c
	(generic_approx_modes): New member "sqrt".
	(exynosm1_approx_modes): Likewise.
	(xgene1_approx_modes): Likewise.
	(aarch64_emit_approx_rsqrt): Replace with new function
	"aarch64_emit_approx_sqrt".
	(aarch64_override_options_after_change_1): Handle new option.
	* config/aarch64/aarch64-simd.md
	(rsqrt<mode>2): Use new function instead.
	(sqrt<mode>2): New expansion and insn definitions.
	* config/aarch64/aarch64.md: Likewise.
	* config/aarch64/aarch64.opt
	(mlow-precision-sqrt): Add new option description.
	* doc/invoke.texi (mlow-precision-sqrt): Likewise.
---
 gcc/config/aarch64/aarch64-protos.h |   5 +-
 gcc/config/aarch64/aarch64-simd.md  |  13 ++++-
 gcc/config/aarch64/aarch64.c        | 108 +++++++++++++++++++++++++-----------
 gcc/config/aarch64/aarch64.md       |  11 +++-
 gcc/config/aarch64/aarch64.opt      |   9 ++-
 gcc/doc/invoke.texi                 |  10 ++++
 6 files changed, 117 insertions(+), 39 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 6156281..2f407fd 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -192,7 +192,8 @@  struct cpu_branch_cost
 /* Allowed modes for approximations.  */
 struct cpu_approx_modes
 {
-  const unsigned int recip_sqrt; /* Reciprocal square root.  */
+  const unsigned int sqrt;		/* Square root.  */
+  const unsigned int recip_sqrt;	/* Reciprocal square root.  */
 };
 
 struct tune_params
@@ -388,7 +389,7 @@  void aarch64_register_pragmas (void);
 void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 void aarch64_save_restore_target_globals (tree);
-void aarch64_emit_approx_rsqrt (rtx, rtx);
+bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce..47ccb18 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -405,7 +405,7 @@ 
 		     UNSPEC_RSQRT))]
   "TARGET_SIMD"
 {
-  aarch64_emit_approx_rsqrt (operands[0], operands[1]);
+  aarch64_emit_approx_sqrt (operands[0], operands[1], true);
   DONE;
 })
 
@@ -4307,7 +4307,16 @@ 
 
 ;; sqrt
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VDQF 0 "register_operand")
+	(sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+  "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+    DONE;
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:VDQF 0 "register_operand" "=w")
         (sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
   "TARGET_SIMD"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e532cfc..d55ca34 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -397,18 +397,21 @@  static const struct cpu_branch_cost cortexa57_branch_cost =
 /* Generic approximation modes.  */
 static const cpu_approx_modes generic_approx_modes =
 {
+  AARCH64_APPROX_NONE,	/* sqrt  */
   AARCH64_APPROX_NONE	/* recip_sqrt  */
 };
 
 /* Approximation modes for Exynos M1.  */
 static const cpu_approx_modes exynosm1_approx_modes =
 {
+  AARCH64_APPROX_ALL,	/* sqrt  */
   AARCH64_APPROX_ALL	/* recip_sqrt  */
 };
 
 /* Approximation modes for Xgene1.  */
 static const cpu_approx_modes xgene1_approx_modes =
 {
+  AARCH64_APPROX_NONE,	/* sqrt  */
   AARCH64_APPROX_ALL	/* recip_sqrt  */
 };
 
@@ -7503,8 +7506,8 @@  aarch64_builtin_reciprocal (tree fndecl)
 
 typedef rtx (*rsqrte_type) (rtx, rtx);
 
-/* Select reciprocal square root initial estimate
-   insn depending on machine mode.  */
+/* Select reciprocal square root initial estimate insn depending on machine
+   mode.  */
 
 rsqrte_type
 get_rsqrte_type (machine_mode mode)
@@ -7516,14 +7519,13 @@  get_rsqrte_type (machine_mode mode)
     case V2DFmode: return gen_aarch64_rsqrte_v2df2;
     case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
     case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
-    default: gcc_unreachable ();
+    default:       gcc_unreachable ();
   }
 }
 
 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
 
-/* Select reciprocal square root Newton-Raphson step
-   insn depending on machine mode.  */
+/* Select reciprocal square root series step insn depending on machine mode.  */
 
 rsqrts_type
 get_rsqrts_type (machine_mode mode)
@@ -7535,50 +7537,86 @@  get_rsqrts_type (machine_mode mode)
     case V2DFmode: return gen_aarch64_rsqrts_v2df3;
     case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
     case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
-    default: gcc_unreachable ();
+    default:       gcc_unreachable ();
   }
 }
 
-/* Emit instruction sequence to compute the reciprocal square root using the
-   Newton-Raphson series.  Iterate over the series twice for SF
-   and thrice for DF.  */
+/* Emit instruction sequence to compute either the approximate square root
+   or its approximate reciprocal, depending on the flag RECP, and return
+   whether the sequence was emitted or not.  */
 
-void
-aarch64_emit_approx_rsqrt (rtx dst, rtx src)
+bool
+aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
 {
-  machine_mode mode = GET_MODE (src);
-  gcc_assert (
-    mode == SFmode || mode == V2SFmode || mode == V4SFmode
-	|| mode == DFmode || mode == V2DFmode);
-
-  rtx xsrc = gen_reg_rtx (mode);
-  emit_move_insn (xsrc, src);
-  rtx x0 = gen_reg_rtx (mode);
+  machine_mode mode = GET_MODE (dst);
+  machine_mode mmsk = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
+				       GET_MODE_NUNITS (mode));
+  bool use_approx_sqrt_p = (!recp
+			    && (flag_mlow_precision_sqrt
+			        || (aarch64_tune_params.approx_modes->sqrt
+				    & AARCH64_APPROX_MODE (mode))));
+  bool use_approx_rsqrt_p = (recp
+			     && (flag_mrecip_low_precision_sqrt
+				 || (aarch64_tune_params.approx_modes->recip_sqrt
+				     & AARCH64_APPROX_MODE (mode))));
+
+  if (!flag_finite_math_only
+      || flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || !(use_approx_sqrt_p || use_approx_rsqrt_p)
+      || optimize_function_for_size_p (cfun))
+    return false;
 
-  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+  rtx xmsk = gen_reg_rtx (mmsk);
+  if (!recp)
+    /* When calculating the approximate square root, compare the argument with
+       0.0 and create a mask.  */
+    emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
+							  CONST0_RTX (mode)))));
 
-  bool double_mode = (mode == DFmode || mode == V2DFmode);
+  /* Estimate the approximate reciprocal square root.  */
+  rtx xdst = gen_reg_rtx (mode);
+  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
 
-  int iterations = double_mode ? 3 : 2;
+  /* Iterate over the series twice for SF and thrice for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
 
-  /* Optionally iterate over the series one less time than otherwise.  */
-  if (flag_mrecip_low_precision_sqrt)
+  /* Optionally iterate over the series once less for faster performance
+     while sacrificing the accuracy.  */
+  if ((recp && flag_mrecip_low_precision_sqrt)
+      || (!recp && flag_mlow_precision_sqrt))
     iterations--;
 
-  for (int i = 0; i < iterations; ++i)
+  /* Iterate over the series to calculate the approximate reciprocal square root.  */
+  rtx x1 = gen_reg_rtx (mode);
+  while (iterations--)
     {
-      rtx x1 = gen_reg_rtx (mode);
       rtx x2 = gen_reg_rtx (mode);
-      rtx x3 = gen_reg_rtx (mode);
-      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
 
-      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
 
-      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
-      x0 = x1;
+      if (iterations > 0)
+	emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
     }
 
-  emit_move_insn (dst, x0);
+  if (!recp)
+    {
+      /* Qualify the approximate reciprocal square root when the argument is
+	 0.0 by squashing the intermediary result to 0.0.  */
+      rtx xtmp = gen_reg_rtx (mmsk);
+      emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
+					      gen_rtx_SUBREG (mmsk, xdst, 0)));
+      emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
+
+      /* Calculate the approximate square root.  */
+      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
+    }
+
+  /* Finalize the approximation.  */
+  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
+
+  return true;
 }
 
 /* Return the number of instructions that can be issued per cycle.  */
@@ -8108,6 +8146,12 @@  aarch64_override_options_after_change_1 (struct gcc_options *opts)
       && (aarch64_cmodel == AARCH64_CMODEL_TINY
 	  || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
     aarch64_nopcrelative_literal_loads = false;
+
+  /* When enabling the lower precision Newton series for the square root, also
+     enable it for the reciprocal square root, since the latter is an
+     intermediary step for the former.  */
+  if (flag_mlow_precision_sqrt)
+    flag_mrecip_low_precision_sqrt = true;
 }
 
 /* 'Unpack' up the internal tuning structs and update the options
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 9b282f1..aab3e00 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4683,7 +4683,16 @@ 
   [(set_attr "type" "ffarith<s>")]
 )
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:GPF 0 "register_operand")
+        (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
+  "TARGET_FLOAT"
+{
+  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+    DONE;
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:GPF 0 "register_operand" "=w")
         (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
   "TARGET_FLOAT"
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index c637ff4..ffd5540 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -151,5 +151,10 @@  PC relative literal loads.
 
 mlow-precision-recip-sqrt
 Common Var(flag_mrecip_low_precision_sqrt) Optimization
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
+When calculating the approximate reciprocal square root,
+use one less step than otherwise, thus reducing latency and precision.
+
+mlow-precision-sqrt
+Common Var(flag_mlow_precision_sqrt) Optimization
+When calculating the approximate square root,
+use one less step than otherwise, thus reducing latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 4340b08..76b7a5c 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -574,6 +574,7 @@  Objective-C and Objective-C++ Dialects}.
 -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
 -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
+-mlow-precision-sqrt -mno-low-precision-sqrt@gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -12941,6 +12942,15 @@  uses one less step than otherwise, thus reducing latency and precision.
 This is only relevant if @option{-ffast-math} enables the reciprocal square root
 approximation.
 
+@item -mlow-precision-sqrt
+@item -mno-low-precision-sqrt
+@opindex -mlow-precision-sqrt
+@opindex -mno-low-precision-sqrt
+When calculating the approximate square root,
+use one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables the square root
+approximation.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture and, optionally, one or
-- 
2.6.3