@@ -1,3 +1,23 @@
+2015-10-19 Benedikt Huber <benedikt.huber@theobroma-systems.com>
+ Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
+
+ * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
+ * config/aarch64/aarch64-protos.h: Declare.
+ * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
+ frsqrts.
+ * config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
+ * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
+ applicable.
+ * config/aarch64/aarch64.md: Added enum entries.
+ * config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
+ * testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for
+ assembly checks.
+ * testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure
+ frsqrts and frsqrte are not emitted.
+ * testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and
+ frsqrte are emitted.
+ * testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.
+
2015-10-16 Trevor Saunders <tbsaunde+gcc@tbsaunde.org>
* lra-constraints.c (add_next_usage_insn): Change argument type
@@ -324,6 +324,11 @@ enum aarch64_builtins
AARCH64_BUILTIN_GET_FPSR,
AARCH64_BUILTIN_SET_FPSR,
+ AARCH64_BUILTIN_RSQRT_DF,
+ AARCH64_BUILTIN_RSQRT_SF,
+ AARCH64_BUILTIN_RSQRT_V2DF,
+ AARCH64_BUILTIN_RSQRT_V2SF,
+ AARCH64_BUILTIN_RSQRT_V4SF,
AARCH64_SIMD_BUILTIN_BASE,
AARCH64_SIMD_BUILTIN_LANE_CHECK,
#include "aarch64-simd-builtins.def"
@@ -822,6 +827,46 @@ aarch64_init_crc32_builtins ()
}
}
+/* Register the scalar and vector reciprocal square root builtins and
+ record each decl in aarch64_builtin_decls under its function code. */
+static void
+aarch64_init_builtin_rsqrt (void)
+{
+ tree fndecl = NULL;
+ tree ftype = NULL;
+
+ tree V2SF_type_node = build_vector_type (float_type_node, 2);
+ tree V2DF_type_node = build_vector_type (double_type_node, 2);
+ tree V4SF_type_node = build_vector_type (float_type_node, 4);
+
+ struct builtin_decls_data
+ {
+ tree type_node;
+ const char *builtin_name;
+ int function_code;
+ };
+
+ builtin_decls_data bdda[] =
+ {
+ { double_type_node, "__builtin_aarch64_rsqrt_df", AARCH64_BUILTIN_RSQRT_DF },
+ { float_type_node, "__builtin_aarch64_rsqrt_sf", AARCH64_BUILTIN_RSQRT_SF },
+ { V2DF_type_node, "__builtin_aarch64_rsqrt_v2df", AARCH64_BUILTIN_RSQRT_V2DF },
+ { V2SF_type_node, "__builtin_aarch64_rsqrt_v2sf", AARCH64_BUILTIN_RSQRT_V2SF },
+ { V4SF_type_node, "__builtin_aarch64_rsqrt_v4sf", AARCH64_BUILTIN_RSQRT_V4SF }
+ };
+
+ builtin_decls_data *bdd = bdda;
+ builtin_decls_data *bdd_end = bdd + (sizeof (bdda) / sizeof (builtin_decls_data));
+ /* Every builtin takes one argument of its own result type. */
+ for (; bdd < bdd_end; bdd++)
+ {
+ ftype = build_function_type_list (bdd->type_node, bdd->type_node, NULL_TREE);
+ fndecl = add_builtin_function (bdd->builtin_name,
+ ftype, bdd->function_code, BUILT_IN_MD, NULL, NULL_TREE);
+ aarch64_builtin_decls[bdd->function_code] = fndecl;
+ }
+}
+
void
aarch64_init_builtins (void)
{
@@ -853,6 +898,7 @@ aarch64_init_builtins (void)
aarch64_init_simd_builtins ();
aarch64_init_crc32_builtins ();
+ aarch64_init_builtin_rsqrt ();
}
tree
@@ -1116,6 +1162,44 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
return target;
}
+/* Expand reciprocal square root builtin FCODE called by EXP into TARGET
+ (or into a new register if TARGET is null) and return that rtx. */
+static rtx
+aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
+{
+ tree arg0 = CALL_EXPR_ARG (exp, 0);
+ rtx op0 = expand_normal (arg0);
+
+ rtx (*gen) (rtx, rtx);
+
+ switch (fcode)
+ {
+ case AARCH64_BUILTIN_RSQRT_DF:
+ gen = gen_aarch64_rsqrt_df2;
+ break;
+ case AARCH64_BUILTIN_RSQRT_SF:
+ gen = gen_aarch64_rsqrt_sf2;
+ break;
+ case AARCH64_BUILTIN_RSQRT_V2DF:
+ gen = gen_aarch64_rsqrt_v2df2;
+ break;
+ case AARCH64_BUILTIN_RSQRT_V2SF:
+ gen = gen_aarch64_rsqrt_v2sf2;
+ break;
+ case AARCH64_BUILTIN_RSQRT_V4SF:
+ gen = gen_aarch64_rsqrt_v4sf2;
+ break;
+ default: gcc_unreachable ();
+ }
+ /* Without a suggested TARGET, use a new register in OP0's mode. */
+ if (!target)
+ target = gen_reg_rtx (GET_MODE (op0));
+
+ emit_insn (gen (target, op0));
+
+ return target;
+}
+
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient. */
rtx
@@ -1163,6 +1247,13 @@ aarch64_expand_builtin (tree exp,
else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
return aarch64_crc32_expand_builtin (fcode, exp, target);
+ if (fcode == AARCH64_BUILTIN_RSQRT_DF
+ || fcode == AARCH64_BUILTIN_RSQRT_SF
+ || fcode == AARCH64_BUILTIN_RSQRT_V2DF
+ || fcode == AARCH64_BUILTIN_RSQRT_V2SF
+ || fcode == AARCH64_BUILTIN_RSQRT_V4SF)
+ return aarch64_expand_builtin_rsqrt (fcode, exp, target);
+
gcc_unreachable ();
}
@@ -1320,6 +1411,30 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
return NULL_TREE;
}
+/* Return the reciprocal square root builtin decl matching sqrt builtin FN
+ (a machine-specific code if MD_FN), or NULL_TREE if there is none. */
+tree
+aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
+{
+ if (md_fn)
+ {
+ if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
+ return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
+ if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
+ return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
+ if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
+ return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
+ }
+ else
+ {
+ if (fn == BUILT_IN_SQRT)
+ return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
+ if (fn == BUILT_IN_SQRTF)
+ return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
+ }
+ return NULL_TREE;
+}
+
#undef VAR1
#define VAR1(T, N, MAP, A) \
case AARCH64_SIMD_BUILTIN_##T##_##N##A:
@@ -332,6 +332,8 @@ void aarch64_register_pragmas (void);
void aarch64_relayout_simd_types (void);
void aarch64_reset_previous_fndecl (void);
+void aarch64_emit_swrsqrt (rtx, rtx);
+
/* Initialize builtins for SIMD intrinsics. */
void init_aarch64_simd_builtins (void);
@@ -383,6 +385,8 @@ rtx aarch64_expand_builtin (tree exp,
int ignore ATTRIBUTE_UNUSED);
tree aarch64_builtin_decl (unsigned, bool ATTRIBUTE_UNUSED);
+tree aarch64_builtin_rsqrt (unsigned int, bool);
+
tree
aarch64_builtin_vectorized_function (tree fndecl,
tree type_out,
@@ -354,6 +354,33 @@
[(set_attr "type" "neon_fp_mul_d_scalar_q")]
)
+(define_insn "aarch64_rsqrte_<mode>2"
+ [(set (match_operand:VALLF 0 "register_operand" "=w")
+ (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+ UNSPEC_RSQRTE))]
+ "TARGET_SIMD"
+ "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+ [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+
+(define_insn "aarch64_rsqrts_<mode>3"
+ [(set (match_operand:VALLF 0 "register_operand" "=w")
+ (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+ (match_operand:VALLF 2 "register_operand" "w")]
+ UNSPEC_RSQRTS))]
+ "TARGET_SIMD"
+ "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+ [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
+
+(define_expand "aarch64_rsqrt_<mode>2"
+ [(set (match_operand:VALLF 0 "register_operand" "=w")
+ (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+ UNSPEC_RSQRT))]
+ "TARGET_SIMD"
+{
+ aarch64_emit_swrsqrt (operands[0], operands[1]);
+ DONE;
+})
+
(define_insn "*aarch64_mul3_elt_to_64v2df"
[(set (match_operand:DF 0 "register_operand" "=w")
(mult:DF
@@ -29,4 +29,5 @@
AARCH64_TUNE_ to give an enum name. */
AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
+AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
@@ -392,7 +392,8 @@ static const struct tune_params cortexa57_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
- (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
+ | AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
};
static const struct tune_params cortexa72_tunings =
@@ -456,7 +457,7 @@ static const struct tune_params xgene1_tunings =
1, /* vec_reassoc_width. */
2, /* min_div_recip_mul_sf. */
2, /* min_div_recip_mul_df. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
};
/* Support for fine-grained override of the tuning structures. */
@@ -6989,6 +6990,105 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
return aarch64_tune_params.memmov_cost;
}
+/* Implement TARGET_BUILTIN_RECIPROCAL. Return the reciprocal square root
+ builtin for FN, or NULL_TREE unless non-trapping unsafe math is enabled,
+ we are not optimizing for size and the tuning sets RECIP_SQRT. */
+static tree
+aarch64_builtin_reciprocal (unsigned int fn,
+ bool md_fn,
+ bool)
+{
+ if (flag_trapping_math
+ || !flag_unsafe_math_optimizations
+ || optimize_size
+ || ! (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_RECIP_SQRT))
+ {
+ return NULL_TREE;
+ }
+
+ return aarch64_builtin_rsqrt (fn, md_fn);
+}
+
+typedef rtx (*rsqrte_type) (rtx, rtx);
+
+/* Return the generator for the reciprocal square root initial estimate
+ insn (rsqrte pattern) of machine mode MODE. Only DF, SF, V2DF, V2SF and
+ V4SF are handled; any other mode is unreachable. */
+static rsqrte_type
+get_rsqrte_type (machine_mode mode)
+{
+ switch (mode)
+ {
+ case DFmode: return gen_aarch64_rsqrte_df2;
+ case SFmode: return gen_aarch64_rsqrte_sf2;
+ case V2DFmode: return gen_aarch64_rsqrte_v2df2;
+ case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
+ case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
+ default: gcc_unreachable ();
+ }
+}
+
+typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
+
+/* Return the generator for the reciprocal square root Newton-Raphson
+ step insn (rsqrts pattern) of machine mode MODE. Only DF, SF, V2DF,
+ V2SF and V4SF are handled; any other mode is unreachable. */
+static rsqrts_type
+get_rsqrts_type (machine_mode mode)
+{
+ switch (mode)
+ {
+ case DFmode: return gen_aarch64_rsqrts_df3;
+ case SFmode: return gen_aarch64_rsqrts_sf3;
+ case V2DFmode: return gen_aarch64_rsqrts_v2df3;
+ case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
+ case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
+ default: gcc_unreachable ();
+ }
+}
+
+/* Emit an instruction sequence computing the reciprocal square root of SRC
+ into DST: an initial estimate followed by Newton-Raphson steps, two for
+ single precision and three for double precision, or one fewer when
+ flag_mrecip_low_precision_sqrt is set. */
+void
+aarch64_emit_swrsqrt (rtx dst, rtx src)
+{
+ machine_mode mode = GET_MODE (src);
+ gcc_assert (
+ mode == SFmode || mode == V2SFmode || mode == V4SFmode
+ || mode == DFmode || mode == V2DFmode);
+
+ rtx xsrc = gen_reg_rtx (mode);
+ emit_move_insn (xsrc, src);
+ rtx x0 = gen_reg_rtx (mode);
+ /* Initial estimate from the rsqrte pattern for this mode. */
+ emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+
+ bool double_mode = (mode == DFmode || mode == V2DFmode);
+
+ int iterations = double_mode ? 3 : 2;
+
+ if (flag_mrecip_low_precision_sqrt)
+ iterations--;
+ /* Each pass refines x0: x0 = x0 * rsqrts (xsrc, x0 * x0). */
+ for (int i = 0; i < iterations; ++i)
+ {
+ rtx x1 = gen_reg_rtx (mode);
+ rtx x2 = gen_reg_rtx (mode);
+ rtx x3 = gen_reg_rtx (mode);
+ emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+
+ emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+
+ emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
+ x0 = x1;
+ }
+
+ emit_move_insn (dst, x0);
+}
+
/* Return the number of instructions that can be issued per cycle. */
static int
aarch64_sched_issue_rate (void)
@@ -13327,6 +13427,9 @@ aarch64_promoted_type (const_tree t)
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
@@ -126,6 +126,9 @@
UNSPEC_VSTRUCTDUMMY
UNSPEC_SP_SET
UNSPEC_SP_TEST
+ UNSPEC_RSQRT
+ UNSPEC_RSQRTE
+ UNSPEC_RSQRTS
])
(define_c_enum "unspecv" [
@@ -148,3 +148,8 @@ Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
mpc-relative-literal-loads
Target Report Save Var(nopcrelative_literal_loads) Init(2) Save
PC relative literal loads.
+
+mlow-precision-recip-sqrt
+Common Var(flag_mrecip_low_precision_sqrt) Optimization
+When calculating a reciprocal square root approximation, run fewer steps.
+This reduces precision, but can result in faster computation.
@@ -520,6 +520,7 @@ Objective-C and Objective-C++ Dialects}.
-mtls-size=@var{size} @gol
-mfix-cortex-a53-835769 -mno-fix-cortex-a53-835769 @gol
-mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol
+-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt @gol
-march=@var{name} -mcpu=@var{name} -mtune=@var{name}}
@emph{Adapteva Epiphany Options}
@@ -12482,6 +12483,17 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
This erratum workaround is made at link time and this will only pass the
corresponding flag to the linker.
+@item -mlow-precision-recip-sqrt
+@item -mno-low-precision-recip-sqrt
+@opindex mlow-precision-recip-sqrt
+@opindex mno-low-precision-recip-sqrt
+The reciprocal square root estimate uses two steps instead of three for
+double-precision, and one step instead of two for single-precision,
+thus reducing both latency and precision.
+This is only relevant if @option{-ffast-math} activates
+reciprocal square root estimate instructions,
+which in turn depends on the target processor.
+
@item -march=@var{name}
@opindex march
Specify the name of the target architecture, optionally suffixed by one or
new file mode 100644
@@ -0,0 +1,111 @@
+/* Test for the recip_sqrt tuning
+ ensuring functionality and sufficient accuracy. */
+/* { dg-do run } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic -moverride=tune=recip_sqrt" } */
+
+#define PI 3.141592653589793
+#define SQRT2 1.4142135623730951
+
+#define PI_4 0.7853981633974483
+#define SQRT1_2 0.7071067811865475
+
+/* 2^25+1, float has 24 significand bits
+ according to Single-precision floating-point format. */
+#define TESTA8_FLT 33554433
+/* 2^54+1, double has 53 significand bits
+ according to Double-precision floating-point format. */
+#define TESTA8_DBL 18014398509481985
+
+#define EPSILON_double __DBL_EPSILON__
+#define EPSILON_float __FLT_EPSILON__
+#define ABS_double __builtin_fabs
+#define ABS_float __builtin_fabsf
+#define SQRT_double __builtin_sqrt
+#define SQRT_float __builtin_sqrtf
+#define ISNAN_double __builtin_isnan
+#define ISNAN_float __builtin_isnanf
+
+extern void abort (void);
+
+#define TESTTYPE(TYPE) \
+TYPE \
+rsqrt_##TYPE (TYPE a) \
+{ \
+ return 1.0/SQRT_##TYPE (a); \
+} \
+\
+int \
+equals_##TYPE (TYPE a, TYPE b) \
+{ \
+ return (a == b || \
+ (ISNAN_##TYPE (a) && ISNAN_##TYPE (b)) || \
+ (ABS_##TYPE (a - b) < EPSILON_##TYPE)); \
+} \
+\
+void \
+t_##TYPE (TYPE a, TYPE result) \
+{ \
+ TYPE r = rsqrt_##TYPE (a); \
+ if (!equals_##TYPE (r, result)) \
+ { \
+ abort (); \
+ } \
+} \
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+int
+main ()
+{
+ double nan = __builtin_nan ("");
+ double inf = __builtin_inf ();
+ float nanf = __builtin_nanf ("");
+ float inff = __builtin_inff ();
+
+ t_double (1.0/256, 0X1.00000000000000P+4);
+ t_double (1.0, 0X1.00000000000000P+0);
+ t_double (-1.0, nan);
+ t_double (11.0, 0X1.34BF63D1568260P-2);
+ t_double (0.0, inf);
+ t_double (inf, 0X0.00000000000000P+0);
+ t_double (nan, nan);
+ t_double (-nan, -nan);
+ t_double (__DBL_MAX__, 0X1.00000000000010P-512);
+ t_double (__DBL_MIN__, 0X1.00000000000000P+511);
+ t_double (PI, 0X1.20DD750429B6D0P-1);
+ t_double (PI_4, 0X1.20DD750429B6D0P+0);
+ t_double (SQRT2, 0X1.AE89F995AD3AE0P-1);
+ t_double (SQRT1_2, 0X1.306FE0A31B7150P+0);
+ t_double (-PI, nan);
+ t_double (-SQRT2, nan);
+ t_double (TESTA8_DBL, 0X1.00000000000000P-27);
+
+ t_float (1.0/256, 0X1.00000000000000P+4);
+ t_float (1.0, 0X1.00000000000000P+0);
+ t_float (-1.0, nanf);
+ t_float (11.0, 0X1.34BF6400000000P-2);
+ t_float (0.0, inff);
+ t_float (inff, 0X0.00000000000000P+0);
+ t_float (nanf, nanf);
+ t_float (-nanf, -nanf);
+ t_float (__FLT_MAX__, 0X1.00000200000000P-64);
+ t_float (__FLT_MIN__, 0X1.00000000000000P+63);
+ t_float (PI, 0X1.20DD7400000000P-1);
+ t_float (PI_4, 0X1.20DD7400000000P+0);
+ t_float (SQRT2, 0X1.AE89FA00000000P-1);
+ t_float (SQRT1_2, 0X1.306FE000000000P+0);
+ t_float (-PI, nanf);
+ t_float (-SQRT2, nanf);
+ t_float (TESTA8_FLT, 0X1.6A09E600000000P-13);
+
+/* With -ffast-math these return positive INF. */
+/* t_double (-0.0, -inf); */
+/* t_float (-0.0, -inff); */
+
+/* The reason here is that -ffast-math flushes to zero. */
+/* t_double (__DBL_MIN__/256, 0X1.00000000000000P+515); */
+/* t_float (__FLT_MIN__/256, 0X1.00000000000000P+67); */
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,25 @@
+/* Test for the recip_sqrt tuning
+ ensuring the correct instructions are generated. */
+/* { dg-do compile } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic -moverride=tune=recip_sqrt" } */
+
+#include "rsqrt_asm_check_common.h"
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" 3 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */
+
+
+/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */
new file mode 100644
@@ -0,0 +1,42 @@
+#define sqrt_float __builtin_sqrtf
+#define sqrt_double __builtin_sqrt
+
+#define TESTTYPE(TYPE) \
+typedef struct { \
+ TYPE a; \
+ TYPE b; \
+ TYPE c; \
+ TYPE d; \
+} s4_##TYPE; \
+\
+typedef struct { \
+ TYPE a; \
+ TYPE b; \
+} s2_##TYPE; \
+\
+s4_##TYPE \
+rsqrtv4_##TYPE (s4_##TYPE i) \
+{ \
+ s4_##TYPE o; \
+ o.a = 1.0 / sqrt_##TYPE (i.a); \
+ o.b = 1.0 / sqrt_##TYPE (i.b); \
+ o.c = 1.0 / sqrt_##TYPE (i.c); \
+ o.d = 1.0 / sqrt_##TYPE (i.d); \
+ return o; \
+} \
+\
+s2_##TYPE \
+rsqrtv2_##TYPE (s2_##TYPE i) \
+{ \
+ s2_##TYPE o; \
+ o.a = 1.0 / sqrt_##TYPE (i.a); \
+ o.b = 1.0 / sqrt_##TYPE (i.b); \
+ return o; \
+} \
+\
+TYPE \
+rsqrt_##TYPE (TYPE i) \
+{ \
+ return 1.0 / sqrt_##TYPE (i); \
+} \
+
new file mode 100644
@@ -0,0 +1,12 @@
+/* Test for the recip_sqrt tuning
+ ensuring the correct instructions are generated. */
+/* { dg-do compile } */
+/* { dg-options "-O3 --std=c99 --save-temps -fverbose-asm -funsafe-math-optimizations -fno-math-errno -mtune=generic -mcpu=generic" } */
+
+#include "rsqrt_asm_check_common.h"
+
+TESTTYPE (double)
+TESTTYPE (float)
+
+/* { dg-final { scan-assembler-times "frsqrte" 0 } } */
+/* { dg-final { scan-assembler-times "frsqrts" 0 } } */
* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf. * config/aarch64/aarch64-protos.h: Declare. * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and frsqrts. * config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt. * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when applicable. * config/aarch64/aarch64.md: Added enum entries. * config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt. * testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for assembly checks. * testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure frsqrts and frsqrte are not emitted. * testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and frsqrte are emitted. * testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt. Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com> --- gcc/ChangeLog | 20 ++++ gcc/config/aarch64/aarch64-builtins.c | 115 +++++++++++++++++++++ gcc/config/aarch64/aarch64-protos.h | 4 + gcc/config/aarch64/aarch64-simd.md | 27 +++++ gcc/config/aarch64/aarch64-tuning-flags.def | 1 + gcc/config/aarch64/aarch64.c | 107 ++++++++++++++++++- gcc/config/aarch64/aarch64.md | 3 + gcc/config/aarch64/aarch64.opt | 5 + gcc/doc/invoke.texi | 12 +++ gcc/testsuite/gcc.target/aarch64/rsqrt_1.c | 111 ++++++++++++++++++++ .../gcc.target/aarch64/rsqrt_asm_check_1.c | 25 +++++ .../gcc.target/aarch64/rsqrt_asm_check_common.h | 42 ++++++++ .../aarch64/rsqrt_asm_check_negative_1.c | 12 +++ 13 files changed, 482 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c