===================================================================
@@ -4314,6 +4314,7 @@ extern tree build_vector_from_val (tree,
extern tree build_uniform_cst (tree, tree);
extern tree build_vec_series (tree, tree, tree);
extern tree build_index_vector (tree, poly_uint64, poly_uint64);
+extern tree build_vector_a_then_b (tree, unsigned int, tree, tree);
extern void recompute_constructor_flags (tree);
extern void verify_constructor_flags (tree);
extern tree build_constructor (tree, vec<constructor_elt, va_gc> * CXX_MEM_STAT_INFO);
===================================================================
@@ -1981,6 +1981,23 @@ build_index_vector (tree vec_type, poly_
return v.build ();
}
+/* Return a VECTOR_CST of type VEC_TYPE in which the first NUM_A
+   elements are A and the rest are B. */
+
+tree
+build_vector_a_then_b (tree vec_type, unsigned int num_a, tree a, tree b)
+{
+  gcc_assert (known_le (num_a, TYPE_VECTOR_SUBPARTS (vec_type)));
+  unsigned int count = constant_lower_bound (TYPE_VECTOR_SUBPARTS (vec_type));
+  /* Optimize the constant case: halve COUNT so the encoding is exact.  */
+  if ((count & 1) == 0 && TYPE_VECTOR_SUBPARTS (vec_type).is_constant ())
+    count /= 2;
+  tree_vector_builder builder (vec_type, count, 2);
+  for (unsigned int i = 0; i < count * 2; ++i)
+    builder.quick_push (i < num_a ? a : b);
+  return builder.build ();
+}
+
/* Something has messed with the elements of CONSTRUCTOR C after it was built;
calculate TREE_CONSTANT and TREE_SIDE_EFFECTS. */
===================================================================
@@ -691,6 +691,36 @@ fold_const_vec_convert (tree ret_type, t
/* Try to evaluate:
+ IFN_WHILE_ULT (ARG0, ARG1, (TYPE) { ... })
+
+ Return the value on success and null on failure. */
+
+static tree
+fold_while_ult (tree type, poly_uint64 arg0, poly_uint64 arg1)
+{
+ if (known_ge (arg0, arg1))
+ return build_zero_cst (type);
+
+ if (maybe_ge (arg0, arg1))
+ return NULL_TREE;
+
+ poly_uint64 diff = arg1 - arg0;
+ poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
+ if (known_ge (diff, nelts))
+ return build_all_ones_cst (type);
+
+ unsigned HOST_WIDE_INT const_diff;
+ if (known_le (diff, nelts) && diff.is_constant (&const_diff))
+ {
+ tree minus_one = build_minus_one_cst (TREE_TYPE (type));
+ tree zero = build_zero_cst (TREE_TYPE (type));
+ return build_vector_a_then_b (type, const_diff, minus_one, zero);
+ }
+ return NULL_TREE;
+}
+
+/* Try to evaluate:
+
*RESULT = FN (*ARG)
in format FORMAT. Return true on success. */
@@ -1782,6 +1812,14 @@ fold_const_call (combined_fn fn, tree ty
}
return NULL_TREE;
+ case CFN_WHILE_ULT:
+ {
+ poly_uint64 parg0, parg1;
+ if (poly_int_tree_p (arg0, &parg0) && poly_int_tree_p (arg1, &parg1))
+ return fold_while_ult (type, parg0, parg1);
+ return NULL_TREE;
+ }
+
default:
return fold_const_call_1 (fn, type, arg0, arg1, arg2);
}
===================================================================
@@ -406,6 +406,33 @@ enum aarch64_key_type {
extern struct tune_params aarch64_tune_params;
+/* The available SVE predicate patterns, known in the ACLE as "svpattern". */
+#define AARCH64_FOR_SVPATTERN(T) \
+ T (POW2, pow2, 0) \
+ T (VL1, vl1, 1) \
+ T (VL2, vl2, 2) \
+ T (VL3, vl3, 3) \
+ T (VL4, vl4, 4) \
+ T (VL5, vl5, 5) \
+ T (VL6, vl6, 6) \
+ T (VL7, vl7, 7) \
+ T (VL8, vl8, 8) \
+ T (VL16, vl16, 9) \
+ T (VL32, vl32, 10) \
+ T (VL64, vl64, 11) \
+ T (VL128, vl128, 12) \
+ T (VL256, vl256, 13) \
+ T (MUL4, mul4, 29) \
+ T (MUL3, mul3, 30) \
+ T (ALL, all, 31)
+
+#define AARCH64_SVENUM(UPPER, LOWER, VALUE) AARCH64_SV_##UPPER = VALUE,
+enum aarch64_svpattern {
+ AARCH64_FOR_SVPATTERN (AARCH64_SVENUM)
+ AARCH64_NUM_SVPATTERNS
+};
+#undef AARCH64_SVENUM
+
void aarch64_post_cfi_startproc (void);
poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned);
int aarch64_get_condition_code (rtx);
===================================================================
@@ -481,12 +481,18 @@ (define_expand "mov<mode>"
{
if (GET_CODE (operands[0]) == MEM)
operands[1] = force_reg (<MODE>mode, operands[1]);
+
+ if (CONSTANT_P (operands[1]))
+ {
+ aarch64_expand_mov_immediate (operands[0], operands[1]);
+ DONE;
+ }
}
)
(define_insn "*aarch64_sve_mov<mode>"
[(set (match_operand:PRED_ALL 0 "nonimmediate_operand" "=Upa, m, Upa, Upa")
- (match_operand:PRED_ALL 1 "general_operand" "Upa, Upa, m, Dn"))]
+ (match_operand:PRED_ALL 1 "aarch64_mov_operand" "Upa, Upa, m, Dn"))]
"TARGET_SVE
&& (register_operand (operands[0], <MODE>mode)
|| register_operand (operands[1], <MODE>mode))"
@@ -2923,7 +2929,7 @@ (define_insn "*pred_cmp<cmp_op><mode>"
;; Set element I of the result if operand1 + J < operand2 for all J in [0, I],
;; with the comparison being unsigned.
-(define_insn "while_ult<GPI:mode><PRED_ALL:mode>"
+(define_insn "@while_ult<GPI:mode><PRED_ALL:mode>"
[(set (match_operand:PRED_ALL 0 "register_operand" "=Upa")
(unspec:PRED_ALL [(match_operand:GPI 1 "aarch64_reg_or_zero" "rZ")
(match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")]
===================================================================
@@ -83,7 +83,7 @@ #define POINTER_BYTES (POINTER_SIZE / BI
/* Information about a legitimate vector immediate operand. */
struct simd_immediate_info
{
- enum insn_type { MOV, MVN, INDEX };
+ enum insn_type { MOV, MVN, INDEX, PTRUE };
enum modifier_type { LSL, MSL };
simd_immediate_info () {}
@@ -92,6 +92,7 @@ struct simd_immediate_info
insn_type = MOV, modifier_type = LSL,
unsigned int = 0);
simd_immediate_info (scalar_mode, rtx, rtx);
+ simd_immediate_info (scalar_int_mode, aarch64_svpattern);
/* The mode of the elements. */
scalar_mode elt_mode;
@@ -120,6 +121,9 @@ struct simd_immediate_info
subsequent element. */
rtx base, step;
} index;
+
+  /* For PTRUE: the svpattern that the instruction should assemble.  */
+ aarch64_svpattern pattern;
} u;
};
@@ -159,6 +163,16 @@ struct simd_immediate_info
u.index.step = step_in;
}
+/* Construct a predicate that controls elements of mode ELT_MODE_IN
+ and has PTRUE pattern PATTERN_IN. */
+inline simd_immediate_info
+::simd_immediate_info (scalar_int_mode elt_mode_in,
+ aarch64_svpattern pattern_in)
+ : elt_mode (elt_mode_in), insn (PTRUE)
+{
+ u.pattern = pattern_in;
+}
+
/* The current code model. */
enum aarch64_code_model aarch64_cmodel;
@@ -1334,6 +1348,22 @@ static const char *const aarch64_sve_con
"pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};
+/* Return the assembly token for svpattern value PATTERN. */
+
+static const char *
+svpattern_token (enum aarch64_svpattern pattern)
+{
+  switch (pattern)
+    {
+#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
+    AARCH64_FOR_SVPATTERN (CASE)
+#undef CASE
+    case AARCH64_NUM_SVPATTERNS:
+      break;
+    }
+  gcc_unreachable ();
+}
+
/* Generate code to enable conditional branches in functions over 1 MiB. */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
@@ -2529,6 +2559,146 @@ aarch64_force_temporary (machine_mode mo
}
}
+/* Return true if predicate value X is a constant in which every element
+ is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
+ value, i.e. as a predicate in which all bits are significant. */
+
+static bool
+aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
+{
+ if (GET_CODE (x) != CONST_VECTOR)
+ return false;
+
+ unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
+ GET_MODE_NUNITS (GET_MODE (x)));
+ unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
+ unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
+ builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
+
+ unsigned int nelts = const_vector_encoded_nelts (x);
+ for (unsigned int i = 0; i < nelts; ++i)
+ {
+ rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
+ if (!CONST_INT_P (elt))
+ return false;
+
+ builder.quick_push (elt);
+ for (unsigned int j = 1; j < factor; ++j)
+ builder.quick_push (const0_rtx);
+ }
+ builder.finalize ();
+ return true;
+}
+
+/* BUILDER contains a predicate constant of mode VNx16BI. Return the
+ widest predicate element size it can have (that is, the largest size
+ for which each element would still be 0 or 1). */
+
+unsigned int
+aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
+{
+ /* Start with the most optimistic assumption: that we only need
+ one bit per pattern. This is what we will use if only the first
+ bit in each pattern is ever set. */
+ unsigned int mask = GET_MODE_SIZE (DImode);
+ mask |= builder.npatterns ();
+
+ /* Look for set bits. */
+ unsigned int nelts = builder.encoded_nelts ();
+ for (unsigned int i = 1; i < nelts; ++i)
+ if (INTVAL (builder.elt (i)) != 0)
+ {
+ if (i & 1)
+ return 1;
+ mask |= i;
+ }
+ return mask & -mask;
+}
+
+/* BUILDER is a predicate constant of mode VNx16BI. Consider the value
+   that the constant would have with predicate element size ELT_SIZE
+   (ignoring the upper bits in each element) and return:
+
+   * -1 if all bits are set
+   * N if the predicate has N leading set bits followed by all clear bits
+   * 0 if the predicate does not have any of these forms. */
+
+int
+aarch64_partial_ptrue_length (rtx_vector_builder &builder,
+   unsigned int elt_size)
+{
+  /* With 3 elements per pattern we may have set bits followed by clear
+     bits followed by set bits; conservatively give up on that case. */
+  if (builder.nelts_per_pattern () == 3)
+    return 0;
+
+  /* Skip over leading set bits. */
+  unsigned int nelts = builder.encoded_nelts ();
+  unsigned int i = 0;
+  for (; i < nelts; i += elt_size)
+    if (INTVAL (builder.elt (i)) == 0)
+      break;
+  unsigned int vl = i / elt_size;
+
+  /* Check for the all-true case. */
+  if (i == nelts)
+    return -1;
+
+  /* If nelts_per_pattern is 1, then either VL is zero, or we have a
+     repeating pattern of set bits followed by clear bits. */
+  if (builder.nelts_per_pattern () != 2)
+    return 0;
+
+  /* We have a "foreground" value and a duplicated "background" value.
+     If the background might repeat and the last set bit belongs to it,
+     we might have set bits followed by clear bits followed by set bits. */
+  if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
+    return 0;
+
+  /* Make sure that the rest are all clear. */
+  for (; i < nelts; i += elt_size)
+    if (INTVAL (builder.elt (i)) != 0)
+      return 0;
+
+  return vl;
+}
+
+/* See if there is an svpattern that encodes an SVE predicate of mode
+ PRED_MODE in which the first VL bits are set and the rest are clear.
+ Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
+ A VL of -1 indicates an all-true vector. */
+
+aarch64_svpattern
+aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
+{
+ if (vl < 0)
+ return AARCH64_SV_ALL;
+
+ if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
+ return AARCH64_NUM_SVPATTERNS;
+
+ if (vl >= 1 && vl <= 8)
+ return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
+
+ if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
+ return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
+
+ int max_vl;
+ if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
+ {
+ if (vl == (max_vl / 3) * 3)
+ return AARCH64_SV_MUL3;
+ /* These would only trigger for non-power-of-2 lengths. */
+ if (vl == (max_vl & -4))
+ return AARCH64_SV_MUL4;
+ if (vl == (1 << floor_log2 (max_vl)))
+ return AARCH64_SV_POW2;
+ if (vl == max_vl)
+ return AARCH64_SV_ALL;
+ }
+ return AARCH64_NUM_SVPATTERNS;
+}
+
/* Return an all-true predicate register of mode MODE. */
rtx
@@ -3447,6 +3617,17 @@ aarch64_expand_sve_const_vector (rtx tar
return target;
}
+/* Use WHILE to set predicate register DEST so that the first VL bits
+ are set and the rest are clear. */
+
+static void
+aarch64_sve_move_pred_via_while (rtx dest, unsigned int vl)
+{
+ rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
+ emit_insn (gen_while_ult (DImode, GET_MODE (dest),
+ dest, const0_rtx, limit));
+}
+
/* Set DEST to immediate IMM. */
void
@@ -3580,6 +3761,19 @@ aarch64_expand_mov_immediate (rtx dest,
return;
}
+ rtx_vector_builder builder;
+ if (GET_MODE_CLASS (GET_MODE (imm)) == MODE_VECTOR_BOOL
+ && aarch64_get_sve_pred_bits (builder, imm))
+ {
+ unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
+ int vl = aarch64_partial_ptrue_length (builder, elt_size);
+ if (vl > 0)
+ {
+ aarch64_sve_move_pred_via_while (dest, vl);
+ return;
+ }
+ }
+
if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
{
@@ -14776,6 +14970,44 @@ aarch64_sve_valid_immediate (unsigned HO
return false;
}
+/* Return true if X is a valid SVE predicate. If INFO is nonnull, use
+ it to describe valid immediates. */
+
+static bool
+aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
+{
+ if (x == CONST0_RTX (GET_MODE (x)))
+ {
+ if (info)
+ *info = simd_immediate_info (DImode, 0);
+ return true;
+ }
+
+ /* Analyze the value as a VNx16BImode. This should be relatively
+ efficient, since rtx_vector_builder has enough built-in capacity
+ to store all VLA predicate constants without needing the heap. */
+ rtx_vector_builder builder;
+ if (!aarch64_get_sve_pred_bits (builder, x))
+ return false;
+
+ unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
+ if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
+ {
+ machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
+ aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
+ if (pattern != AARCH64_NUM_SVPATTERNS)
+ {
+ if (info)
+ {
+ scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
+ *info = simd_immediate_info (int_mode, pattern);
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
/* Return true if OP is a valid SIMD immediate for the operation
described by WHICH. If INFO is nonnull, use it to describe valid
immediates. */
@@ -14788,6 +15020,9 @@ aarch64_simd_valid_immediate (rtx op, si
if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
return false;
+ if (vec_flags & VEC_SVE_PRED)
+ return aarch64_sve_pred_valid_immediate (op, info);
+
scalar_mode elt_mode = GET_MODE_INNER (mode);
rtx base, step;
unsigned int n_elts;
@@ -14812,21 +15047,6 @@ aarch64_simd_valid_immediate (rtx op, si
else
return false;
- /* Handle PFALSE and PTRUE. */
- if (vec_flags & VEC_SVE_PRED)
- {
- if (op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode))
- {
- if (info)
- {
- scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
- *info = simd_immediate_info (int_mode, op == CONSTM1_RTX (mode));
- }
- return true;
- }
- return false;
- }
-
scalar_float_mode elt_float_mode;
if (n_elts == 1
&& is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
@@ -16570,14 +16790,23 @@ aarch64_output_sve_mov_immediate (rtx co
if (aarch64_sve_pred_mode_p (vec_mode))
{
static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
- unsigned int total_bytes;
- if (info.u.mov.value == const0_rtx)
- snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
- else if (BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
- snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
- total_bytes / GET_MODE_SIZE (info.elt_mode));
+ if (info.insn == simd_immediate_info::MOV)
+ {
+ gcc_assert (info.u.mov.value == const0_rtx);
+ snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
+ }
else
- snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", element_char);
+ {
+ gcc_assert (info.insn == simd_immediate_info::PTRUE);
+ unsigned int total_bytes;
+ if (info.u.pattern == AARCH64_SV_ALL
+ && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
+ snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
+ total_bytes / GET_MODE_SIZE (info.elt_mode));
+ else
+ snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
+ svpattern_token (info.u.pattern));
+ }
return buf;
}
===================================================================
@@ -9,29 +9,30 @@ #define TEST_LOOP(TYPE) \
void \
multi_loop_##TYPE (TYPE *x, TYPE val) \
{ \
- for (int i = 0; i < 7; ++i) \
+ for (int i = 0; i < 9; ++i) \
x[i] += val; \
consumer (x); \
- for (int i = 0; i < 7; ++i) \
+ for (int i = 0; i < 9; ++i) \
x[i] += val; \
consumer (x); \
- for (int i = 0; i < 7; ++i) \
+ for (int i = 0; i < 9; ++i) \
x[i] += val; \
consumer (x); \
}
/* One iteration is enough. */
TEST_LOOP (uint8_t);
+/* Two iterations are enough.  The last two loops are specialized
+   according to whether the first loop executes once or twice. */
TEST_LOOP (uint16_t);
-/* Two iterations are enough. Complete unrolling makes sense
- even at -O2. */
+/* Three iterations are needed; ought to stay a loop. */
TEST_LOOP (uint32_t);
-/* Four iterations are needed; ought to stay a loop. */
+/* Five iterations are needed; ought to stay a loop. */
TEST_LOOP (uint64_t);
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.b} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.h} 3 } } */
-/* { dg-final { scan-assembler {\twhilelo\tp[0-9]\.s} } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.h} 8 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.s} 6 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.d} 6 } } */
/* { dg-final { scan-assembler-not {\tldr\tz[0-9]} } } */
/* { dg-final { scan-assembler-not {\tstr\tz[0-9]} } } */
===================================================================
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ vec_while_##TYPE (TYPE *restrict a) \
+ { \
+ for (int i = 0; i < 7; ++i) \
+ a[i] += 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (int16_t) \
+ T (int32_t) \
+ T (int64_t)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl7\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl7\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
===================================================================
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ vec_while_##TYPE (TYPE *restrict a) \
+ { \
+ for (int i = 0; i < 8; ++i) \
+ a[i] += 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (int16_t) \
+ T (int32_t) \
+ T (int64_t)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl8\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl8\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
===================================================================
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ vec_while_##TYPE (TYPE *restrict a) \
+ { \
+ for (int i = 0; i < 9; ++i) \
+ a[i] += 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (int16_t) \
+ T (int32_t) \
+ T (int64_t)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b,} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
===================================================================
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ vec_while_##TYPE (TYPE *restrict a) \
+ { \
+ for (int i = 0; i < 16; ++i) \
+ a[i] += 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (int16_t) \
+ T (int32_t) \
+ T (int64_t)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl16\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
===================================================================
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=512" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE, COUNT) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ vec_while_##TYPE (TYPE *restrict a) \
+ { \
+ for (int i = 0; i < COUNT; ++i) \
+ a[i] += 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int8_t, 63) \
+ T (int16_t, 30) \
+ T (int32_t, 15) \
+ T (int64_t, 6)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, mul3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, mul3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, mul3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl6\n} 1 } } */