
[AArch64,1/2] Improve codegen of vector compares inc. tst instruction

Message ID 53F32A86.4060109@arm.com
State New

Commit Message

Alan Lawrence Aug. 19, 2014, 10:44 a.m. UTC
Vector comparisons are sometimes generated with needless 'not' instructions, and
'cmtst' is generally not output at all. This patch makes
gen_aarch64_vcond_internal smarter about swapping the operands to both the
comparison and the conditional move, so that the 'not' is avoided where
possible. It also updates the 'cmtst' pattern to reflect that the RTX (ne ...)
is no longer generated [and that (not (neg (eq ...))) is rewritten by
simplify_rtx to (plus (eq ...) -1)].
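
To see why the rewritten 'cmtst' pattern matches (plus ... -1): in two's
complement, not (neg x) = -(-x) - 1 = x - 1, so the inverted cmeq mask that
aarch64_vcond_internal produces reaches combine as (plus (eq (and x y) 0) -1)
rather than the natural (neg (ne (and x y) 0)). As an illustrative sketch (my
example, not part of the patch), an intrinsics call like the following should
now compile to a single cmtst rather than and + cmeq + not:

#include <arm_neon.h>

/* Each lane of the result is all-ones where (a & b) is nonzero in that
   lane, and zero elsewhere -- exactly the cmtst semantics.  */
uint32x4_t
test_tst (int32x4_t a, int32x4_t b)
{
  return vtstq_s32 (a, b);
}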

The new tests are written in terms of the Neon intrinsics - so not 100%
exhaustive - but the second patch will rewrite the Neon intrinsics in terms of
a more comprehensive set of gcc-vector-extension comparisons.
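
For reference, such gcc-vector-extension comparisons yield a 0/-1 mask per
lane; a minimal sketch (mine, not from either patch):

typedef int v4si __attribute__ ((vector_size (16)));

/* Lane-wise signed compare: each result lane is -1 where a > b holds and
   0 elsewhere, so this should map onto a single cmgt with no trailing
   not or bsl.  */
v4si
vec_gt (v4si a, v4si b)
{
  return a > b;
}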

Bootstrapped on aarch64-none-linux-gnu and cross-tested check-gcc on 
aarch64-none-elf and aarch64_be-none-elf.

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.c (aarch64_types_cmtst_qualifiers,
	TYPES_TST): Define.
	(aarch64_fold_builtin): Update pattern for cmtst.

	* config/aarch64/aarch64-protos.h (aarch64_const_vec_all_same_int_p):
	Declare.

	* config/aarch64/aarch64-simd-builtins.def (cmtst): Update qualifiers.

	* config/aarch64/aarch64-simd.md (aarch64_vcond_internal<mode><mode>):
	Switch operands, separate out more cases, refactor.

	(aarch64_cmtst<mode>): Rewrite pattern to match (plus ... -1).

	* config/aarch64/aarch64.c (aarch64_const_vec_all_same_int_p): Take
	single argument; rename old version to...
	(aarch64_const_vec_all_same_in_range_p): ...this.
	(aarch64_print_operand, aarch64_simd_shift_imm_p): Follow renaming.

	* config/aarch64/predicates.md (aarch64_simd_imm_minus_one): Define.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/int_comparisons.x: New file.
	* gcc.target/aarch64/simd/int_comparisons_1.c: New test.
	* gcc.target/aarch64/simd/int_comparisons_2.c: Ditto.

Comments

Marcus Shawcroft Sept. 2, 2014, 3:17 p.m. UTC | #1
On 19 August 2014 11:44, Alan Lawrence <alan.lawrence@arm.com> wrote:

> [ChangeLog snipped]

OK /Marcus
Christophe Lyon Sept. 8, 2014, 12:52 p.m. UTC | #2
Hi Alan,

In my cross-testing I've noticed that your new test:
gcc.target/aarch64/simd/int_comparisons_1.c scan-assembler-not not
is PASS for targets aarch64-none-elf and aarch64_be-none-elf, but
FAIL for aarch64-none-linux-gnu.

It seems this is not what you saw in your own validations?

Christophe.



On 2 September 2014 17:17, Marcus Shawcroft <marcus.shawcroft@gmail.com> wrote:
> On 19 August 2014 11:44, Alan Lawrence <alan.lawrence@arm.com> wrote:
>> [ChangeLog snipped]
> OK /Marcus
Alan Lawrence Sept. 8, 2014, 4:12 p.m. UTC | #3
Hmmm, thanks for the heads-up. Now reproduced. Looks like a TCL regexp issue;
should have a fix shortly.

Cheers,
--Alan

Christophe Lyon wrote:
> [quote of Christophe's report snipped]
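
A likely cause, though not confirmed in this thread: the bare pattern in
scan-assembler-not "not" also matches assembler text that only linux-gnu
targets emit, such as the .note.GNU-stack section directive ("note" contains
"not"). Anchoring the mnemonic with whitespace, as the test's other scan
directives already do, would sidestep this. An illustrative sketch of such a
directive (an assumption about the eventual fix, not taken from this thread):

/* Match only a 'not' mnemonic delimited by whitespace, so that e.g.
   ".note.GNU-stack" in linux-gnu assembler output no longer trips the
   scan.  */
/* { dg-final { scan-assembler-not "\[ \t\]not\[ \t\]" } } */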

Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 5217f4a..4fb8ec0 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -146,6 +146,11 @@  aarch64_types_binop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_none, qualifier_maybe_immediate };
 #define TYPES_BINOP (aarch64_types_binop_qualifiers)
 static enum aarch64_type_qualifiers
+aarch64_types_cmtst_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_none, qualifier_none,
+      qualifier_internal, qualifier_internal };
+#define TYPES_TST (aarch64_types_cmtst_qualifiers)
+static enum aarch64_type_qualifiers
 aarch64_types_binopv_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_void, qualifier_none, qualifier_none };
 #define TYPES_BINOPV (aarch64_types_binopv_qualifiers)
@@ -1297,7 +1302,7 @@  aarch64_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED, tree *args,
       BUILTIN_VALLDI (BINOP, cmeq, 0)
 	return fold_build2 (EQ_EXPR, type, args[0], args[1]);
 	break;
-      BUILTIN_VSDQ_I_DI (BINOP, cmtst, 0)
+      BUILTIN_VSDQ_I_DI (TST, cmtst, 0)
 	{
 	  tree and_node = fold_build2 (BIT_AND_EXPR, type, args[0], args[1]);
 	  tree vec_zero_node = build_zero_cst (type);
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index cca3bc9..5c8013d 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -179,6 +179,7 @@  bool aarch64_cannot_change_mode_class (enum machine_mode,
 				       enum reg_class);
 enum aarch64_symbol_type
 aarch64_classify_symbolic_expression (rtx, enum aarch64_symbol_context);
+bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
 bool aarch64_constant_address_p (rtx);
 bool aarch64_expand_movmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 4f3bd12..6aa45b6 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -246,7 +246,7 @@ 
   /* Implemented by aarch64_cm<cmp><mode>.  */
   BUILTIN_VSDQ_I_DI (BINOP, cmgeu, 0)
   BUILTIN_VSDQ_I_DI (BINOP, cmgtu, 0)
-  BUILTIN_VSDQ_I_DI (BINOP, cmtst, 0)
+  BUILTIN_VSDQ_I_DI (TST, cmtst, 0)
 
   /* Implemented by reduc_<sur>plus_<mode>.  */
   BUILTIN_VALL (UNOP, reduc_splus_, 10)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f5fa4ae..4d5d840 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1871,58 +1871,94 @@ 
 	  (match_operand:VDQ 2 "nonmemory_operand")))]
   "TARGET_SIMD"
 {
-  int inverse = 0, has_zero_imm_form = 0;
   rtx op1 = operands[1];
   rtx op2 = operands[2];
   rtx mask = gen_reg_rtx (<MODE>mode);
+  enum rtx_code code = GET_CODE (operands[3]);
+
+  /* Switching OP1 and OP2 is necessary for NE (to output a cmeq insn),
+     and desirable for other comparisons if it results in FOO ? -1 : 0
+     (this allows direct use of the comparison result without a bsl).  */
+  if (code == NE
+      || (code != EQ
+	  && op1 == CONST0_RTX (<V_cmp_result>mode)
+	  && op2 == CONSTM1_RTX (<V_cmp_result>mode)))
+    {
+      op1 = operands[2];
+      op2 = operands[1];
+      switch (code)
+        {
+        case LE: code = GT; break;
+        case LT: code = GE; break;
+        case GE: code = LT; break;
+        case GT: code = LE; break;
+        /* No case EQ.  */
+        case NE: code = EQ; break;
+        case LTU: code = GEU; break;
+        case LEU: code = GTU; break;
+        case GTU: code = LEU; break;
+        case GEU: code = LTU; break;
+        default: gcc_unreachable ();
+        }
+    }
 
-  switch (GET_CODE (operands[3]))
+  /* Make sure we can handle the last operand.  */
+  switch (code)
     {
+    case NE:
+      /* Normalized to EQ above.  */
+      gcc_unreachable ();
+
     case LE:
     case LT:
-    case NE:
-      inverse = 1;
-      /* Fall through.  */
     case GE:
     case GT:
     case EQ:
-      has_zero_imm_form = 1;
-      break;
-    case LEU:
-    case LTU:
-      inverse = 1;
-      break;
+      /* These instructions have a form taking an immediate zero.  */
+      if (operands[5] == CONST0_RTX (<MODE>mode))
+        break;
+      /* Fall through, as may need to load into register.  */
     default:
+      if (!REG_P (operands[5]))
+        operands[5] = force_reg (<MODE>mode, operands[5]);
       break;
     }
 
-  if (!REG_P (operands[5])
-      && (operands[5] != CONST0_RTX (<MODE>mode) || !has_zero_imm_form))
-    operands[5] = force_reg (<MODE>mode, operands[5]);
-
-  switch (GET_CODE (operands[3]))
+  switch (code)
     {
     case LT:
+      emit_insn (gen_aarch64_cmlt<mode> (mask, operands[4], operands[5]));
+      break;
+
     case GE:
       emit_insn (gen_aarch64_cmge<mode> (mask, operands[4], operands[5]));
       break;
 
     case LE:
+      emit_insn (gen_aarch64_cmle<mode> (mask, operands[4], operands[5]));
+      break;
+
     case GT:
       emit_insn (gen_aarch64_cmgt<mode> (mask, operands[4], operands[5]));
       break;
 
     case LTU:
+      emit_insn (gen_aarch64_cmgtu<mode> (mask, operands[5], operands[4]));
+      break;
+
     case GEU:
       emit_insn (gen_aarch64_cmgeu<mode> (mask, operands[4], operands[5]));
       break;
 
     case LEU:
+      emit_insn (gen_aarch64_cmgeu<mode> (mask, operands[5], operands[4]));
+      break;
+
     case GTU:
       emit_insn (gen_aarch64_cmgtu<mode> (mask, operands[4], operands[5]));
       break;
 
-    case NE:
+    /* NE has been normalized to EQ above.  */
     case EQ:
       emit_insn (gen_aarch64_cmeq<mode> (mask, operands[4], operands[5]));
       break;
@@ -1931,12 +1967,6 @@ 
       gcc_unreachable ();
     }
 
-  if (inverse)
-    {
-      op1 = operands[2];
-      op2 = operands[1];
-    }
-
     /* If we have (a = (b CMP c) ? -1 : 0);
        Then we can simply move the generated mask.  */
 
@@ -3891,14 +3921,22 @@ 
 
 ;; cmtst
 
+;; Although neg (ne (and x y) 0) is the natural way of expressing a cmtst,
+;; we don't have any insns using ne, and aarch64_vcond_internal outputs
+;; not (neg (eq (and x y) 0))
+;; which is rewritten by simplify_rtx as
+;; plus (eq (and x y) 0) -1.
+
 (define_insn "aarch64_cmtst<mode>"
   [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w")
-	(neg:<V_cmp_result>
-	  (ne:<V_cmp_result>
+	(plus:<V_cmp_result>
+	  (eq:<V_cmp_result>
 	    (and:VDQ
 	      (match_operand:VDQ 1 "register_operand" "w")
 	      (match_operand:VDQ 2 "register_operand" "w"))
-	    (vec_duplicate:<V_cmp_result> (const_int 0)))))]
+	    (match_operand:VDQ 3 "aarch64_simd_imm_zero"))
+	  (match_operand:<V_cmp_result> 4 "aarch64_simd_imm_minus_one")))
+  ]
   "TARGET_SIMD"
   "cmtst\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
   [(set_attr "type" "neon_tst<q>")]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e7946fc..6a877c2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -137,9 +137,6 @@  static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 static void aarch64_override_options_after_change (void);
 static bool aarch64_vector_mode_supported_p (enum machine_mode);
 static unsigned bit_count (unsigned HOST_WIDE_INT);
-static bool aarch64_const_vec_all_same_int_p (rtx,
-					      HOST_WIDE_INT, HOST_WIDE_INT);
-
 static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
 						 const unsigned char *sel);
 static int aarch64_address_cost (rtx, enum machine_mode, addr_space_t, bool);
@@ -3679,6 +3676,36 @@  aarch64_get_condition_code (rtx x)
     }
 }
 
+bool
+aarch64_const_vec_all_same_in_range_p (rtx x,
+				  HOST_WIDE_INT minval,
+				  HOST_WIDE_INT maxval)
+{
+  HOST_WIDE_INT firstval;
+  int count, i;
+
+  if (GET_CODE (x) != CONST_VECTOR
+      || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
+    return false;
+
+  firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
+  if (firstval < minval || firstval > maxval)
+    return false;
+
+  count = CONST_VECTOR_NUNITS (x);
+  for (i = 1; i < count; i++)
+    if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
+      return false;
+
+  return true;
+}
+
+bool
+aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
+{
+  return aarch64_const_vec_all_same_in_range_p (x, val, val);
+}
+
 static unsigned
 bit_count (unsigned HOST_WIDE_INT value)
 {
@@ -3921,9 +3948,10 @@  aarch64_print_operand (FILE *f, rtx x, char code)
 	case CONST_VECTOR:
 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
 	    {
-	      gcc_assert (aarch64_const_vec_all_same_int_p (x,
-							    HOST_WIDE_INT_MIN,
-							    HOST_WIDE_INT_MAX));
+	      gcc_assert (
+		  aarch64_const_vec_all_same_in_range_p (x,
+							 HOST_WIDE_INT_MIN,
+							 HOST_WIDE_INT_MAX));
 	      asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
 	    }
 	  else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
@@ -7826,39 +7854,15 @@  aarch64_simd_valid_immediate (rtx op, enum machine_mode mode, bool inverse,
 #undef CHECK
 }
 
-static bool
-aarch64_const_vec_all_same_int_p (rtx x,
-				  HOST_WIDE_INT minval,
-				  HOST_WIDE_INT maxval)
-{
-  HOST_WIDE_INT firstval;
-  int count, i;
-
-  if (GET_CODE (x) != CONST_VECTOR
-      || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
-    return false;
-
-  firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
-  if (firstval < minval || firstval > maxval)
-    return false;
-
-  count = CONST_VECTOR_NUNITS (x);
-  for (i = 1; i < count; i++)
-    if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
-      return false;
-
-  return true;
-}
-
 /* Check of immediate shift constants are within range.  */
 bool
 aarch64_simd_shift_imm_p (rtx x, enum machine_mode mode, bool left)
 {
   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
   if (left)
-    return aarch64_const_vec_all_same_int_p (x, 0, bit_width - 1);
+    return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
   else
-    return aarch64_const_vec_all_same_int_p (x, 1, bit_width);
+    return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
 }
 
 /* Return true if X is a uniform vector where all elements
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 3dd83ca..18133eb 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -261,3 +261,9 @@ 
 {
   return aarch64_simd_imm_zero_p (op, mode);
 })
+
+(define_special_predicate "aarch64_simd_imm_minus_one"
+  (match_code "const_vector")
+{
+  return aarch64_const_vec_all_same_int_p (op, -1);
+})
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/int_comparisons.x b/gcc/testsuite/gcc.target/aarch64/simd/int_comparisons.x
new file mode 100644
index 0000000..3b468eb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/int_comparisons.x
@@ -0,0 +1,68 @@ 
+/*  test_vcXXX wrappers for all the vcXXX (vector compare) and vtst intrinsics
+    in arm_neon.h (excluding the 64x1 variants as these generally produce scalar
+    not vector ops).  */
+#include "arm_neon.h"
+
+#define DONT_FORCE(X)
+
+#define FORCE_SIMD(V1)   asm volatile ("mov %d0, %1.d[0]"       \
+           : "=w"(V1)                                           \
+           : "w"(V1)                                            \
+           : /* No clobbers */);
+
+#define OP1(SIZE, OP, BASETYPE, SUFFIX, FORCE) uint##SIZE##_t	\
+test_v##OP##SUFFIX (BASETYPE##SIZE##_t a)			\
+{								\
+  uint##SIZE##_t res;						\
+  FORCE (a);							\
+  res = v##OP##SUFFIX (a);					\
+  FORCE (res);							\
+  return res;							\
+}
+
+#define OP2(SIZE, OP, BASETYPE, SUFFIX, FORCE) uint##SIZE##_t	\
+test_v##OP##SUFFIX (BASETYPE##SIZE##_t a, BASETYPE##SIZE##_t b) \
+{								\
+  uint##SIZE##_t res;						\
+  FORCE (a);							\
+  FORCE (b);							\
+  res = v##OP##SUFFIX (a, b);					\
+  FORCE (res);							\
+  return res;							\
+}
+
+#define UNSIGNED_OPS(SIZE, BASETYPE, SUFFIX, FORCE) \
+OP2 (SIZE, tst, BASETYPE, SUFFIX, FORCE) \
+OP1 (SIZE, ceqz, BASETYPE, SUFFIX, FORCE) \
+OP2 (SIZE, ceq, BASETYPE, SUFFIX, FORCE) \
+OP2 (SIZE, cge, BASETYPE, SUFFIX, FORCE) \
+OP2 (SIZE, cgt, BASETYPE, SUFFIX, FORCE) \
+OP2 (SIZE, cle, BASETYPE, SUFFIX, FORCE) \
+OP2 (SIZE, clt, BASETYPE, SUFFIX, FORCE)
+
+#define ALL_OPS(SIZE, BASETYPE, SUFFIX, FORCE) \
+OP1 (SIZE, cgez, BASETYPE, SUFFIX, FORCE) \
+OP1 (SIZE, cgtz, BASETYPE, SUFFIX, FORCE) \
+OP1 (SIZE, clez, BASETYPE, SUFFIX, FORCE) \
+OP1 (SIZE, cltz, BASETYPE, SUFFIX, FORCE) \
+UNSIGNED_OPS (SIZE, BASETYPE, SUFFIX, FORCE)
+
+ALL_OPS (8x8, int, _s8, DONT_FORCE)
+ALL_OPS (16x4, int, _s16, DONT_FORCE)
+ALL_OPS (32x2, int, _s32, DONT_FORCE)
+ALL_OPS (64x1, int, _s64, DONT_FORCE)
+ALL_OPS (64, int, d_s64, FORCE_SIMD)
+ALL_OPS (8x16, int, q_s8, DONT_FORCE)
+ALL_OPS (16x8, int, q_s16, DONT_FORCE)
+ALL_OPS (32x4, int, q_s32, DONT_FORCE)
+ALL_OPS (64x2, int, q_s64, DONT_FORCE)
+UNSIGNED_OPS (8x8, uint, _u8, DONT_FORCE)
+UNSIGNED_OPS (16x4, uint, _u16, DONT_FORCE)
+UNSIGNED_OPS (32x2, uint, _u32, DONT_FORCE)
+UNSIGNED_OPS (64x1, uint, _u64, DONT_FORCE)
+UNSIGNED_OPS (64, uint, d_u64, FORCE_SIMD)
+UNSIGNED_OPS (8x16, uint, q_u8, DONT_FORCE)
+UNSIGNED_OPS (16x8, uint, q_u16, DONT_FORCE)
+UNSIGNED_OPS (32x4, uint, q_u32, DONT_FORCE)
+UNSIGNED_OPS (64x2, uint, q_u64, DONT_FORCE)
+
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/int_comparisons_1.c b/gcc/testsuite/gcc.target/aarch64/simd/int_comparisons_1.c
new file mode 100644
index 0000000..86c6ed2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/int_comparisons_1.c
@@ -0,0 +1,47 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline" } */
+
+/* Scan-assembler test, so, incorporate as little other code as possible.  */
+
+#include "arm_neon.h"
+#include "int_comparisons.x"
+
+/* Operations on all 18 integer types:  (q?)_[su](8|16|32|64), d_[su]64.
+   (d?)_[us]64 generate regs of form 'd0' rather than e.g. 'v0.2d'.  */
+/* { dg-final { scan-assembler-times "\[ \t\]cmeq\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 14 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmeq\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?0" 4 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmeq\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmeq\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmtst\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmtst\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
+
+/* vcge + vcle both implemented with cmge (signed) or cmhs (unsigned).  */
+/* { dg-final { scan-assembler-times "\[ \t\]cmge\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmge\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmhs\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmhs\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
+
+/* vcgt + vclt both implemented with cmgt (signed) or cmhi (unsigned).  */
+/* { dg-final { scan-assembler-times "\[ \t\]cmgt\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmgt\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmhi\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmhi\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
+
+/* Comparisons against immediate zero, on the 8 signed integer types only.  */
+
+/* { dg-final { scan-assembler-times "\[ \t\]cmge\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 7 } } */
+/*  For int64_t and int64x1_t, combine_simplify_rtx failure of
+    https://gcc.gnu.org/ml/gcc/2014-06/msg00253.html
+    prevents generation of cmge....#0, instead producing mvn + sshr.  */
+/* { #dg-final { scan-assembler-times "\[ \t\]cmge\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?0" 2 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmgt\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 7 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmgt\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?0" 2 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmle\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 7 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmle\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?0" 2 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmlt\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 7 } } */
+/* For int64_t and int64x1_t, cmlt ... #0 and sshr ... #63 are equivalent,
+   so allow either.  cmgez issue above results in extra 2 * sshr....63.  */
+/* { dg-final { scan-assembler-times "\[ \t\](?:cmlt|sshr)\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?(?:0|63)" 4 } } */
+
+// All should have been compiled into single insns without inverting result:
+/* { dg-final { scan-assembler-not "not" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/int_comparisons_2.c b/gcc/testsuite/gcc.target/aarch64/simd/int_comparisons_2.c
new file mode 100644
index 0000000..3588231
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/int_comparisons_2.c
@@ -0,0 +1,131 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -fno-inline" } */
+/* Stops the test_xxx methods being inlined into main, thus preventing constant
+   propagation.  */
+
+#include "int_comparisons.x"
+
+extern void abort (void);
+
+#define CHECK2(R0, R1) if (res[0] != R0 || res[1] != R1) abort ()
+
+#define TEST2(BASETYPE, SUFFIX, RESTYPE, ST1_SUFFIX) {			\
+  BASETYPE##_t _a[2] = {2, 3};						\
+  BASETYPE##x2_t a = vld1##SUFFIX (_a);					\
+  BASETYPE##_t _b[2] = {1, 3};						\
+  BASETYPE##x2_t b = vld1##SUFFIX (_b);					\
+  RESTYPE res[2];							\
+  vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (a, b)); CHECK2 (0, 0);	\
+  vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (b, a)); CHECK2 (-1, 0);	\
+  vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (a, b)); CHECK2 (0, -1);	\
+  vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (b, a)); CHECK2 (-1, -1);	\
+  vst1##ST1_SUFFIX (res, test_vceq##SUFFIX (a, b)); CHECK2 (0, -1);	\
+  vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (a, b)); CHECK2 (-1, -1);	\
+  vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (b, a)); CHECK2 (0, -1);	\
+  vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (a, b)); CHECK2 (-1, 0);	\
+  vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (b, a)); CHECK2 (0, 0);	\
+  vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a, b)); CHECK2 (0, -1);	\
+  vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a + 1, b)); CHECK2 (-1, 0); \
+}
+
+#define CHECK4(T, R0, R1, R2, R3)		\
+  if (res[0] != (T)R0 || res[1] != (T)R1	\
+      || res[2] != (T)R2 || res[3] != (T)R3) abort ()
+
+#define TEST4(BASETYPE, SUFFIX, RESTYPE, ST1_SUFFIX) {	\
+  BASETYPE##_t _a[4] = {1, 2, 3, 4};			\
+  BASETYPE##x4_t a = vld1##SUFFIX (_a);			\
+  BASETYPE##_t _b[4] = {4, 2, 1, 3};			\
+  BASETYPE##x4_t b = vld1##SUFFIX (_b);			\
+  RESTYPE res[4];					\
+  vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (a, b));	\
+  CHECK4 (RESTYPE, -1, 0, 0, 0);			\
+  vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (a, b));	\
+  CHECK4 (RESTYPE, -1, -1, 0, 0);			\
+  vst1##ST1_SUFFIX (res, test_vceq##SUFFIX (a, b));	\
+  CHECK4 (RESTYPE, 0, -1, 0, 0);			\
+  vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (a, b));	\
+  CHECK4 (RESTYPE, 0, -1, -1, -1);			\
+  vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (a, b));	\
+  CHECK4 (RESTYPE, 0, 0, -1, -1);			\
+  vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a, b));	\
+  CHECK4 (RESTYPE, 0, -1, -1, 0);			\
+}
+
+#define CHECK8(T, R0, R1, R2, R3, R4, R5, R6, R7)			       \
+  if (res[0] != (T)R0 || res[1] != (T)R1 || res[2] != (T)R2 || res[3] != (T)R3 \
+      || res[4] != (T)R4 || res[5] != (T)R5 || res[6] != (T)R6		       \
+      || res[7] != (T)R7) abort ()
+
+#define TEST8(BASETYPE, SUFFIX, RESTYPE, ST1_SUFFIX) {	\
+  BASETYPE##_t _a[8] = {1, 2, 3, 4, 5, 6, 7, 8};	\
+  BASETYPE##x8_t a = vld1##SUFFIX (_a);			\
+  BASETYPE##_t _b[8] = {4, 2, 1, 3, 2, 6, 8, 9};	\
+  BASETYPE##x8_t b = vld1##SUFFIX (_b);			\
+  RESTYPE res[8];					\
+  vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (a, b));	\
+  CHECK8 (RESTYPE, -1, 0, 0, 0, 0, 0, -1, -1);		\
+  vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (a, b));	\
+  CHECK8 (RESTYPE, -1, -1, 0, 0, 0, -1, -1, -1);	\
+  vst1##ST1_SUFFIX (res, test_vceq##SUFFIX (a, b));	\
+  CHECK8 (RESTYPE, 0, -1, 0, 0, 0, -1, 0, 0);		\
+  vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (a, b));	\
+  CHECK8 (RESTYPE, 0, -1, -1, -1, -1, -1, 0, 0);	\
+  vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (a, b));	\
+  CHECK8 (RESTYPE, 0, 0, -1, -1, -1, 0, 0, 0);		\
+  vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a, b));	\
+  CHECK8 (RESTYPE, 0, -1, -1, 0, 0, -1, 0, -1);		\
+}
+
+/* 16-way tests use same 8 values twice.  */
+#define CHECK16(T, R0, R1, R2, R3, R4, R5, R6, R7)			       \
+  if (res[0] != (T)R0 || res[1] != (T)R1 || res[2] != (T)R2 || res[3] != (T)R3 \
+      || res[4] != (T)R4 || res[5] != (T)R5 || res[6] != (T)R6		       \
+      || res[7] != (T)R7 || res[8] != (T)R0 || res[9] != (T)R1		       \
+      || res[10] != (T)R2 || res[11] != (T)R3 || res[12] != (T)R4	       \
+      || res[13] != (T)R5 || res[14] != (T)R6 || res[15] != (T)R7) abort ()
+
+#define TEST16(BASETYPE, SUFFIX, RESTYPE, ST1_SUFFIX) {			  \
+  BASETYPE##_t _a[16] = {1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}; \
+  BASETYPE##x16_t a = vld1##SUFFIX (_a);				  \
+  BASETYPE##_t _b[16] = {4, 2, 1, 3, 2, 6, 8, 9, 4, 2, 1, 3, 2, 6, 8, 9}; \
+  BASETYPE##x16_t b = vld1##SUFFIX (_b);				  \
+  RESTYPE res[16];							  \
+  vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (a, b));			  \
+  CHECK16 (RESTYPE, -1, 0, 0, 0, 0, 0, -1, -1);				  \
+  vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (a, b));			  \
+  CHECK16 (RESTYPE, -1, -1, 0, 0, 0, -1, -1, -1);			  \
+  vst1##ST1_SUFFIX (res, test_vceq##SUFFIX (a, b));			  \
+  CHECK16 (RESTYPE, 0, -1, 0, 0, 0, -1, 0, 0);				  \
+  vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (a, b));			  \
+  CHECK16 (RESTYPE, 0, -1, -1, -1, -1, -1, 0, 0);			  \
+  vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (a, b));			  \
+  CHECK16 (RESTYPE, 0, 0, -1, -1, -1, 0, 0, 0);				  \
+  vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a, b));			  \
+  CHECK16 (RESTYPE, 0, -1, -1, 0, 0, -1, 0, -1);			  \
+}
+
+int
+main (int argc, char **argv)
+{
+  TEST2 (int32, _s32, uint32_t, _u32);
+  TEST2 (uint32, _u32, uint32_t, _u32);
+  TEST2 (int64, q_s64, uint64_t, q_u64);
+  TEST2 (uint64, q_u64, uint64_t, q_u64);
+
+  TEST4 (int16, _s16, uint16_t, _u16);
+  TEST4 (uint16, _u16, uint16_t, _u16);
+  TEST4 (int32, q_s32, uint32_t, q_u32);
+  TEST4 (uint32, q_u32, uint32_t, q_u32);
+
+  TEST8 (int8, _s8, uint8_t, _u8);
+  TEST8 (uint8, _u8, uint8_t, _u8);
+  TEST8 (int16, q_s16, uint16_t, q_u16);
+  TEST8 (uint16, q_u16, uint16_t, q_u16);
+
+  TEST16 (int8, q_s8, uint8_t, q_u8);
+  TEST16 (uint8, q_u8, uint8_t, q_u8);
+
+  return 0;
+}
+