
[RFC,Vectorizer,AArch64] Fix PR/61114 by redefining REDUC_xxx_EXPR tree codes to return scalars

Message ID 53DB699B.1010506@arm.com
State New

Commit Message

Alan Lawrence Aug. 1, 2014, 10:19 a.m. UTC
This fixes PR/61114 by redefining the REDUC_{MIN,MAX,PLUS}_EXPR tree codes.

These are presently documented as producing a vector with the result in element 
0, and this is inconsistent with their use in tree-vect-loop.c (which on 
bigendian targets pulls the bits out of the other end of the vector result). 
This leads to bugs on bigendian targets - see 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61114 for a small testcase.
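
For illustration, a reduction of roughly this shape is enough to hit the
problem (a sketch only; the PR has the actual testcase):

  /* When vectorized, the reduction epilogue uses REDUC_PLUS_EXPR; on
     big-endian targets it then reads the result back from the wrong lane.  */
  int
  sum_array (const int *a)
  {
    int i, sum = 0;
    for (i = 0; i < 64; i++)
      sum += a[i];
    return sum;
  }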

I discounted "fixing" the vectorizer (to always read from element 0) and then 
making the optab reverse the result on bigendian targets (whose architectural 
insn produces the result in lane N-1), as optimization of vectors in RTL seems 
unlikely to remove such a reverse/permute and so this would lead to a 
performance regression (specifically on PowerPC).

Instead it seems more natural for the tree code to produce a scalar result 
(producing a vector with the result in lane 0 has already caused confusion, e.g. 
https://gcc.gnu.org/ml/gcc-patches/2012-10/msg01100.html).

This patch preserves the meaning of the existing optab (producing a result in 
lane 0 on little-endian architectures or N-1 on bigendian), thus generally 
avoiding the need to change backends. Hence, expr.c extracts an 
endianness-dependent element from the optab result to give the result expected 
for the tree code.
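
As a plain-C model of that extraction (a sketch only, not code from the patch):

  /* V stands for the optab's vector result, NUNITS for its number of lanes,
     BIG_ENDIAN_P for BYTES_BIG_ENDIAN on the target.  */
  static int
  scalar_from_optab_result (const int *v, int nunits, int big_endian_p)
  {
    int lane = big_endian_p ? nunits - 1 : 0;
    return v[lane];
  }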

A significant complication in the AArch64 backend stems from the existence of
builtins for reduction operations, which are gimple_fold'd to the tree code.
Hence, I introduce new define_expands and map the existing
__builtin_aarch64_reduc_s{plus,min,max}_<mode> functions to those, with scalar
result types matching the result of the tree code to which they are still
gimple_fold'd.
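
For example (user-level view of the patched arm_neon.h), vaddv_s32 now simply
returns the builtin's scalar result:

  #include <arm_neon.h>

  int32_t
  sum_pair (int32x2_t v)
  {
    /* With this patch, vaddv_s32 is a plain call to
       __builtin_aarch64_reduc_splus_v2si, which aarch64_gimple_fold_builtin
       turns into a scalar-valued REDUC_PLUS_EXPR.  */
    return vaddv_s32 (v);
  }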

If the proposed solution is acceptable, I'd follow up with a longer patch
series, including some cleanup to tree-vect-loop.c
(vect_create_epilog_for_reduction now has only one case where
extract_scalar_result == true) and a separation of the AArch64 changes.
Further, I'd like to propose a new optab that outputs a scalar directly, as a
migration path away from the existing optab whose meaning is
endianness-dependent, i.e. expand_unop would fall back to the existing optab
only if the new one is not defined.

Patch as it stands has been bootstrapped on x86_64 and regression tested on 
aarch64 and aarch64_be without regressions. On x86_64 there is a regression in 
gcc.target/i386/pr51235.c, where it seems my check in tree-cfg.c is too strict - 
we end up with a reduction from "vector (4) unsigned long int" to "void *". 
(Even if I modify tree-vect-loop.c to build the REDUC_..._EXPR as returning the 
element type of the input vector, its return type is later changed.) It seems I 
can "get away with" a less-strict check in tree-cfg.c, i.e. allowing the case 
where the modes of the expected and actual result types match (rather than 
"useless_type_conversion_p" holding between said types), but if anyone can 
suggest an alternative/better check then it'd be great to hear it...
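
Concretely, the less-strict check would look something like this (a sketch
only, not part of the patch below):

    case REDUC_MAX_EXPR:
    case REDUC_MIN_EXPR:
    case REDUC_PLUS_EXPR:
      if (!VECTOR_TYPE_P (rhs1_type)
          || TYPE_MODE (lhs_type) != TYPE_MODE (TREE_TYPE (rhs1_type)))
        {
          error ("reduction should convert from vector to element type");
          debug_generic_expr (lhs_type);
          debug_generic_expr (rhs1_type);
          return true;
        }
      return false;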

--Alan

Comments

Richard Biener Aug. 1, 2014, 10:39 a.m. UTC | #1
On Fri, Aug 1, 2014 at 12:19 PM, Alan Lawrence <alan.lawrence@arm.com> wrote:
> This fixes PR/61114 by redefining the REDUC_{MIN,MAX,PLUS}_EXPR tree codes.
>
> These are presently documented as producing a vector with the result in
> element 0, and this is inconsistent with their use in tree-vect-loop.c
> (which on bigendian targets pulls the bits out of the other end of the
> vector result). This leads to bugs on bigendian targets - see
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61114 for a small testcase.
>
> [...]
>
> Patch as it stands has been bootstrapped on x86_64 and regression tested on
> aarch64 and aarch64_be without regressions. On x86_64 there is a regression
> in gcc.target/i386/pr51235.c, where it seems my check in tree-cfg.c is too
> strict - we end up with a reduction from "vector (4) unsigned long int" to
> "void *". (Even if I modify tree-vect-loop.c to build the REDUC_..._EXPR as
> returning the element type of the input vector, its return type is later
> changed.) It seems I can "get away with" a less-strict check in tree-cfg.c,
> i.e. allowing the case where the modes of the expected and actual result
> types match (rather than "useless_type_conversion_p" holding between said
> types), but if anyone can suggest an alternative/better check then it'd be
> great to hear it...

We should fix the vectorizer code-generation instead.

Makes sense to me - non-aarch64 parts of the patch are ok.  The optab
migration strategy is as well.

Thanks,
Richard.

> --Alan

Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index fee17ecf637436c8704f565be2eb9ef23891209a..77ed36ecc4cade4c2c6cafd16070198dacb0b869 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -1396,21 +1396,21 @@  aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 
 	  switch (fcode)
 	    {
-	      BUILTIN_VALL (UNOP, reduc_splus_, 10)
+	      BUILTIN_VALL (UNOP, reduc_splus_, 0)
 		new_stmt = gimple_build_assign_with_ops (
 						REDUC_PLUS_EXPR,
 						gimple_call_lhs (stmt),
 						args[0],
 						NULL_TREE);
 		break;
-	      BUILTIN_VDQIF (UNOP, reduc_smax_, 10)
+	      BUILTIN_VDQIF (UNOP, reduc_smax_, 0)
 		new_stmt = gimple_build_assign_with_ops (
 						REDUC_MAX_EXPR,
 						gimple_call_lhs (stmt),
 						args[0],
 						NULL_TREE);
 		break;
-	      BUILTIN_VDQIF (UNOP, reduc_smin_, 10)
+	      BUILTIN_VDQIF (UNOP, reduc_smin_, 0)
 		new_stmt = gimple_build_assign_with_ops (
 						REDUC_MIN_EXPR,
 						gimple_call_lhs (stmt),
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 268432cc117b7027ee9472fc5a4f9b1ea13bea0f..ef363c676bf5bb05cfdaf0f111324d42c3f3d992 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -251,13 +251,19 @@ 
   BUILTIN_VSDQ_I_DI (BINOP, cmgtu, 0)
   BUILTIN_VSDQ_I_DI (BINOP, cmtst, 0)
 
+  /* Implemented by aarch64_reduc_splus_<mode>.  */
+  BUILTIN_VALL (UNOP, reduc_splus_, 0)
+
   /* Implemented by reduc_<sur>plus_<mode>.  */
-  BUILTIN_VALL (UNOP, reduc_splus_, 10)
   BUILTIN_VDQ (UNOP, reduc_uplus_, 10)
 
+  /* Implemented by aarch64_reduc_smax_<mode>.  */
+  BUILTIN_VDQIF (UNOP, reduc_smax_, 0)
+
+  /* Implemented by aarch64_reduc_smin_<mode>.  */
+  BUILTIN_VDQIF (UNOP, reduc_smin_, 0)
+
   /* Implemented by reduc_<maxmin_uns>_<mode>.  */
-  BUILTIN_VDQIF (UNOP, reduc_smax_, 10)
-  BUILTIN_VDQIF (UNOP, reduc_smin_, 10)
   BUILTIN_VDQ_BHSI (UNOP, reduc_umax_, 10)
   BUILTIN_VDQ_BHSI (UNOP, reduc_umin_, 10)
   BUILTIN_VDQF (UNOP, reduc_smax_nan_, 10)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6300b9b6c7ac06384d2e59bbac1a0d5445975bb6..4ade92c7f47bf71dc993d25621ec839ea867e3d5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1719,6 +1719,19 @@ 
 
 ;; 'across lanes' add.
 
+;; Template for outputting a scalar, so we can create __builtins which can be
+;; gimple_fold'd to the REDUC_PLUS_EXPR tree code.
+(define_expand "aarch64_reduc_splus_<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand")
+        (match_operand:VALL 1 "register_operand"))]
+  "TARGET_SIMD"
+  {
+    /* Must be handled by aarch64_gimple_fold_builtin.  */
+    gcc_unreachable ();
+    FAIL;
+  }
+)
+
 (define_insn "reduc_<sur>plus_<mode>"
  [(set (match_operand:VDQV 0 "register_operand" "=w")
        (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
@@ -1776,6 +1789,31 @@ 
 
 ;; 'across lanes' max and min ops.
 
+;; Template for outputting a scalar, so we can create __builtins which can be
+;; gimple_fold'd to the REDUC_MAX_EXPR tree code.  The V2DI isn't used.
+(define_expand "aarch64_reduc_smax_<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand")
+        (match_operand:VALL 1 "register_operand"))]
+  "TARGET_SIMD"
+  {
+    /* Must be handled in aarch64_gimple_fold_builtin.  */
+    gcc_unreachable ();
+    FAIL;
+  }
+)
+
+;; Likewise for REDUC_MIN_EXPR tree code.
+(define_expand "aarch64_reduc_smin_<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand")
+        (match_operand:VALL 1 "register_operand"))]
+  "TARGET_SIMD"
+  {
+    /* Must be handled in aarch64_gimple_fold_builtin.  */
+    gcc_unreachable ();
+    FAIL;
+  }
+)
+
 (define_insn "reduc_<maxmin_uns>_<mode>"
  [(set (match_operand:VDQV_S 0 "register_operand" "=w")
        (unspec:VDQV_S [(match_operand:VDQV_S 1 "register_operand" "w")]
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 83ac5e96d422ceccadcb212ec792665b78c03fae..b4d7e892e8ea2e4df3dedce7980b771da4b922e2 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -13532,19 +13532,19 @@  vaddd_u64 (uint64_t __a, uint64_t __b)
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vaddv_s8 (int8x8_t __a)
 {
-  return vget_lane_s8 (__builtin_aarch64_reduc_splus_v8qi (__a), 0);
+  return __builtin_aarch64_reduc_splus_v8qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vaddv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_splus_v4hi (__a), 0);
+  return __builtin_aarch64_reduc_splus_v4hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vaddv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_splus_v2si (__a), 0);
+  return __builtin_aarch64_reduc_splus_v2si (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -13574,26 +13574,25 @@  vaddv_u32 (uint32x2_t __a)
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vaddvq_s8 (int8x16_t __a)
 {
-  return vgetq_lane_s8 (__builtin_aarch64_reduc_splus_v16qi (__a),
-			0);
+  return __builtin_aarch64_reduc_splus_v16qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vaddvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_splus_v8hi (__a), 0);
+  return __builtin_aarch64_reduc_splus_v8hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vaddvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_splus_v4si (__a), 0);
+  return __builtin_aarch64_reduc_splus_v4si (__a);
 }
 
 __extension__ static __inline int64_t __attribute__ ((__always_inline__))
 vaddvq_s64 (int64x2_t __a)
 {
-  return vgetq_lane_s64 (__builtin_aarch64_reduc_splus_v2di (__a), 0);
+  return __builtin_aarch64_reduc_splus_v2di (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -13631,22 +13630,19 @@  vaddvq_u64 (uint64x2_t __a)
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vaddv_f32 (float32x2_t __a)
 {
-  float32x2_t __t = __builtin_aarch64_reduc_splus_v2sf (__a);
-  return vget_lane_f32 (__t, 0);
+  return __builtin_aarch64_reduc_splus_v2sf (__a);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vaddvq_f32 (float32x4_t __a)
 {
-  float32x4_t __t = __builtin_aarch64_reduc_splus_v4sf (__a);
-  return vgetq_lane_f32 (__t, 0);
+  return __builtin_aarch64_reduc_splus_v4sf (__a);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vaddvq_f64 (float64x2_t __a)
 {
-  float64x2_t __t = __builtin_aarch64_reduc_splus_v2df (__a);
-  return vgetq_lane_f64 (__t, 0);
+  return __builtin_aarch64_reduc_splus_v2df (__a);
 }
 
 /* vbsl  */
@@ -18125,19 +18121,19 @@  vmaxv_f32 (float32x2_t __a)
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vmaxv_s8 (int8x8_t __a)
 {
-  return vget_lane_s8 (__builtin_aarch64_reduc_smax_v8qi (__a), 0);
+  return __builtin_aarch64_reduc_smax_v8qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vmaxv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_smax_v4hi (__a), 0);
+  return __builtin_aarch64_reduc_smax_v4hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vmaxv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_smax_v2si (__a), 0);
+  return __builtin_aarch64_reduc_smax_v2si (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -18181,19 +18177,19 @@  vmaxvq_f64 (float64x2_t __a)
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vmaxvq_s8 (int8x16_t __a)
 {
-  return vgetq_lane_s8 (__builtin_aarch64_reduc_smax_v16qi (__a), 0);
+  return __builtin_aarch64_reduc_smax_v16qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vmaxvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_smax_v8hi (__a), 0);
+  return __builtin_aarch64_reduc_smax_v8hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vmaxvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_smax_v4si (__a), 0);
+  return __builtin_aarch64_reduc_smax_v4si (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -18225,20 +18221,19 @@  vmaxvq_u32 (uint32x4_t __a)
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxnmv_f32 (float32x2_t __a)
 {
-  return vget_lane_f32 (__builtin_aarch64_reduc_smax_v2sf (__a),
-			0);
+  return __builtin_aarch64_reduc_smax_v2sf (__a);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxnmvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_v4sf (__a), 0);
+  return __builtin_aarch64_reduc_smax_v4sf (__a);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vmaxnmvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_v2df (__a), 0);
+  return __builtin_aarch64_reduc_smax_v2df (__a);
 }
 
 /* vmin  */
@@ -18371,20 +18366,19 @@  vminv_f32 (float32x2_t __a)
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vminv_s8 (int8x8_t __a)
 {
-  return vget_lane_s8 (__builtin_aarch64_reduc_smin_v8qi (__a),
-		       0);
+  return __builtin_aarch64_reduc_smin_v8qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vminv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_smin_v4hi (__a), 0);
+  return __builtin_aarch64_reduc_smin_v4hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vminv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_smin_v2si (__a), 0);
+  return __builtin_aarch64_reduc_smin_v2si (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -18428,19 +18422,19 @@  vminvq_f64 (float64x2_t __a)
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vminvq_s8 (int8x16_t __a)
 {
-  return vgetq_lane_s8 (__builtin_aarch64_reduc_smin_v16qi (__a), 0);
+  return __builtin_aarch64_reduc_smin_v16qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vminvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_smin_v8hi (__a), 0);
+  return __builtin_aarch64_reduc_smin_v8hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vminvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_smin_v4si (__a), 0);
+  return __builtin_aarch64_reduc_smin_v4si (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -18472,19 +18466,19 @@  vminvq_u32 (uint32x4_t __a)
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminnmv_f32 (float32x2_t __a)
 {
-  return vget_lane_f32 (__builtin_aarch64_reduc_smin_v2sf (__a), 0);
+  return __builtin_aarch64_reduc_smin_v2sf (__a);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminnmvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_v4sf (__a), 0);
+  return __builtin_aarch64_reduc_smin_v4sf (__a);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vminnmvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_v2df (__a), 0);
+  return __builtin_aarch64_reduc_smin_v2df (__a);
 }
 
 /* vmla */
diff --git a/gcc/expr.c b/gcc/expr.c
index 4d2163f721b092e35fad464d71797aebaf0cb6e3..dfec5b1bf12e9fb82f88d35cad109e511f10c8d2 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -9019,7 +9019,17 @@  expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
       {
         op0 = expand_normal (treeop0);
         this_optab = optab_for_tree_code (code, type, optab_default);
-        temp = expand_unop (mode, this_optab, op0, target, unsignedp);
+        enum machine_mode vec_mode = TYPE_MODE (TREE_TYPE (treeop0));
+        temp = expand_unop (vec_mode, this_optab, op0, NULL_RTX, unsignedp);
+        gcc_assert (temp);
+        /* The tree code produces a scalar result, but (somewhat by convention)
+           the optab produces a vector with the result in element 0 if
+           little-endian, or element N-1 if big-endian.  So pull the scalar
+           result out of that element.  */
+        int index = BYTES_BIG_ENDIAN ? GET_MODE_NUNITS (vec_mode) - 1 : 0;
+        int bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode));
+        temp = extract_bit_field (temp, bitsize, bitsize * index, unsignedp,
+				  target, mode, mode);
         gcc_assert (temp);
         return temp;
       }
diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index d22eac15962a7abfb605bb79b6f9b7809228dab3..3597750a4998ed1a714ef05f9484495c50baa029 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -8439,12 +8439,13 @@  fold_unary_loc (location_t loc, enum tree_code code, tree type, tree op0)
     case REDUC_MAX_EXPR:
     case REDUC_PLUS_EXPR:
       {
-	unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i;
+	unsigned int nelts, i;
 	tree *elts;
 	enum tree_code subcode;
 
 	if (TREE_CODE (op0) != VECTOR_CST)
 	  return NULL_TREE;
+        nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0));
 
 	elts = XALLOCAVEC (tree, nelts);
 	if (!vec_cst_ctor_to_array (op0, elts))
@@ -8463,10 +8464,9 @@  fold_unary_loc (location_t loc, enum tree_code code, tree type, tree op0)
 	    elts[0] = const_binop (subcode, elts[0], elts[i]);
 	    if (elts[0] == NULL_TREE || !CONSTANT_CLASS_P (elts[0]))
 	      return NULL_TREE;
-	    elts[i] = build_zero_cst (TREE_TYPE (type));
 	  }
 
-	return build_vector (type, elts);
+	return elts[0];
       }
 
     default:
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index abf09d5304d002641634ea45e68c7c8939825a1f..68b57637add0c4e1b610cdb5182e95d849541869 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -3531,12 +3531,21 @@  verify_gimple_assign_unary (gimple stmt)
 
         return false;
       }
-
-    case VEC_UNPACK_HI_EXPR:
-    case VEC_UNPACK_LO_EXPR:
     case REDUC_MAX_EXPR:
     case REDUC_MIN_EXPR:
     case REDUC_PLUS_EXPR:
+      if (!VECTOR_TYPE_P (rhs1_type)
+	  || !useless_type_conversion_p (lhs_type, TREE_TYPE (rhs1_type)))
+        {
+	  error ("reduction should convert from vector to element type");
+	  debug_generic_expr (lhs_type);
+	  debug_generic_expr (rhs1_type);
+	  return true;
+	}
+      return false;
+
+    case VEC_UNPACK_HI_EXPR:
+    case VEC_UNPACK_LO_EXPR:
     case VEC_UNPACK_FLOAT_HI_EXPR:
     case VEC_UNPACK_FLOAT_LO_EXPR:
       /* FIXME.  */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 7e013f3b549a07bd44789bd4d3e3701eec7c51dc..35a8fde5b8f77393765a57f2833d799e529d0d8d 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1892,9 +1892,9 @@  vect_analyze_loop (struct loop *loop)
 
    Output:
    REDUC_CODE - the corresponding tree-code to be used to reduce the
-      vector of partial results into a single scalar result (which
-      will also reside in a vector) or ERROR_MARK if the operation is
-      a supported reduction operation, but does not have such tree-code.
+      vector of partial results into a single scalar result, or ERROR_MARK
+      if the operation is a supported reduction operation, but does not have
+      such tree-code.
 
    Return FALSE if CODE currently cannot be vectorized as reduction.  */
 
@@ -4175,14 +4175,12 @@  vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
         dump_printf_loc (MSG_NOTE, vect_location,
 			 "Reduce using direct vector reduction.\n");
 
-      vec_dest = vect_create_destination_var (scalar_dest, vectype);
-      tmp = build1 (reduc_code, vectype, new_phi_result);
-      epilog_stmt = gimple_build_assign (vec_dest, tmp);
-      new_temp = make_ssa_name (vec_dest, epilog_stmt);
+      tmp = build1 (reduc_code, scalar_type, new_phi_result);
+      epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
+      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
       gimple_assign_set_lhs (epilog_stmt, new_temp);
       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-
-      extract_scalar_result = true;
+      scalar_results.safe_push (new_temp);
     }
   else
     {
diff --git a/gcc/tree.def b/gcc/tree.def
index 84ffe93aa6fdc827f18ca81225bca007d50b50f6..e9af52e554babb100d49ea14f47c805cd5024949 100644
--- a/gcc/tree.def
+++ b/gcc/tree.def
@@ -1157,10 +1157,9 @@  DEFTREECODE (TRANSACTION_EXPR, "transaction_expr", tcc_expression, 1)
    result (e.g. summing the elements of the vector, finding the minimum over
    the vector elements, etc).
    Operand 0 is a vector.
-   The expression returns a vector of the same type, with the first
-   element in the vector holding the result of the reduction of all elements
-   of the operand.  The content of the other elements in the returned vector
-   is undefined.  */
+   The expression returns a scalar, with type the same as the elements of the
+   vector, holding the result of the reduction of all elements of the operand.
+   */
 DEFTREECODE (REDUC_MAX_EXPR, "reduc_max_expr", tcc_unary, 1)
 DEFTREECODE (REDUC_MIN_EXPR, "reduc_min_expr", tcc_unary, 1)
 DEFTREECODE (REDUC_PLUS_EXPR, "reduc_plus_expr", tcc_unary, 1)