s390x: Implement vector cost model

Message ID	20240320105356.16494-1-jchrist@linux.ibm.com
State	New
Headers	show Return-Path: <gcc-patches-bounces+incoming=patchwork.ozlabs.org@gcc.gnu.org> DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 3B5503858C35 From: Juergen Christ <jchrist@linux.ibm.com> To: gcc-patches@gcc.gnu.org Cc: krebbel@linux.ibm.com Subject: [PATCH] s390x: Implement vector cost model Date: Wed, 20 Mar 2024 11:53:56 +0100 Message-Id: <20240320105356.16494-1-jchrist@linux.ibm.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: list Errors-To: gcc-patches-bounces+incoming=patchwork.ozlabs.org@gcc.gnu.org
Series	s390x: Implement vector cost model \| expand s390x: Implement vector cost model

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 372a23244032..b9dab1cf8a85 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -88,6 +88,7 @@ along with GCC; see the file COPYING3. If not see
 #include "ipa-prop.h"
 #include "ipa-fnsummary.h"
 #include "sched-int.h"
+#include "tree-vectorizer.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -4199,6 +4200,143 @@ s390_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
     }
 }
 
+/* s390-specific vector costs.  */
+class s390_vector_costs : public vector_costs
+{
+  /* Statement whose vec_to_scalar or scalar_load cost of an elementwise
+     access was seen last.  The scalar_store or vec_construct that
+     follows for the same statement is costed as 0.  */
+  stmt_vec_info skipfinalpart;
+
+public:
+  s390_vector_costs (vec_info *, bool);
+
+  unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
+			      stmt_vec_info stmt_info, slp_tree node,
+			      tree vectype, int misalign,
+			      vect_cost_model_location where) override;
+};
+
+s390_vector_costs::s390_vector_costs (vec_info *vinfo, bool costing_for_scalar)
+  : vector_costs (vinfo, costing_for_scalar), skipfinalpart (nullptr)
+{
+}
+
+/* Implement vector_costs::add_stmt_cost.  Take the cost of KIND from
+   s390_builtin_vectorization_cost, add penalties for operations that
+   have to move data between register files, drop cost components that
+   s390 folds into a single load/store instruction, and record the
+   result multiplied by COUNT.  */
+unsigned int
+s390_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
+				  stmt_vec_info stmt_info, slp_tree node,
+				  tree vectype, int misalign,
+				  vect_cost_model_location where)
+{
+  bool fp = false;
+  int costs = s390_builtin_vectorization_cost (kind, vectype, misalign);
+
+  if (vectype != NULL)
+    fp = FLOAT_TYPE_P (vectype);
+
+  if ((kind == scalar_to_vec || kind == vec_construct)
+      && node
+      && SLP_TREE_DEF_TYPE (node) == vect_external_def)
+    {
+      unsigned int i;
+      tree op;
+      /* Use TREE_VISITED so that each distinct SSA operand is penalized
+	 only once; clear the flag before and after.  */
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
+	if (TREE_CODE (op) == SSA_NAME)
+	  TREE_VISITED (op) = 0;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
+	{
+	  if (TREE_CODE (op) != SSA_NAME
+	      || TREE_VISITED (op))
+	    continue;
+	  TREE_VISITED (op) = 1;
+	  gimple *def = SSA_NAME_DEF_STMT (op);
+	  tree temp;
+	  /* Look through a no-op conversion to the underlying definition.  */
+	  if (is_gimple_assign (def)
+	      && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def))
+	      && (temp = gimple_assign_rhs1 (def))
+	      && TREE_CODE (temp) == SSA_NAME
+	      && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (def)),
+					TREE_TYPE (temp)))
+	    def = SSA_NAME_DEF_STMT (temp);
+	  if (!gimple_assign_load_p (def))
+	    {
+	      /* For scalar_to_vec from a fp register, we might not
+		 cross the register files.  So keep the penalty small.
+		 ??? If we have to cross, we actually cross twice
+		 leading to a huge runtime penalty.  Should we reflect
+		 this here?  */
+	      if (kind == scalar_to_vec && fp)
+		costs += 2;
+	      else
+		costs += 3;
+	    }
+	}
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
+	if (TREE_CODE (op) == SSA_NAME)
+	  TREE_VISITED (op) = 0;
+    }
+  if (kind == scalar_stmt && stmt_info && is_gimple_assign (stmt_info->stmt))
+    {
+      const gassign *assign = dyn_cast <const gassign *> (stmt_info->stmt);
+      tree comptype = NULL_TREE;
+      if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR)
+	comptype = TREE_TYPE (gimple_assign_rhs1 (assign));
+      if (gimple_assign_rhs_code (assign) == BIT_FIELD_REF)
+	comptype = TREE_TYPE (TREE_OPERAND (gimple_assign_rhs1 (assign), 0));
+      if (comptype != NULL_TREE && VECTOR_TYPE_P (comptype))
+	{
+	  /* This will be a vlvg or vlgv that crosses the register files.  */
+	  costs += 3;
+	}
+    }
+  if (stmt_info
+      && (STMT_VINFO_TYPE (stmt_info) == store_vec_info_type
+	  || STMT_VINFO_TYPE (stmt_info) == load_vec_info_type))
+    {
+      if (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE)
+	{
+	  /* gimple represents elementwise unloading as two steps
+	     (vec_to_scalar followed by scalar_store).  s390 stores
+	     lanes to memory in one operation.  Similarly, elementwise
+	     loading is represented as scalar_load for each lane
+	     followed by a vec_construct.  s390 loads directly in the
+	     appropriate lanes.  The second operation does not
+	     exist.  */
+	  if (kind == vec_to_scalar || kind == scalar_load)
+	    skipfinalpart = stmt_info;
+	  if ((kind == scalar_store || kind == vec_construct)
+	      && skipfinalpart == stmt_info)
+	    return 0;
+	}
+      else if (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)
+	       == VMAT_CONTIGUOUS_REVERSE)
+	{
+	  /* gimple represents reversal via a vec_perm followed by the
+	     load/store.  s390 has vector load/store reversed
+	     instructions.  The permute operation does not exist.  */
+	  if (kind == vec_perm)
+	    return 0;
+	}
+    }
+  costs *= count;
+  return record_stmt_cost (stmt_info, where, (unsigned int) costs);
+}
+
+/* Implement targetm.vectorize.create_costs.  */
+static vector_costs *
+s390_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
+{
+  return new s390_vector_costs (vinfo, costing_for_scalar);
+}
+
 /* If OP is a SYMBOL_REF of a thread-local symbol, return its TLS mode,
    otherwise return 0.  */
 
@@ -18088,6 +18226,8 @@ s390_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
   s390_builtin_vectorization_cost
+#undef TARGET_VECTORIZE_CREATE_COSTS
+#define TARGET_VECTORIZE_CREATE_COSTS s390_vectorize_create_costs
 
 #undef TARGET_MACHINE_DEPENDENT_REORG
 #define TARGET_MACHINE_DEPENDENT_REORG s390_reorg
diff --git a/gcc/testsuite/gcc.target/s390/vector/loop-1.c b/gcc/testsuite/gcc.target/s390/vector/loop-1.c
new file mode 100644
index 000000000000..4a75fe2c2c0e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/loop-1.c
@@ -0,0 +1,82 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -march=z13" } */
+
+#define N 32
+
+void contiguous
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    out[i] = in[i] * m;
+}
+
+void contiguous_permute__load
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    out[i] = in[2 * i] * m;
+}
+
+void contiguous_permute__store
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    out[2 * i] = in[i] * m;
+}
+
+void elementwise
+(int *restrict out, int *restrict in, int m, int s)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    out[i] = in[s * i] * m;
+}
+
+void contiguous_reverse
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = N - 1; i >= 0; --i)
+    out[i] = in[i] * m;
+}
+
+#if 0
+/* This does not work currently.
+   => "not falling back to elementwise accesses" */
+void contiguous_permute__load_reversed
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = N - 1; i >= 0; --i)
+    out[i] = in[2 * i] * m;
+}
+#endif
+
+void contiguous_permute__store_reversed
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = N - 1; i >= 0; --i)
+    out[2 * i] = in[i] * m;
+}
+
+void elementwise__reversed
+(int *restrict out, int *restrict in, int m, int s)
+{
+  int i;
+
+  for (i = N - 1; i >= 0; --i)
+    out[i] = in[s * i] * m;
+}
+
+/* { dg-final { scan-tree-dump-not "couldn't vectorize loop" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/s390/vector/slp-1.c b/gcc/testsuite/gcc.target/s390/vector/slp-1.c
new file mode 100644
index 000000000000..5ee93b649cc7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/slp-1.c
@@ -0,0 +1,68 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-slp-all -march=z15" } */
+
+void vrep
+(int *x)
+{
+  x[0] = 42;
+  x[1] = 42;
+  x[2] = 42;
+  x[3] = 42;
+}
+
+void vgbm
+(int *x)
+{
+  x[0] = 0xff00;
+  x[1] = 0xff00;
+  x[2] = 0xff00;
+  x[3] = 0xff00;
+}
+
+void vgm
+(int *x)
+{
+  x[0] = 0x7e;
+  x[1] = 0x7e;
+  x[2] = 0x7e;
+  x[3] = 0x7e;
+}
+
+void vl
+(int *x)
+{
+  x[0] = 42;
+  x[1] = 0xff00;
+  x[2] = 0x7e;
+  x[3] = 0;
+}
+
+void vl_vst
+(int *restrict o, int *restrict i)
+{
+  o[0] = i[0];
+  o[1] = i[1];
+  o[2] = i[2];
+  o[3] = i[3];
+}
+
+void vlrepf
+(int *restrict o, int *restrict i)
+{
+  o[0] = i[0];
+  o[1] = i[0];
+  o[2] = i[0];
+  o[3] = i[0];
+}
+
+// Needs z15
+void vcefb
+(float *restrict o, int *restrict i)
+{
+  o[0] = i[0];
+  o[1] = i[1];
+  o[2] = i[2];
+  o[3] = i[3];
+}
+
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 7 "slp2" } } */
diff --git a/gcc/testsuite/gcc.target/s390/vector/slp-2.c b/gcc/testsuite/gcc.target/s390/vector/slp-2.c
new file mode 100644
index 000000000000..b0dc44319922
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/slp-2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-slp-all -march=z13" } */
+
+void elementwise
+(int *o, int i0, int i1, int i2, unsigned int i3)
+{
+  o[0] = i0;
+  o[1] = i1;
+  o[2] = i2;
+  o[3] = i3;
+}
+
+void elementreplicate
+(int *o, int i)
+{
+  o[0] = i;
+  o[1] = i;
+  o[2] = i;
+  o[3] = i;
+}
+
+void mult
+(int *o, int i0, int i1, int i2, int i3, int m)
+{
+  o[0] = i0 * m;
+  o[1] = i1 * m;
+  o[2] = i2 * m;
+  o[3] = i3 * m;
+}
+
+/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 3 "slp2" } } */

s390x: Implement vector cost model

Commit Message

Patch