diff mbox series

s390x: Implement vector cost model

Message ID 20240320105356.16494-1-jchrist@linux.ibm.com
State New
Headers show
Series s390x: Implement vector cost model | expand

Commit Message

Juergen Christ March 20, 2024, 10:53 a.m. UTC
Hi,

s390x used the basic cost model, which does not correctly model the cost of
crossing register files or the availability of certain instructions that
simplify reversed operations.  Implement a dedicated cost model to better
control when to vectorize.

gcc/ChangeLog:

	* config/s390/s390.cc (class s390_vector_costs): New class.
	(s390_vector_costs::s390_vector_costs): New constructor.
	(s390_vector_costs::add_stmt_cost): New member function.
	(s390_vectorize_create_costs): New function.
	(TARGET_VECTORIZE_CREATE_COSTS): Define.

gcc/testsuite/ChangeLog:

	* gcc.target/s390/vector/loop-1.c: New test.
	* gcc.target/s390/vector/slp-1.c: New test.
	* gcc.target/s390/vector/slp-2.c: New test.

Signed-off-by: Juergen Christ <jchrist@linux.ibm.com>

Bootstrapped and tested on s390x.  Ok for master?

---
 gcc/config/s390/s390.cc                       | 127 ++++++++++++++++++
 gcc/testsuite/gcc.target/s390/vector/loop-1.c |  82 +++++++++++
 gcc/testsuite/gcc.target/s390/vector/slp-1.c  |  68 ++++++++++
 gcc/testsuite/gcc.target/s390/vector/slp-2.c  |  31 +++++
 4 files changed, 308 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/loop-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/slp-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/slp-2.c
diff mbox series

Patch

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 372a23244032..b9dab1cf8a85 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -88,6 +88,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "ipa-prop.h"
 #include "ipa-fnsummary.h"
 #include "sched-int.h"
+#include "tree-vectorizer.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -4199,6 +4200,130 @@  s390_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
     }
 }
 
+/* s390-specific vector costs.  Adjusts the generic per-statement costs
+   from s390_builtin_vectorization_cost: penalizes operations that cross
+   register files and drops costs for steps that s390 folds into a single
+   load/store instruction.  */
+class s390_vector_costs : public vector_costs
+{
+  /* Statement for which the first part of an elementwise load/store has
+     already been costed; the matching second part is then not costed.
+     Must start out null because add_stmt_cost compares against it
+     before it is ever assigned.  */
+  stmt_vec_info skipfinalpart = nullptr;
+
+public:
+  s390_vector_costs (vec_info *, bool);
+
+  /* Record the s390-adjusted cost of COUNT copies of a statement of
+     kind KIND.  */
+  unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
+			      stmt_vec_info stmt_info, slp_tree node,
+			      tree vectype, int misalign,
+			      vect_cost_model_location where) override;
+};
+
+/* Construct the s390 cost object; VINFO and COSTING_FOR_SCALAR are
+   forwarded unchanged to the generic vector_costs base class.  */
+s390_vector_costs::s390_vector_costs (vec_info *vinfo,
+				      bool costing_for_scalar)
+  : vector_costs (vinfo, costing_for_scalar)
+{
+}
+
+/* Implement vector_costs::add_stmt_cost for s390.
+
+   Start from the generic cost of KIND as returned by
+   s390_builtin_vectorization_cost and adjust it:
+   - building a vector from external SLP operands adds a penalty for
+     every distinct SSA operand that is not defined by a load;
+   - a scalar BIT_INSERT_EXPR or BIT_FIELD_REF on a vector adds a
+     penalty for crossing the register files;
+   - the second part of an elementwise load/store and the permute of a
+     reversed contiguous access are not costed at all.
+
+   The adjusted cost is multiplied by COUNT and handed to
+   record_stmt_cost, whose result is returned.  For the parts that are
+   not costed, 0 is returned and nothing is recorded.  */
+unsigned int
+s390_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
+				  stmt_vec_info stmt_info, slp_tree node,
+				  tree vectype, int misalign,
+				  vect_cost_model_location where)
+{
+  bool fp = false;
+  int costs = s390_builtin_vectorization_cost (kind, vectype, misalign);
+
+  if (vectype != NULL)
+    fp = FLOAT_TYPE_P (vectype);
+
+  if ((kind == scalar_to_vec || kind == vec_construct)
+      && node
+      && SLP_TREE_DEF_TYPE (node) == vect_external_def)
+    {
+      unsigned int i;
+      tree op;
+      /* TREE_VISITED marks operands already accounted for so that each
+	 distinct SSA name is penalized only once.  The flag is cleared
+	 before and after use.  */
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
+	if (TREE_CODE (op) == SSA_NAME)
+	  TREE_VISITED (op) = 0;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
+	{
+	  if (TREE_CODE (op) != SSA_NAME
+	      || TREE_VISITED (op))
+	    continue;
+	  TREE_VISITED (op) = 1;
+	  gimple *def = SSA_NAME_DEF_STMT (op);
+	  tree temp;
+	  /* Look through a single no-op conversion to find the statement
+	     that really produces the value.  */
+	  if (is_gimple_assign (def)
+	      && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def))
+	      && (temp = gimple_assign_rhs1 (def))
+	      && TREE_CODE (temp) == SSA_NAME
+	      && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (def)),
+					TREE_TYPE (temp)))
+	    def = SSA_NAME_DEF_STMT (temp);
+	  if (!gimple_assign_load_p (def))
+	    {
+	      /* For scalar_to_vec from a fp register, we might not
+		 cross the register files.  So keep the penalty small.
+		 ??? If we have to cross, we actually cross twice
+		 leading to a huge runtime penalty.  Should we reflect
+		 this here?  */
+	      if (kind == scalar_to_vec && fp)
+		costs += 2;
+	      else
+		costs += 3;
+	    }
+	}
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
+	if (TREE_CODE (op) == SSA_NAME)
+	  TREE_VISITED (op) = 0;
+    }
+  if (kind == scalar_stmt && stmt_info && is_gimple_assign (stmt_info->stmt))
+    {
+      const gassign *assign = dyn_cast<const gassign*> (stmt_info->stmt);
+      tree comptype = NULL_TREE;
+      /* Determine the type of the object a lane is inserted into or
+	 extracted from.  */
+      if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR)
+	comptype = TREE_TYPE (gimple_assign_rhs1 (assign));
+      else if (gimple_assign_rhs_code (assign) == BIT_FIELD_REF)
+	comptype = TREE_TYPE (TREE_OPERAND (gimple_assign_rhs1 (assign), 0));
+      if (comptype != NULL_TREE && VECTOR_TYPE_P (comptype))
+	{
+	  /* This will be a vlvg or vlgv that crosses the register files.  */
+	  costs += 3;
+	}
+    }
+  if (stmt_info
+      && (STMT_VINFO_TYPE (stmt_info) == store_vec_info_type
+	  || STMT_VINFO_TYPE (stmt_info) == load_vec_info_type))
+    {
+      if (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE)
+	{
+	  /* gimple represents elementwise unloading as two steps
+	     (vec_to_scalar followed by scalar_store).  s390 stores
+	     lanes to memory in one operation.  Similarly, elementwise
+	     loading is represented as scalar_load for each lane
+	     followed by a vec_construct.  s390 loads directly in the
+	     appropriate lanes.  The second operation does not
+	     exist.
+
+	     Remember the statement when its first part (vec_to_scalar
+	     for a store, scalar_load for a load) is costed, and cost
+	     the matching second part as zero.  */
+	  if (kind == vec_to_scalar || kind == scalar_load)
+	    skipfinalpart = stmt_info;
+	  if ((kind == scalar_store || kind == vec_construct)
+	      && skipfinalpart == stmt_info)
+	    return 0;
+	}
+      else if (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_CONTIGUOUS_REVERSE)
+	{
+	  /* gimple represents reversal via a vec_perm followed by the
+	     load/store.  s390 has vector load/store reversed
+	     instructions.  The permute operation does not exist.  */
+	  if (kind == vec_perm)
+	    return 0;
+	}
+    }
+  costs *= count;
+  return record_stmt_cost (stmt_info, where, (unsigned int) costs);
+}
+
+/* Implement targetm.vectorize.create_costs: return a newly allocated
+   s390_vector_costs object built from VINFO and COSTING_FOR_SCALAR.  */
+static vector_costs *
+s390_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
+{
+  vector_costs *costs = new s390_vector_costs (vinfo, costing_for_scalar);
+  return costs;
+}
+
 /* If OP is a SYMBOL_REF of a thread-local symbol, return its TLS mode,
    otherwise return 0.  */
 
@@ -18088,6 +18213,8 @@  s390_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
   s390_builtin_vectorization_cost
+#undef TARGET_VECTORIZE_CREATE_COSTS
+#define TARGET_VECTORIZE_CREATE_COSTS s390_vectorize_create_costs
 
 #undef TARGET_MACHINE_DEPENDENT_REORG
 #define TARGET_MACHINE_DEPENDENT_REORG s390_reorg
diff --git a/gcc/testsuite/gcc.target/s390/vector/loop-1.c b/gcc/testsuite/gcc.target/s390/vector/loop-1.c
new file mode 100644
index 000000000000..4a75fe2c2c0e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/loop-1.c
@@ -0,0 +1,82 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all -march=z13" } */
+
+#define N 32
+/* Forward unit-stride loop: out[i] = in[i] * m for i = 0 .. N-1.  */
+void contiguous
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    out[i] = in[i] * m;
+}
+/* Forward loop with a stride-2 load (in[2 * i]) and a unit-stride store.  */
+void contiguous_permute__load
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    out[i] = in[2 * i] * m;
+}
+/* Forward loop with a unit-stride load and a stride-2 store (out[2 * i]).  */
+void contiguous_permute__store
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    out[2 * i] = in[i] * m;
+}
+/* Forward loop whose load stride S is only known at run time (in[s * i]);
+   the store is unit-stride.  */
+void elementwise
+(int *restrict out, int *restrict in, int m, int s)
+{
+  int i;
+
+  for (i = 0; i < N; ++i)
+    out[i] = in[s * i] * m;
+}
+/* Same computation as a unit-stride loop, but iterating downwards from
+   N-1 to 0.  */
+void contiguous_reverse
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = N - 1; i >= 0; --i)
+    out[i] = in[i] * m;
+}
+
+#if 0
+/* Disabled: downward loop with a stride-2 load.  This currently does not
+   work; the reported reason is
+   "not falling back to elementwise accesses".  */
+void contiguous_permute__load_reversed
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = N - 1; i >= 0; --i)
+    out[i] = in[2 * i] * m;
+}
+#endif
+/* Downward loop (N-1 to 0) with a unit-stride load and a stride-2 store.  */
+void contiguous_permute__store_reversed
+(int *restrict out, int *restrict in, int m)
+{
+  int i;
+
+  for (i = N - 1; i >= 0; --i)
+    out[2 * i] = in[i] * m;
+}
+/* Downward loop (N-1 to 0) whose load stride S is only known at run
+   time; the store is unit-stride.  */
+void elementwise__reversed
+(int *restrict out, int *restrict in, int m, int s)
+{
+  int i;
+
+  for (i = N - 1; i >= 0; --i)
+    out[i] = in[s * i] * m;
+}
+
+/* { dg-final { scan-tree-dump-not "couldn't vectorize loop" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/s390/vector/slp-1.c b/gcc/testsuite/gcc.target/s390/vector/slp-1.c
new file mode 100644
index 000000000000..5ee93b649cc7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/slp-1.c
@@ -0,0 +1,68 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-slp-all -march=z15" } */
+/* Store the same small constant (42) to four consecutive ints.
+   NOTE(review): presumably named after the instruction expected in the
+   generated code -- confirm.  */
+void vrep
+(int *x)
+{
+  x[0] = 42;
+  x[1] = 42;
+  x[2] = 42;
+  x[3] = 42;
+}
+/* Store the same byte-mask style constant (0xff00) to four consecutive
+   ints.  */
+void vgbm
+(int *x)
+{
+  x[0] = 0xff00;
+  x[1] = 0xff00;
+  x[2] = 0xff00;
+  x[3] = 0xff00;
+}
+/* Store the same constant 0x7e (a contiguous run of set bits) to four
+   consecutive ints.  */
+void vgm
+(int *x)
+{
+  x[0] = 0x7e;
+  x[1] = 0x7e;
+  x[2] = 0x7e;
+  x[3] = 0x7e;
+}
+/* Store four different constants to four consecutive ints.  */
+void vl
+(int *x)
+{
+  x[0] = 42;
+  x[1] = 0xff00;
+  x[2] = 0x7e;
+  x[3] = 0;
+}
+/* Copy four consecutive ints from I to O, element by element.  */
+void vl_vst
+(int *restrict o, int *restrict i)
+{
+  o[0] = i[0];
+  o[1] = i[1];
+  o[2] = i[2];
+  o[3] = i[3];
+}
+/* Store the single loaded value i[0] to four consecutive ints of O.  */
+void vlrepf
+(int *restrict o, int *restrict i)
+{
+  o[0] = i[0];
+  o[1] = i[0];
+  o[2] = i[0];
+  o[3] = i[0];
+}
+
+/* Needs z15.  Assign four consecutive ints from I to four consecutive
+   floats of O, i.e. each element undergoes an int-to-float conversion.  */
+void vcefb
+(float *restrict o, int *restrict i)
+{
+  o[0] = i[0];
+  o[1] = i[1];
+  o[2] = i[2];
+  o[3] = i[3];
+}
+
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 7 "slp2" } } */
diff --git a/gcc/testsuite/gcc.target/s390/vector/slp-2.c b/gcc/testsuite/gcc.target/s390/vector/slp-2.c
new file mode 100644
index 000000000000..b0dc44319922
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/slp-2.c
@@ -0,0 +1,31 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-slp-all -march=z13" } */
+/* Store four distinct scalar arguments to four consecutive ints.  I3 is
+   unsigned, so its store involves an unsigned-to-int conversion.  */
+void elementwise
+(int *o, int i0, int i1, int i2, unsigned int i3)
+{
+  o[0] = i0;
+  o[1] = i1;
+  o[2] = i2;
+  o[3] = i3;
+}
+/* Store the same scalar argument I to four consecutive ints.  */
+void elementreplicate
+(int *o, int i)
+{
+  o[0] = i;
+  o[1] = i;
+  o[2] = i;
+  o[3] = i;
+}
+/* Multiply four distinct scalar arguments by the scalar M and store the
+   products to four consecutive ints.  */
+void mult
+(int *o, int i0, int i1, int i2, int i3, int m)
+{
+  o[0] = i0 * m;
+  o[1] = i1 * m;
+  o[2] = i2 * m;
+  o[3] = i3 * m;
+}
+
+/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 3 "slp2" } } */