diff mbox series

[v2] s390x: Optimize vector permute with constant indexes

Message ID 20240409143140.22429-1-jchrist@linux.ibm.com
State New
Headers show
Series [v2] s390x: Optimize vector permute with constant indexes | expand

Commit Message

Juergen Christ April 9, 2024, 2:31 p.m. UTC
The loop vectorizer can generate vector permutes with constant indexes
where all indexes are equal.  Optimize this case to use vector
replicate instead of vector permute.

gcc/ChangeLog:

	* config/s390/s390.cc (expand_perm_as_replicate): Implement.
	(vectorize_vec_perm_const_1): Call new function.
	* config/s390/vx-builtins.md (vec_splat<mode>): Change to...
	(@vec_splat<mode>): ...this.

gcc/testsuite/ChangeLog:

	* gcc.target/s390/vector/vec-expand-replicate.c: New test.

Bootstrapped and regtested on s390x.  Ok for trunk?

Signed-off-by: Juergen Christ <jchrist@linux.ibm.com>
---
 gcc/config/s390/s390.cc                       | 33 ++++++++++
 gcc/config/s390/vx-builtins.md                |  2 +-
 .../s390/vector/vec-expand-replicate.c        | 60 +++++++++++++++++++
 3 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c

Comments

Andreas Krebbel April 9, 2024, 3:01 p.m. UTC | #1
On 4/9/24 16:31, Juergen Christ wrote:
> Loop vectorizer can generate vector permutes with constant indexes
> where all indexes are equal.  Optimize this case to use vector
> replicate instead of vector permute.
> 
> gcc/ChangeLog:
> 
> 	* config/s390/s390.cc (expand_perm_as_replicate): Implement.
> 	(vectorize_vec_perm_const_1): Call new function.
> 	* config/s390/vx-builtins.md (vec_splat<mode>): Change to...
> 	(@vec_splat<mode>): ...this.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/s390/vector/vec-expand-replicate.c: New test.
> 
> Bootstrapped and regtested on s390x.  Ok for trunk?

Does this also work when using the vec_perm intrinsic or would we need to define a matching RTX for
that?

Ok. Thanks!

Andreas
Juergen Christ April 9, 2024, 4:07 p.m. UTC | #2
Am Tue, Apr 09, 2024 at 05:01:18PM +0200 schrieb Andreas Krebbel:
> On 4/9/24 16:31, Juergen Christ wrote:
> > Loop vectorizer can generate vector permutes with constant indexes
> > where all indexes are equal.  Optimize this case to use vector
> > replicate instead of vector permute.
> > 
> > gcc/ChangeLog:
> > 
> > 	* config/s390/s390.cc (expand_perm_as_replicate): Implement.
> > 	(vectorize_vec_perm_const_1): Call new function.
> > 	* config/s390/vx-builtins.md (vec_splat<mode>): Change to...
> > 	(@vec_splat<mode>): ...this.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > 	* gcc.target/s390/vector/vec-expand-replicate.c: New test.
> > 
> > Bootstrapped and regtested on s390x.  Ok for trunk?
> 
> Does this also work when using the vec_perm intrinsic or would we need to define a matching RTX for
> that?

Unfortunately, it does not work with vec_perm.

> Ok. Thanks!

Pushed.

Juergen
diff mbox series

Patch

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 372a23244032..3148f163627c 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -17923,6 +17923,36 @@  expand_perm_as_a_vlbr_vstbr_candidate (const struct expand_vec_perm_d &d)
   return false;
 }
 
+static bool  /* Try to expand permutation D as a vec_splat; true on success.  */
+expand_perm_as_replicate (const struct expand_vec_perm_d &d)
+{
+  unsigned char i;
+  unsigned char elem;
+  rtx base = d.op0;  /* Source vector; switched to d.op1 below if needed.  */
+  rtx insn;
+  /* Needed to silence maybe-uninitialized warning.  */
+  gcc_assert (d.nelt > 0);
+  elem = d.perm[0];  /* Index that every other entry must match.  */
+  for (i = 1; i < d.nelt; ++i)
+    if (d.perm[i] != elem)
+      return false;  /* Indexes differ, so this is not a replicate.  */
+  if (!d.testing_p)  /* Real expansion: generate and emit the insn.  */
+    {
+      if (elem >= d.nelt)  /* Index is past op0's elements: use op1, rebased.  */
+	{
+	  base = d.op1;
+	  elem -= d.nelt;
+	}
+      insn = maybe_gen_vec_splat (d.vmode, d.target, base, GEN_INT (elem));
+      if (insn == NULL_RTX)  /* No splat insn was generated for d.vmode.  */
+	return false;
+      emit_insn (insn);
+      return true;
+    }
+  else  /* Testing only: report pattern availability, emit nothing.  */
+    return maybe_code_for_vec_splat (d.vmode) != CODE_FOR_nothing;
+}
+
 /* Try to find the best sequence for the vector permute operation
    described by D.  Return true if the operation could be
    expanded.  */
@@ -17941,6 +17971,9 @@  vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d)
   if (expand_perm_as_a_vlbr_vstbr_candidate (d))
     return true;
 
+  if (expand_perm_as_replicate (d))
+    return true;
+
   return false;
 }
 
diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
index 432d81a719fc..93c0d408a43e 100644
--- a/gcc/config/s390/vx-builtins.md
+++ b/gcc/config/s390/vx-builtins.md
@@ -424,7 +424,7 @@ 
 
 
 ; Replicate from vector element
-(define_expand "vec_splat<mode>"
+(define_expand "@vec_splat<mode>"
   [(set (match_operand:V_HW                      0 "register_operand"  "")
 	(vec_duplicate:V_HW (vec_select:<non_vec>
 			     (match_operand:V_HW 1 "register_operand"  "")
diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c
new file mode 100644
index 000000000000..872b1c9321cd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c
@@ -0,0 +1,60 @@ 
+/* Check that the vectorize_vec_perm_const expander correctly deals with
+   replication.  Extracted from spec "nab".  */
+
+/* { dg-do compile } */
+/* { dg-options "-O3 -mzarch -march=z13 -fvect-cost-model=unlimited" } */
+
+typedef double POINT_T[3];  /* Array of three doubles.  */
+typedef double MATRIX_T[][4];  /* Rows of four doubles; row count left open.  */
+typedef struct {
+  POINT_T a_pos;
+} ATOM_T;
+typedef struct {
+  ATOM_T *r_atoms;  /* Indexed by add_he2o3transformmol_a below.  */
+} RESIDUE_T;
+typedef struct strand_t {
+  RESIDUE_T *s_residues;  /* Indexed by add_he2o3transformmol_r below.  */
+} STRAND_T;
+typedef struct strand_t MOLECULE_T;
+double xfm_xyz_oxyz4[4];  /* Global 4-element buffer filled by xfm_xyz.  */
+MOLECULE_T add_he2o3transformmol_mol, add_he2o3transformmol_sp;
+RESIDUE_T add_he2o3transformmol_res;
+int add_he2o3transformmol_r, add_he2o3transformmol_a, add_he2o3transformmol_i;  /* Global loop counters.  */
+ATOM_T *add_he2o3transformmol_ap;
+POINT_T add_he2o3transformmol_xyz, add_he2o3transformmol_nxyz;  /* Input and output buffers for xfm_xyz.  */
+static void xfm_xyz(POINT_T oxyz, MATRIX_T mat, POINT_T nxyz) {  /* nxyz = first 3 entries of (oxyz, 1.0) * mat.  */
+  int i, j;
+  double nxyz4[4];
+  for (i = 0; i < 3; i++)
+    xfm_xyz_oxyz4[i] = oxyz[i];  /* Copy the input into the global 4-element buffer.  */
+  xfm_xyz_oxyz4[3] = 1.0;  /* Fourth component is fixed at 1.0.  */
+  for (i = 0; i < 4; i++) {
+    nxyz4[i] = 0.0;
+    for (j = 0; j < 4; j++)
+      nxyz4[i] += xfm_xyz_oxyz4[j] * mat[j][i];  /* Accumulate the dot product with column i of mat.  */
+  }
+  for (i = 0; i < 3; i++)
+    nxyz[i] = nxyz4[i];  /* Keep only the first three results.  */
+}
+void add_he2o3transformmol(MATRIX_T mat, int n) {  /* Run xfm_xyz with mat over the first n atoms of a residue, in place.  */
+  for (add_he2o3transformmol_sp = add_he2o3transformmol_mol;;)  /* No exit condition or increment.  */
+    for (add_he2o3transformmol_r = 0;;) {  /* No exit condition or increment here either.  */
+      add_he2o3transformmol_res =
+          add_he2o3transformmol_sp.s_residues[add_he2o3transformmol_r];
+      for (add_he2o3transformmol_a = 0; add_he2o3transformmol_a < n; add_he2o3transformmol_a++) {
+        add_he2o3transformmol_ap =
+            &add_he2o3transformmol_res.r_atoms[add_he2o3transformmol_a];
+        for (add_he2o3transformmol_i = 0; add_he2o3transformmol_i < 3;
+             add_he2o3transformmol_i++)
+          add_he2o3transformmol_xyz[add_he2o3transformmol_i] =
+              add_he2o3transformmol_ap->a_pos[add_he2o3transformmol_i];  /* Load the atom position.  */
+        xfm_xyz(add_he2o3transformmol_xyz, mat, add_he2o3transformmol_nxyz);
+        for (add_he2o3transformmol_i = 0; add_he2o3transformmol_i < 3;
+             add_he2o3transformmol_i++)
+          add_he2o3transformmol_ap->a_pos[add_he2o3transformmol_i] =
+              add_he2o3transformmol_nxyz[add_he2o3transformmol_i];  /* Store the transformed position back.  */
+      }
+    }
+}
+
+/* { dg-final { scan-assembler-not "vperm" } } */