diff mbox series

[4/5] IBM Z: Implement TARGET_VECTORIZE_VEC_PERM_CONST for vector merge

Message ID 20210729073730.23208-5-krebbel@linux.ibm.com
State New
Headers show
Series IBM Z: Implement TARGET_VECTORIZE_VEC_PERM_CONST | expand

Commit Message

Andreas Krebbel July 29, 2021, 7:37 a.m. UTC
This patch implements the TARGET_VECTORIZE_VEC_PERM_CONST target hook
in the IBM Z backend. The initial implementation only exploits the
vector merge instructions, but there is more to come.

gcc/ChangeLog:

	* config/s390/s390.c (MAX_VECT_LEN): Define macro.
	(struct expand_vec_perm_d): Define struct.
	(expand_perm_with_merge): New function.
	(vectorize_vec_perm_const_1): New function.
	(s390_vectorize_vec_perm_const): New function.
	(TARGET_VECTORIZE_VEC_PERM_CONST): Define target macro.

gcc/testsuite/ChangeLog:

	* gcc.target/s390/vector/perm-merge.c: New test.
	* gcc.target/s390/vector/vec-types.h: New file.
---
 gcc/config/s390/s390.c                        | 108 ++++++++++++++++++
 .../gcc.target/s390/vector/perm-merge.c       | 104 +++++++++++++++++
 .../gcc.target/s390/vector/vec-types.h        |  35 ++++++
 3 files changed, 247 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/perm-merge.c
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-types.h
diff mbox series

Patch

diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index b1a9ca9d8aa..684241b00b8 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -16928,6 +16928,110 @@  s390_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
   return after_md_seq;
 }
 
+#define MAX_VECT_LEN	16	/* Capacity of the perm[] arrays.  */
+/* Description of one constant vector permutation request.  */
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;			/* TARGET, OP0, OP1 as passed to the hook.  */
+  unsigned char perm[MAX_VECT_LEN];	/* Selector; each entry is < 2 * nelt.  */
+  machine_mode vmode;			/* Vector mode of the permutation.  */
+  unsigned char nelt;			/* GET_MODE_NUNITS (vmode).  */
+  bool testing_p;			/* Set when target is NULL_RTX: only test.  */
+};
+
+/* Try to expand the vector permute operation described by D using the
+   vector merge high/low instructions (vmrh, vmrl).  Return true if D
+   matches a merge pattern; insns are emitted unless D.testing_p.  */
+static bool
+expand_perm_with_merge (const struct expand_vec_perm_d &d)
+{
+  bool merge_lo_p = true;
+  bool merge_hi_p = true;
+
+  if (d.nelt % 2)  // Odd element counts are rejected.
+    return false;
+
+  // Merge high.  For V4SI this checks for: { 0, 4, 1, 5 }
+  for (int telt = 0; telt < d.nelt; telt++)
+    if (d.perm[telt] != telt / 2 + (telt % 2) * d.nelt)
+      {
+	merge_hi_p = false;
+	break;
+      }
+
+  if (!merge_hi_p)
+    {
+      // Merge low.  For V4SI this checks for: { 2, 6, 3, 7 }
+      for (int telt = 0; telt < d.nelt; telt++)
+	if (d.perm[telt] != (telt + d.nelt) / 2 + (telt % 2) * d.nelt)
+	  {
+	    merge_lo_p = false;
+	    break;
+	  }
+    }
+  else
+    merge_lo_p = false;  // High already matched; low is not checked.
+
+  if (d.testing_p)  // Query only: report the result, emit nothing.
+    return merge_lo_p || merge_hi_p;
+
+  if (merge_lo_p || merge_hi_p)
+    s390_expand_merge (d.target, d.op0, d.op1, merge_hi_p);
+
+  return merge_lo_p || merge_hi_p;
+}
+
+/* Try to find the best sequence for the vector permute operation
+   described by D.  Return true if the operation could be
+   expanded.  Currently vector merge is the only strategy tried.  */
+static bool
+vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d)
+{
+  if (expand_perm_with_merge (d))
+    return true;
+
+  return false;
+}
+
+/* Return true if we can emit instructions for the constant
+   permutation vector in SEL.  If TARGET is NULL_RTX this only tests;
+   otherwise TARGET, OP0 and OP1 are used to emit the insns.  */
+
+bool
+s390_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, rtx op1,
+			       const vec_perm_indices &sel)
+{
+  struct expand_vec_perm_d d;
+  unsigned char perm[MAX_VECT_LEN];  /* NOTE(review): filled below but never read.  */
+  unsigned int i, nelt;
+
+  if (!s390_vector_mode_supported_p (vmode) || GET_MODE_SIZE (vmode) != 16)
+    return false;
+
+  d.target = target;
+  d.op0 = op0;
+  d.op1 = op1;
+
+  d.vmode = vmode;
+  gcc_assert (VECTOR_MODE_P (d.vmode));
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = target == NULL_RTX;  /* No target: caller only asks whether it works.  */
+
+  gcc_assert (target == NULL_RTX || REG_P (target));
+  gcc_assert (sel.length () == nelt);
+  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
+
+  for (i = 0; i < nelt; i++)
+    {
+      unsigned char e = sel[i];  /* NOTE(review): narrowed before the range assert below.  */
+      gcc_assert (e < 2 * nelt);
+      d.perm[i] = e;
+      perm[i] = e;
+    }
+
+  return vectorize_vec_perm_const_1 (d);
+}
+
 /* Initialize GCC target structure.  */
 
 #undef  TARGET_ASM_ALIGNED_HI_OP
@@ -17238,6 +17342,10 @@  s390_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
 #undef TARGET_MD_ASM_ADJUST
 #define TARGET_MD_ASM_ADJUST s390_md_asm_adjust
 
+#undef TARGET_VECTORIZE_VEC_PERM_CONST
+#define TARGET_VECTORIZE_VEC_PERM_CONST s390_vectorize_vec_perm_const
+
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-s390.h"
diff --git a/gcc/testsuite/gcc.target/s390/vector/perm-merge.c b/gcc/testsuite/gcc.target/s390/vector/perm-merge.c
new file mode 100644
index 00000000000..51b23ddd886
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/perm-merge.c
@@ -0,0 +1,104 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -mzarch -march=z14 -mzvector --save-temps" } */
+/* { dg-do run { target { s390_z14_hw } } } */
+
+/* { dg-final { scan-assembler-times "\tvmrhb\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tvmrlb\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tvmrhh\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tvmrlh\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tvmrhf\t" 3 } } */
+/* { dg-final { scan-assembler-times "\tvmrlf\t" 3 } } */
+/* { dg-final { scan-assembler-times "\tvmrhg\t" 3 } } */
+/* { dg-final { scan-assembler-times "\tvmrlg\t" 3 } } */
+
+#include "vec-types.h"
+/* GEN_MERGE_<N>: define merge_<HILO>_<VEC_TYPE> interleaving a[A..] with b[A..].  */
+#define GEN_MERGE_2(VEC_TYPE, HILO, A)			\
+  VEC_TYPE __attribute__((noinline))			\
+  merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) {	\
+    return (VEC_TYPE){ a[0+A], b[0+A] }; }
+
+#define GEN_MERGE_4(VEC_TYPE, HILO, A)				\
+  VEC_TYPE __attribute__((noinline))				\
+  merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) {		\
+    return (VEC_TYPE){ a[0+A], b[0+A], a[1+A], b[1+A] }; }
+
+#define GEN_MERGE_8(VEC_TYPE, HILO, A)					\
+  VEC_TYPE __attribute__((noinline))					\
+  merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) {			\
+    return (VEC_TYPE){ a[0+A], b[0+A], a[1+A], b[1+A], a[2+A], b[2+A], a[3+A], b[3+A] }; }
+
+#define GEN_MERGE_16(VEC_TYPE, HILO, A)					\
+  VEC_TYPE __attribute__((noinline))					\
+  merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) {			\
+    return (VEC_TYPE){ a[0+A], b[0+A], a[1+A], b[1+A], a[2+A], b[2+A], a[3+A], b[3+A], \
+      a[4+A], b[4+A], a[5+A], b[5+A], a[6+A], b[6+A], a[7+A], b[7+A]}; }
+
+/* "l" variants start at the middle element (A = N/2), "h" variants at element 0.  */
+GEN_MERGE_16(v16qi, l, 8)
+GEN_MERGE_16(v16qi, h, 0)
+GEN_MERGE_16(uv16qi, l, 8)
+GEN_MERGE_16(uv16qi, h, 0)
+
+GEN_MERGE_8(v8hi, l, 4)
+GEN_MERGE_8(v8hi, h, 0)
+GEN_MERGE_8(uv8hi, l, 4)
+GEN_MERGE_8(uv8hi, h, 0)
+
+GEN_MERGE_4(v4si, l, 2)
+GEN_MERGE_4(v4si, h, 0)
+GEN_MERGE_4(uv4si, l, 2)
+GEN_MERGE_4(uv4si, h, 0)
+
+GEN_MERGE_4(v4sf, l, 2)
+GEN_MERGE_4(v4sf, h, 0)
+
+GEN_MERGE_2(v2di, l, 1)
+GEN_MERGE_2(v2di, h, 0)
+GEN_MERGE_2(uv2di, l, 1)
+GEN_MERGE_2(uv2di, h, 0)
+
+GEN_MERGE_2(v2df, l, 1)
+GEN_MERGE_2(v2df, h, 0)
+
+/* Abort unless merge_l_<VEC_TYPE> yields v[i] == (i + elts) / 2 + (i % 2) * elts.  */
+#define CHECK_MERGE_LO(VEC_TYPE, SRC1, SRC2)		\
+  {							\
+    VEC_TYPE v = merge_l_##VEC_TYPE ((SRC1), (SRC2));	\
+    int elts = sizeof(v) / sizeof(v[0]);		\
+    for (int i = 0; i < elts; i++)			\
+      if (v[i] != (i + elts) / 2 + (i % 2) * elts)	\
+	__builtin_abort();				\
+  }
+/* Abort unless merge_h_<VEC_TYPE> yields v[i] == i / 2 + (i % 2) * elts.  */
+#define CHECK_MERGE_HI(VEC_TYPE, SRC1, SRC2)		\
+  {							\
+    VEC_TYPE v = merge_h_##VEC_TYPE ((SRC1), (SRC2));	\
+    int elts = sizeof(v) / sizeof(v[0]);		\
+    for (int i = 0; i < elts; i++)			\
+      if (v[i] != i / 2 + (i % 2) * elts)		\
+	__builtin_abort();				\
+  }
+/* a = { 0, 1, ... } and b continues the sequence, so values equal selector indices.  */
+#define CHECK_MERGE(VEC_TYPE)						\
+  {									\
+    VEC_TYPE a = GEN_SEQ_VEC (VEC_TYPE, 0);				\
+    VEC_TYPE b = GEN_SEQ_VEC (VEC_TYPE, sizeof(VEC_TYPE) / sizeof(a[0])); \
+    CHECK_MERGE_LO (VEC_TYPE, a, b);					\
+    CHECK_MERGE_HI (VEC_TYPE, a, b);					\
+  }
+/* Run the low and high merge checks for every vector type instantiated above.  */
+int
+main ()
+{
+  CHECK_MERGE(v16qi);
+  CHECK_MERGE(uv16qi);
+  CHECK_MERGE(v8hi);
+  CHECK_MERGE(uv8hi);
+  CHECK_MERGE(v4si);
+  CHECK_MERGE(uv4si);
+  CHECK_MERGE(v4sf);
+  CHECK_MERGE(v2di);
+  CHECK_MERGE(uv2di);
+  CHECK_MERGE(v2df);
+}
diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-types.h b/gcc/testsuite/gcc.target/s390/vector/vec-types.h
new file mode 100644
index 00000000000..b7ffbe73321
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vec-types.h
@@ -0,0 +1,35 @@ 
+#ifndef VEC_TYPES_H
+#define VEC_TYPES_H 1
+/* 16-byte GNU vector types: v<N><mode> signed or floating point, uv<N><mode> unsigned.  */
+typedef __attribute__((vector_size(16))) signed char v16qi;
+typedef __attribute__((vector_size(16))) unsigned char uv16qi;
+
+typedef __attribute__((vector_size(16))) signed short v8hi;
+typedef __attribute__((vector_size(16))) unsigned short uv8hi;
+
+typedef __attribute__((vector_size(16))) signed int v4si;
+typedef __attribute__((vector_size(16))) unsigned int uv4si;
+
+typedef __attribute__((vector_size(16))) signed long long v2di;
+typedef __attribute__((vector_size(16))) unsigned long long uv2di;
+
+#if __SIZEOF_INT128__ == 16
+typedef __attribute__((vector_size(16))) __int128_t v1ti;
+#endif
+
+typedef __attribute__((vector_size(16))) double v2df;
+typedef __attribute__((vector_size(16))) long double v1tf;
+
+#if __ARCH__ >= 12
+typedef __attribute__((vector_size(16))) float v4sf;
+#endif
+/* Yield a VEC_TYPE whose element i is i + ADDEND (GNU statement expression).  */
+#define GEN_SEQ_VEC(VEC_TYPE, ADDEND)					\
+  ({ VEC_TYPE dummy;							\
+    const int elts = sizeof(VEC_TYPE) / sizeof(dummy[0]);		\
+    typeof(dummy[0]) __attribute__((aligned(8))) ar[elts];		\
+    for (int i = 0; i < elts; i++)					\
+      ar[i] = (typeof(dummy[0]))(i + (ADDEND));				\
+    *(VEC_TYPE*)ar;})
+
+#endif