diff mbox series

rs6000: Make some BIFs vectorized on P10

Message ID 0f77a46a-4f13-65b8-cb8c-5fd9ed17537c@linux.ibm.com
State New
Headers show
Series rs6000: Make some BIFs vectorized on P10 | expand

Commit Message

Kewen.Lin Aug. 11, 2021, 6:56 a.m. UTC
Hi,

This patch is to add the support to make vectorizer able to
vectorize scalar version of some built-in functions with its
corresponding vector version with Power10 support.

Bootstrapped & regtested on powerpc64le-linux-gnu {P9,P10}
and powerpc64-linux-gnu P8.

Is it ok for trunk?

BR,
Kewen
-----
gcc/ChangeLog:

	* config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add
	support for some built-in functions vectorized on Power10.

gcc/testsuite/ChangeLog:

	* gcc.target/powerpc/dive-vectorize-1.c: New test.
	* gcc.target/powerpc/dive-vectorize-1.h: New test.
	* gcc.target/powerpc/dive-vectorize-2.c: New test.
	* gcc.target/powerpc/dive-vectorize-2.h: New test.
	* gcc.target/powerpc/dive-vectorize-run-1.c: New test.
	* gcc.target/powerpc/dive-vectorize-run-2.c: New test.
	* gcc.target/powerpc/p10-bifs-vectorize-1.c: New test.
	* gcc.target/powerpc/p10-bifs-vectorize-1.h: New test.
	* gcc.target/powerpc/p10-bifs-vectorize-run-1.c: New test.
---
 gcc/config/rs6000/rs6000.c                    | 55 +++++++++++++++++++
 .../gcc.target/powerpc/dive-vectorize-1.c     | 11 ++++
 .../gcc.target/powerpc/dive-vectorize-1.h     | 22 ++++++++
 .../gcc.target/powerpc/dive-vectorize-2.c     | 12 ++++
 .../gcc.target/powerpc/dive-vectorize-2.h     | 22 ++++++++
 .../gcc.target/powerpc/dive-vectorize-run-1.c | 52 ++++++++++++++++++
 .../gcc.target/powerpc/dive-vectorize-run-2.c | 53 ++++++++++++++++++
 .../gcc.target/powerpc/p10-bifs-vectorize-1.c | 15 +++++
 .../gcc.target/powerpc/p10-bifs-vectorize-1.h | 40 ++++++++++++++
 .../powerpc/p10-bifs-vectorize-run-1.c        | 45 +++++++++++++++
 10 files changed, 327 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
 create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
 create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
 create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c

Comments

Li, Pan2 via Gcc-patches Aug. 11, 2021, 4:34 p.m. UTC | #1
Hi Kewen,

FWIW, it's easier on reviewers if you include the patch inline instead 
of as an attachment.

On 8/11/21 1:56 AM, Kewen.Lin wrote:
> Hi,
>
> This patch is to add the support to make vectorizer able to
> vectorize scalar version of some built-in functions with its
> corresponding vector version with Power10 support.
>
> Bootstrapped & regtested on powerpc64le-linux-gnu {P9,P10}
> and powerpc64-linux-gnu P8.
>
> Is it ok for trunk?
>
> BR,
> Kewen
> -----
> gcc/ChangeLog:
>
> 	* config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add
> 	support for some built-in functions vectorized on Power10.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/powerpc/dive-vectorize-1.c: New test.
> 	* gcc.target/powerpc/dive-vectorize-1.h: New test.
> 	* gcc.target/powerpc/dive-vectorize-2.c: New test.
> 	* gcc.target/powerpc/dive-vectorize-2.h: New test.
> 	* gcc.target/powerpc/dive-vectorize-run-1.c: New test.
> 	* gcc.target/powerpc/dive-vectorize-run-2.c: New test.
> 	* gcc.target/powerpc/p10-bifs-vectorize-1.c: New test.
> 	* gcc.target/powerpc/p10-bifs-vectorize-1.h: New test.
> 	* gcc.target/powerpc/p10-bifs-vectorize-run-1.c: New test.

---
  gcc/config/rs6000/rs6000.c                    | 55 +++++++++++++++++++
  .../gcc.target/powerpc/dive-vectorize-1.c     | 11 ++++
  .../gcc.target/powerpc/dive-vectorize-1.h     | 22 ++++++++
  .../gcc.target/powerpc/dive-vectorize-2.c     | 12 ++++
  .../gcc.target/powerpc/dive-vectorize-2.h     | 22 ++++++++
  .../gcc.target/powerpc/dive-vectorize-run-1.c | 52 ++++++++++++++++++
  .../gcc.target/powerpc/dive-vectorize-run-2.c | 53 ++++++++++++++++++
  .../gcc.target/powerpc/p10-bifs-vectorize-1.c | 15 +++++
  .../gcc.target/powerpc/p10-bifs-vectorize-1.h | 40 ++++++++++++++
  .../powerpc/p10-bifs-vectorize-run-1.c        | 45 +++++++++++++++
  10 files changed, 327 insertions(+)
  create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
  create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
  create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
  create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
  create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
  create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
  create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
  create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
  create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 279f00cc648..3eac1d05101 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -5785,6 +5785,61 @@ rs6000_builtin_md_vectorized_function (tree fndecl, tree type_out,
      default:
        break;
      }
+
+  machine_mode in_vmode = TYPE_MODE (type_in);
+  machine_mode out_vmode = TYPE_MODE (type_out);
+
+  /* Power10 supported vectorized built-in functions.  */
+  if (TARGET_POWER10
+      && in_vmode == out_vmode
+      && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode))
+    {
+      machine_mode exp_mode = DImode;
+      machine_mode exp_vmode = V2DImode;
+      enum rs6000_builtins vname = RS6000_BUILTIN_COUNT;

Using this as a flag value looks unnecessary.  Is this just being done to silence a warning?

+      switch (fn)
+	{
+	case MISC_BUILTIN_DIVWE:
+	case MISC_BUILTIN_DIVWEU:
+	  exp_mode = SImode;
+	  exp_vmode = V4SImode;
+	  if (fn == MISC_BUILTIN_DIVWE)
+	    vname = P10V_BUILTIN_DIVES_V4SI;
+	  else
+	    vname = P10V_BUILTIN_DIVEU_V4SI;
+	  break;
+	case MISC_BUILTIN_DIVDE:
+	case MISC_BUILTIN_DIVDEU:
+	  if (fn == MISC_BUILTIN_DIVDE)
+	    vname = P10V_BUILTIN_DIVES_V2DI;
+	  else
+	    vname = P10V_BUILTIN_DIVEU_V2DI;
+	  break;
+	case P10_BUILTIN_CFUGED:
+	  vname = P10V_BUILTIN_VCFUGED;
+	  break;
+	case P10_BUILTIN_CNTLZDM:
+	  vname = P10V_BUILTIN_VCLZDM;
+	  break;
+	case P10_BUILTIN_CNTTZDM:
+	  vname = P10V_BUILTIN_VCTZDM;
+	  break;
+	case P10_BUILTIN_PDEPD:
+	  vname = P10V_BUILTIN_VPDEPD;
+	  break;
+	case P10_BUILTIN_PEXTD:
+	  vname = P10V_BUILTIN_VPEXTD;
+	  break;
+	default:
+	  return NULL_TREE;
+	}
+
+      if (vname != RS6000_BUILTIN_COUNT

Check is not necessary, as you will have returned by now in that case.

Otherwise this patch LGTM.  Thanks!  Still needs maintainer approval, of course.

Bill

+	  && in_mode == exp_mode
+	  && in_vmode == exp_vmode)
+	return rs6000_builtin_decls[vname];
+    }
+
    return NULL_TREE;
  }
  
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
new file mode 100644
index 00000000000..84f1b0a88f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if signed/unsigned int extended divisions get vectorized.  */
+
+#include "dive-vectorize-1.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { scan-assembler-times {\mvdivesw\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvdiveuw\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
new file mode 100644
index 00000000000..119f637b46b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
@@ -0,0 +1,22 @@
+#define N 128
+
+typedef signed int si;
+typedef unsigned int ui;
+
+si si_a[N], si_b[N], si_c[N];
+ui ui_a[N], ui_b[N], ui_c[N];
+
+__attribute__ ((noipa)) void
+test_divwe ()
+{
+  for (int i = 0; i < N; i++)
+    si_c[i] = __builtin_divwe (si_a[i], si_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_divweu ()
+{
+  for (int i = 0; i < N; i++)
+    ui_c[i] = __builtin_divweu (ui_a[i], ui_b[i]);
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
new file mode 100644
index 00000000000..4db0085562f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if signed/unsigned long long extended divisions get vectorized.  */
+
+#include "dive-vectorize-2.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { scan-assembler-times {\mvdivesd\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvdiveud\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
new file mode 100644
index 00000000000..1cab56b2e0b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
@@ -0,0 +1,22 @@
+#define N 128
+
+typedef signed long long sLL;
+typedef unsigned long long uLL;
+
+sLL sll_a[N], sll_b[N], sll_c[N];
+uLL ull_a[N], ull_b[N], ull_c[N];
+
+__attribute__ ((noipa)) void
+test_divde ()
+{
+  for (int i = 0; i < N; i++)
+    sll_c[i] = __builtin_divde (sll_a[i], sll_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_divdeu ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_divdeu (ull_a[i], ull_b[i]);
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
new file mode 100644
index 00000000000..1d5cbaa9f9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
@@ -0,0 +1,52 @@
+/* { dg-do run } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "dive-vectorize-1.h"
+
+/* Check if test cases with signed/unsigned int extended division
+   vectorization run successfully.  */
+
+__attribute__ ((optimize (1))) void
+check_divwe ()
+{
+  test_divwe ();
+  for (int i = 0; i < N; i++)
+    {
+      si exp = __builtin_divwe (si_a[i], si_b[i]);
+      if (exp != si_c[i])
+	__builtin_abort ();
+    }
+}
+
+__attribute__ ((optimize (1))) void
+check_divweu ()
+{
+  test_divweu ();
+  for (int i = 0; i < N; i++)
+    {
+      ui exp = __builtin_divweu (ui_a[i], ui_b[i]);
+      if (exp != ui_c[i])
+	__builtin_abort ();
+    }
+}
+
+int
+main ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      si_a[i] = 0x10 * (i * 3 + 2);
+      si_b[i] = 0x7890 * (i * 3 + 1);
+      ui_a[i] = 0x234 * (i * 11 + 3) - 0xcd * (i * 5 - 7);
+      ui_b[i] = 0x6078 * (i * 7 + 3) + 0xef * (i * 7 - 11);
+      if (si_b[i] == 0 || ui_b[i] == 0)
+	__builtin_abort ();
+    }
+
+  check_divwe ();
+  check_divweu ();
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
new file mode 100644
index 00000000000..921b07b3f1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
@@ -0,0 +1,53 @@
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "dive-vectorize-2.h"
+
+/* Check if test cases with signed/unsigned int extended division
+   vectorization run successfully.  */
+
+__attribute__ ((optimize (1))) void
+check_divde ()
+{
+  test_divde ();
+  for (int i = 0; i < N; i++)
+    {
+      sLL exp = __builtin_divde (sll_a[i], sll_b[i]);
+      if (exp != sll_c[i])
+	__builtin_abort ();
+    }
+}
+
+__attribute__ ((optimize (1))) void
+check_divdeu ()
+{
+  test_divdeu ();
+  for (int i = 0; i < N; i++)
+    {
+      uLL exp = __builtin_divdeu (ull_a[i], ull_b[i]);
+      if (exp != ull_c[i])
+	__builtin_abort ();
+    }
+}
+
+int
+main ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      sll_a[i] = 0x102 * (i * 3 + 2);
+      sll_b[i] = 0x789ab * (i * 3 + 1);
+      ull_a[i] = 0x2345 * (i * 11 + 3) - 0xcd1 * (i * 5 - 7);
+      ull_b[i] = 0x6078e * (i * 7 + 3) + 0xefa * (i * 7 - 11);
+      if (sll_b[i] == 0 || ull_b[i] == 0)
+	__builtin_abort ();
+    }
+
+  check_divde ();
+  check_divdeu ();
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
new file mode 100644
index 00000000000..9b8b3642ead
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if some Power10 built-in functions get vectorized.  */
+
+#include "p10-bifs-vectorize-1.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 5 "vect" } } */
+/* { dg-final { scan-assembler-times {\mvcfuged\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvclzdm\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvctzdm\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvpdepd\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvpextd\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
new file mode 100644
index 00000000000..80b7aacf810
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
@@ -0,0 +1,40 @@
+#define N 32
+
+typedef unsigned long long uLL;
+uLL ull_a[N], ull_b[N], ull_c[N];
+
+__attribute__ ((noipa)) void
+test_cfuged ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_cfuged (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_cntlzdm ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_cntlzdm (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_cnttzdm ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_cnttzdm (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_pdepd ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_pdepd (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_pextd ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_pextd (ull_a[i], ull_b[i]);
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
new file mode 100644
index 00000000000..4b3b1165663
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
@@ -0,0 +1,45 @@
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "p10-bifs-vectorize-1.h"
+
+/* Check if vectorized built-in functions run expectedly.  */
+
+#define CHECK(name)                                                           \
+  __attribute__ ((optimize (1))) void check_##name ()                         \
+  {                                                                           \
+    test_##name ();                                                           \
+    for (int i = 0; i < N; i++)                                               \
+      {                                                                       \
+	uLL exp = __builtin_##name (ull_a[i], ull_b[i]);                      \
+	if (exp != ull_c[i])                                                  \
+	  __builtin_abort ();                                                 \
+      }                                                                       \
+  }
+
+CHECK (cfuged)
+CHECK (cntlzdm)
+CHECK (cnttzdm)
+CHECK (pdepd)
+CHECK (pextd)
+
+int
+main ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      ull_a[i] = 0x789a * (i * 11 - 5) - 0xcd1 * (i * 5 - 7);
+      ull_b[i] = 0xfedc * (i * 7 + 3) + 0x467 * (i * 7 - 11);
+    }
+
+  check_cfuged ();
+  check_cntlzdm ();
+  check_cnttzdm ();
+  check_pdepd ();
+  check_pextd ();
+
+  return 0;
+}
+
Kewen.Lin Aug. 12, 2021, 2:10 a.m. UTC | #2
Hi Bill,

Thanks for your prompt review!

on 2021/8/12 上午12:34, Bill Schmidt wrote:
> Hi Kewen,
> 
> FWIW, it's easier on reviewers if you include the patch inline instead of as an attachment.
> 
> On 8/11/21 1:56 AM, Kewen.Lin wrote:
>> Hi,
>>
>> This patch is to add the support to make vectorizer able to
>> vectorize scalar version of some built-in functions with its
>> corresponding vector version with Power10 support.
>>
>> Bootstrapped & regtested on powerpc64le-linux-gnu {P9,P10}
>> and powerpc64-linux-gnu P8.
>>
>> Is it ok for trunk?
>>
>> BR,
>> Kewen
>> -----
>> gcc/ChangeLog:
>>
>> 	* config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add
>> 	support for some built-in functions vectorized on Power10.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 	* gcc.target/powerpc/dive-vectorize-1.c: New test.
>> 	* gcc.target/powerpc/dive-vectorize-1.h: New test.
>> 	* gcc.target/powerpc/dive-vectorize-2.c: New test.
>> 	* gcc.target/powerpc/dive-vectorize-2.h: New test.
>> 	* gcc.target/powerpc/dive-vectorize-run-1.c: New test.
>> 	* gcc.target/powerpc/dive-vectorize-run-2.c: New test.
>> 	* gcc.target/powerpc/p10-bifs-vectorize-1.c: New test.
>> 	* gcc.target/powerpc/p10-bifs-vectorize-1.h: New test.
>> 	* gcc.target/powerpc/p10-bifs-vectorize-run-1.c: New test.
> 
> ---
>  gcc/config/rs6000/rs6000.c                    | 55 +++++++++++++++++++
>  .../gcc.target/powerpc/dive-vectorize-1.c     | 11 ++++
>  .../gcc.target/powerpc/dive-vectorize-1.h     | 22 ++++++++
>  .../gcc.target/powerpc/dive-vectorize-2.c     | 12 ++++
>  .../gcc.target/powerpc/dive-vectorize-2.h     | 22 ++++++++
>  .../gcc.target/powerpc/dive-vectorize-run-1.c | 52 ++++++++++++++++++
>  .../gcc.target/powerpc/dive-vectorize-run-2.c | 53 ++++++++++++++++++
>  .../gcc.target/powerpc/p10-bifs-vectorize-1.c | 15 +++++
>  .../gcc.target/powerpc/p10-bifs-vectorize-1.h | 40 ++++++++++++++
>  .../powerpc/p10-bifs-vectorize-run-1.c        | 45 +++++++++++++++
>  10 files changed, 327 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
> 
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 279f00cc648..3eac1d05101 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -5785,6 +5785,61 @@ rs6000_builtin_md_vectorized_function (tree fndecl, tree type_out,
>      default:
>        break;
>      }
> +
> +  machine_mode in_vmode = TYPE_MODE (type_in);
> +  machine_mode out_vmode = TYPE_MODE (type_out);
> +
> +  /* Power10 supported vectorized built-in functions.  */
> +  if (TARGET_POWER10
> +      && in_vmode == out_vmode
> +      && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode))
> +    {
> +      machine_mode exp_mode = DImode;
> +      machine_mode exp_vmode = V2DImode;
> +      enum rs6000_builtins vname = RS6000_BUILTIN_COUNT;
> 
> Using this as a flag value looks unnecessary.  Is this just being done to silence a warning?
> 

Good question!  I didn't notice there is a warning or not, just get used to initializing variable
with one suitable value if possible.  If you don't mind, may I still keep it?  Since if some
future codes use vname in a path where it's not assigned, one explicitly wrong enum (bif) seems
better than a random one.  Or will this mentioned possibility definitely never happen since the
current uninitialized variables detection and warning scheme is robust and should not worry about
that completely?

> +      switch (fn)
> +	{
> +	case MISC_BUILTIN_DIVWE:
> +	case MISC_BUILTIN_DIVWEU:
> +	  exp_mode = SImode;
> +	  exp_vmode = V4SImode;
> +	  if (fn == MISC_BUILTIN_DIVWE)
> +	    vname = P10V_BUILTIN_DIVES_V4SI;
> +	  else
> +	    vname = P10V_BUILTIN_DIVEU_V4SI;
> +	  break;
> +	case MISC_BUILTIN_DIVDE:
> +	case MISC_BUILTIN_DIVDEU:
> +	  if (fn == MISC_BUILTIN_DIVDE)
> +	    vname = P10V_BUILTIN_DIVES_V2DI;
> +	  else
> +	    vname = P10V_BUILTIN_DIVEU_V2DI;
> +	  break;
> +	case P10_BUILTIN_CFUGED:
> +	  vname = P10V_BUILTIN_VCFUGED;
> +	  break;
> +	case P10_BUILTIN_CNTLZDM:
> +	  vname = P10V_BUILTIN_VCLZDM;
> +	  break;
> +	case P10_BUILTIN_CNTTZDM:
> +	  vname = P10V_BUILTIN_VCTZDM;
> +	  break;
> +	case P10_BUILTIN_PDEPD:
> +	  vname = P10V_BUILTIN_VPDEPD;
> +	  break;
> +	case P10_BUILTIN_PEXTD:
> +	  vname = P10V_BUILTIN_VPEXTD;
> +	  break;
> +	default:
> +	  return NULL_TREE;
> +	}
> +
> +      if (vname != RS6000_BUILTIN_COUNT
> 
> Check is not necessary, as you will have returned by now in that case.
> 

Thanks for catching, I put break for "default" initially, didn't noticed the following condition
need an adjustment after updating it to early return.  Will fix it.

BR,
Kewen

> Otherwise this patch LGTM.  Thanks!  Still needs maintainer approval, of course.
> 
> Bill
> 
> +	  && in_mode == exp_mode
> +	  && in_vmode == exp_vmode)
> +	return rs6000_builtin_decls[vname];
> +    }
> +
>    return NULL_TREE;
>  }
>  
> diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
> new file mode 100644
> index 00000000000..84f1b0a88f2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target power10_ok } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
> +
> +/* Test if signed/unsigned int extended divisions get vectorized.  */
> +
> +#include "dive-vectorize-1.h"
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
> +/* { dg-final { scan-assembler-times {\mvdivesw\M} 1 } } */
> +/* { dg-final { scan-assembler-times {\mvdiveuw\M} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
> new file mode 100644
> index 00000000000..119f637b46b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
> @@ -0,0 +1,22 @@
> +#define N 128
> +
> +typedef signed int si;
> +typedef unsigned int ui;
> +
> +si si_a[N], si_b[N], si_c[N];
> +ui ui_a[N], ui_b[N], ui_c[N];
> +
> +__attribute__ ((noipa)) void
> +test_divwe ()
> +{
> +  for (int i = 0; i < N; i++)
> +    si_c[i] = __builtin_divwe (si_a[i], si_b[i]);
> +}
> +
> +__attribute__ ((noipa)) void
> +test_divweu ()
> +{
> +  for (int i = 0; i < N; i++)
> +    ui_c[i] = __builtin_divweu (ui_a[i], ui_b[i]);
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
> new file mode 100644
> index 00000000000..4db0085562f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target lp64 } */
> +/* { dg-require-effective-target power10_ok } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
> +
> +/* Test if signed/unsigned long long extended divisions get vectorized.  */
> +
> +#include "dive-vectorize-2.h"
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
> +/* { dg-final { scan-assembler-times {\mvdivesd\M} 1 } } */
> +/* { dg-final { scan-assembler-times {\mvdiveud\M} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
> new file mode 100644
> index 00000000000..1cab56b2e0b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
> @@ -0,0 +1,22 @@
> +#define N 128
> +
> +typedef signed long long sLL;
> +typedef unsigned long long uLL;
> +
> +sLL sll_a[N], sll_b[N], sll_c[N];
> +uLL ull_a[N], ull_b[N], ull_c[N];
> +
> +__attribute__ ((noipa)) void
> +test_divde ()
> +{
> +  for (int i = 0; i < N; i++)
> +    sll_c[i] = __builtin_divde (sll_a[i], sll_b[i]);
> +}
> +
> +__attribute__ ((noipa)) void
> +test_divdeu ()
> +{
> +  for (int i = 0; i < N; i++)
> +    ull_c[i] = __builtin_divdeu (ull_a[i], ull_b[i]);
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
> new file mode 100644
> index 00000000000..1d5cbaa9f9b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
> @@ -0,0 +1,52 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target power10_hw } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
> +
> +#include "dive-vectorize-1.h"
> +
> +/* Check if test cases with signed/unsigned int extended division
> +   vectorization run successfully.  */
> +
> +__attribute__ ((optimize (1))) void
> +check_divwe ()
> +{
> +  test_divwe ();
> +  for (int i = 0; i < N; i++)
> +    {
> +      si exp = __builtin_divwe (si_a[i], si_b[i]);
> +      if (exp != si_c[i])
> +	__builtin_abort ();
> +    }
> +}
> +
> +__attribute__ ((optimize (1))) void
> +check_divweu ()
> +{
> +  test_divweu ();
> +  for (int i = 0; i < N; i++)
> +    {
> +      ui exp = __builtin_divweu (ui_a[i], ui_b[i]);
> +      if (exp != ui_c[i])
> +	__builtin_abort ();
> +    }
> +}
> +
> +int
> +main ()
> +{
> +  for (int i = 0; i < N; i++)
> +    {
> +      si_a[i] = 0x10 * (i * 3 + 2);
> +      si_b[i] = 0x7890 * (i * 3 + 1);
> +      ui_a[i] = 0x234 * (i * 11 + 3) - 0xcd * (i * 5 - 7);
> +      ui_b[i] = 0x6078 * (i * 7 + 3) + 0xef * (i * 7 - 11);
> +      if (si_b[i] == 0 || ui_b[i] == 0)
> +	__builtin_abort ();
> +    }
> +
> +  check_divwe ();
> +  check_divweu ();
> +
> +  return 0;
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
> new file mode 100644
> index 00000000000..921b07b3f1b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
> @@ -0,0 +1,53 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target lp64 } */
> +/* { dg-require-effective-target power10_hw } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
> +
> +#include "dive-vectorize-2.h"
> +
> +/* Check if test cases with signed/unsigned int extended division
> +   vectorization run successfully.  */
> +
> +__attribute__ ((optimize (1))) void
> +check_divde ()
> +{
> +  test_divde ();
> +  for (int i = 0; i < N; i++)
> +    {
> +      sLL exp = __builtin_divde (sll_a[i], sll_b[i]);
> +      if (exp != sll_c[i])
> +	__builtin_abort ();
> +    }
> +}
> +
> +__attribute__ ((optimize (1))) void
> +check_divdeu ()
> +{
> +  test_divdeu ();
> +  for (int i = 0; i < N; i++)
> +    {
> +      uLL exp = __builtin_divdeu (ull_a[i], ull_b[i]);
> +      if (exp != ull_c[i])
> +	__builtin_abort ();
> +    }
> +}
> +
> +int
> +main ()
> +{
> +  for (int i = 0; i < N; i++)
> +    {
> +      sll_a[i] = 0x102 * (i * 3 + 2);
> +      sll_b[i] = 0x789ab * (i * 3 + 1);
> +      ull_a[i] = 0x2345 * (i * 11 + 3) - 0xcd1 * (i * 5 - 7);
> +      ull_b[i] = 0x6078e * (i * 7 + 3) + 0xefa * (i * 7 - 11);
> +      if (sll_b[i] == 0 || ull_b[i] == 0)
> +	__builtin_abort ();
> +    }
> +
> +  check_divde ();
> +  check_divdeu ();
> +
> +  return 0;
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
> new file mode 100644
> index 00000000000..9b8b3642ead
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target lp64 } */
> +/* { dg-require-effective-target power10_ok } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
> +
> +/* Test if some Power10 built-in functions get vectorized.  */
> +
> +#include "p10-bifs-vectorize-1.h"
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 5 "vect" } } */
> +/* { dg-final { scan-assembler-times {\mvcfuged\M} 1 } } */
> +/* { dg-final { scan-assembler-times {\mvclzdm\M} 1 } } */
> +/* { dg-final { scan-assembler-times {\mvctzdm\M} 1 } } */
> +/* { dg-final { scan-assembler-times {\mvpdepd\M} 1 } } */
> +/* { dg-final { scan-assembler-times {\mvpextd\M} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
> new file mode 100644
> index 00000000000..80b7aacf810
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
> @@ -0,0 +1,40 @@
> +#define N 32
> +
> +typedef unsigned long long uLL;
> +uLL ull_a[N], ull_b[N], ull_c[N];
> +
> +__attribute__ ((noipa)) void
> +test_cfuged ()
> +{
> +  for (int i = 0; i < N; i++)
> +    ull_c[i] = __builtin_cfuged (ull_a[i], ull_b[i]);
> +}
> +
> +__attribute__ ((noipa)) void
> +test_cntlzdm ()
> +{
> +  for (int i = 0; i < N; i++)
> +    ull_c[i] = __builtin_cntlzdm (ull_a[i], ull_b[i]);
> +}
> +
> +__attribute__ ((noipa)) void
> +test_cnttzdm ()
> +{
> +  for (int i = 0; i < N; i++)
> +    ull_c[i] = __builtin_cnttzdm (ull_a[i], ull_b[i]);
> +}
> +
> +__attribute__ ((noipa)) void
> +test_pdepd ()
> +{
> +  for (int i = 0; i < N; i++)
> +    ull_c[i] = __builtin_pdepd (ull_a[i], ull_b[i]);
> +}
> +
> +__attribute__ ((noipa)) void
> +test_pextd ()
> +{
> +  for (int i = 0; i < N; i++)
> +    ull_c[i] = __builtin_pextd (ull_a[i], ull_b[i]);
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
> new file mode 100644
> index 00000000000..4b3b1165663
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target lp64 } */
> +/* { dg-require-effective-target power10_hw } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
> +
> +#include "p10-bifs-vectorize-1.h"
> +
> +/* Check if vectorized built-in functions run expectedly.  */
> +
> +#define CHECK(name)                                                           \
> +  __attribute__ ((optimize (1))) void check_##name ()                         \
> +  {                                                                           \
> +    test_##name ();                                                           \
> +    for (int i = 0; i < N; i++)                                               \
> +      {                                                                       \
> +	uLL exp = __builtin_##name (ull_a[i], ull_b[i]);                      \
> +	if (exp != ull_c[i])                                                  \
> +	  __builtin_abort ();                                                 \
> +      }                                                                       \
> +  }
> +
> +CHECK (cfuged)
> +CHECK (cntlzdm)
> +CHECK (cnttzdm)
> +CHECK (pdepd)
> +CHECK (pextd)
> +
> +int
> +main ()
> +{
> +  for (int i = 0; i < N; i++)
> +    {
> +      ull_a[i] = 0x789a * (i * 11 - 5) - 0xcd1 * (i * 5 - 7);
> +      ull_b[i] = 0xfedc * (i * 7 + 3) + 0x467 * (i * 7 - 11);
> +    }
> +
> +  check_cfuged ();
> +  check_cntlzdm ();
> +  check_cnttzdm ();
> +  check_pdepd ();
> +  check_pextd ();
> +
> +  return 0;
> +}
> +
> -- 
> 2.17.1
> 
> 


BR,
Kewen
Li, Pan2 via Gcc-patches Aug. 12, 2021, 1:03 p.m. UTC | #3
On 8/11/21 9:10 PM, Kewen.Lin wrote:
> Hi Bill,
>
> Thanks for your prompt review!
>
> on 2021/8/12 上午12:34, Bill Schmidt wrote:
>> Hi Kewen,
>>
>> FWIW, it's easier on reviewers if you include the patch inline instead of as an attachment.
>>
>> On 8/11/21 1:56 AM, Kewen.Lin wrote:
>>> Hi,
>>>
>>> This patch is to add the support to make vectorizer able to
>>> vectorize scalar version of some built-in functions with its
>>> corresponding vector version with Power10 support.
>>>
>>> Bootstrapped & regtested on powerpc64le-linux-gnu {P9,P10}
>>> and powerpc64-linux-gnu P8.
>>>
>>> Is it ok for trunk?
>>>
>>> BR,
>>> Kewen
>>> -----
>>> gcc/ChangeLog:
>>>
>>> 	* config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add
>>> 	support for some built-in functions vectorized on Power10.
>>>
>>> gcc/testsuite/ChangeLog:
>>>
>>> 	* gcc.target/powerpc/dive-vectorize-1.c: New test.
>>> 	* gcc.target/powerpc/dive-vectorize-1.h: New test.
>>> 	* gcc.target/powerpc/dive-vectorize-2.c: New test.
>>> 	* gcc.target/powerpc/dive-vectorize-2.h: New test.
>>> 	* gcc.target/powerpc/dive-vectorize-run-1.c: New test.
>>> 	* gcc.target/powerpc/dive-vectorize-run-2.c: New test.
>>> 	* gcc.target/powerpc/p10-bifs-vectorize-1.c: New test.
>>> 	* gcc.target/powerpc/p10-bifs-vectorize-1.h: New test.
>>> 	* gcc.target/powerpc/p10-bifs-vectorize-run-1.c: New test.
>> ---
>>   gcc/config/rs6000/rs6000.c                    | 55 +++++++++++++++++++
>>   .../gcc.target/powerpc/dive-vectorize-1.c     | 11 ++++
>>   .../gcc.target/powerpc/dive-vectorize-1.h     | 22 ++++++++
>>   .../gcc.target/powerpc/dive-vectorize-2.c     | 12 ++++
>>   .../gcc.target/powerpc/dive-vectorize-2.h     | 22 ++++++++
>>   .../gcc.target/powerpc/dive-vectorize-run-1.c | 52 ++++++++++++++++++
>>   .../gcc.target/powerpc/dive-vectorize-run-2.c | 53 ++++++++++++++++++
>>   .../gcc.target/powerpc/p10-bifs-vectorize-1.c | 15 +++++
>>   .../gcc.target/powerpc/p10-bifs-vectorize-1.h | 40 ++++++++++++++
>>   .../powerpc/p10-bifs-vectorize-run-1.c        | 45 +++++++++++++++
>>   10 files changed, 327 insertions(+)
>>   create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
>>   create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
>>   create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
>>   create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
>>   create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
>>   create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
>>   create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
>>   create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
>>   create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
>>
>> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
>> index 279f00cc648..3eac1d05101 100644
>> --- a/gcc/config/rs6000/rs6000.c
>> +++ b/gcc/config/rs6000/rs6000.c
>> @@ -5785,6 +5785,61 @@ rs6000_builtin_md_vectorized_function (tree fndecl, tree type_out,
>>       default:
>>         break;
>>       }
>> +
>> +  machine_mode in_vmode = TYPE_MODE (type_in);
>> +  machine_mode out_vmode = TYPE_MODE (type_out);
>> +
>> +  /* Power10 supported vectorized built-in functions.  */
>> +  if (TARGET_POWER10
>> +      && in_vmode == out_vmode
>> +      && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode))
>> +    {
>> +      machine_mode exp_mode = DImode;
>> +      machine_mode exp_vmode = V2DImode;
>> +      enum rs6000_builtins vname = RS6000_BUILTIN_COUNT;
>>
>> Using this as a flag value looks unnecessary.  Is this just being done to silence a warning?
>>
> Good question!  I didn't notice there is a warning or not, just get used to initializing variable
> with one suitable value if possible.  If you don't mind, may I still keep it?  Since if some
> future codes use vname in a path where it's not assigned, one explicitly wrong enum (bif) seems
> better than a random one.  Or will this mentioned possibility definitely never happen since the
> current uninitialized variables detection and warning scheme is robust and should not worry about
> that completely?


I don't feel strongly about this either way; up to you and the 
maintainers. :)

Bill

>
>> +      switch (fn)
>> +	{
>> +	case MISC_BUILTIN_DIVWE:
>> +	case MISC_BUILTIN_DIVWEU:
>> +	  exp_mode = SImode;
>> +	  exp_vmode = V4SImode;
>> +	  if (fn == MISC_BUILTIN_DIVWE)
>> +	    vname = P10V_BUILTIN_DIVES_V4SI;
>> +	  else
>> +	    vname = P10V_BUILTIN_DIVEU_V4SI;
>> +	  break;
>> +	case MISC_BUILTIN_DIVDE:
>> +	case MISC_BUILTIN_DIVDEU:
>> +	  if (fn == MISC_BUILTIN_DIVDE)
>> +	    vname = P10V_BUILTIN_DIVES_V2DI;
>> +	  else
>> +	    vname = P10V_BUILTIN_DIVEU_V2DI;
>> +	  break;
>> +	case P10_BUILTIN_CFUGED:
>> +	  vname = P10V_BUILTIN_VCFUGED;
>> +	  break;
>> +	case P10_BUILTIN_CNTLZDM:
>> +	  vname = P10V_BUILTIN_VCLZDM;
>> +	  break;
>> +	case P10_BUILTIN_CNTTZDM:
>> +	  vname = P10V_BUILTIN_VCTZDM;
>> +	  break;
>> +	case P10_BUILTIN_PDEPD:
>> +	  vname = P10V_BUILTIN_VPDEPD;
>> +	  break;
>> +	case P10_BUILTIN_PEXTD:
>> +	  vname = P10V_BUILTIN_VPEXTD;
>> +	  break;
>> +	default:
>> +	  return NULL_TREE;
>> +	}
>> +
>> +      if (vname != RS6000_BUILTIN_COUNT
>>
>> Check is not necessary, as you will have returned by now in that case.
>>
> Thanks for catching, I put break for "default" initially, didn't noticed the following condition
> need an adjustment after updating it to early return.  Will fix it.
>
> BR,
> Kewen
>
>> Otherwise this patch LGTM.  Thanks!  Still needs maintainer approval, of course.
>>
>> Bill
>>
>> +	  && in_mode == exp_mode
>> +	  && in_vmode == exp_vmode)
>> +	return rs6000_builtin_decls[vname];
>> +    }
>> +
>>     return NULL_TREE;
>>   }
>>   
>> diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
>> new file mode 100644
>> index 00000000000..84f1b0a88f2
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
>> @@ -0,0 +1,11 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target power10_ok } */
>> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
>> +
>> +/* Test if signed/unsigned int extended divisions get vectorized.  */
>> +
>> +#include "dive-vectorize-1.h"
>> +
>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
>> +/* { dg-final { scan-assembler-times {\mvdivesw\M} 1 } } */
>> +/* { dg-final { scan-assembler-times {\mvdiveuw\M} 1 } } */
>> diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
>> new file mode 100644
>> index 00000000000..119f637b46b
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
>> @@ -0,0 +1,22 @@
>> +#define N 128
>> +
>> +typedef signed int si;
>> +typedef unsigned int ui;
>> +
>> +si si_a[N], si_b[N], si_c[N];
>> +ui ui_a[N], ui_b[N], ui_c[N];
>> +
>> +__attribute__ ((noipa)) void
>> +test_divwe ()
>> +{
>> +  for (int i = 0; i < N; i++)
>> +    si_c[i] = __builtin_divwe (si_a[i], si_b[i]);
>> +}
>> +
>> +__attribute__ ((noipa)) void
>> +test_divweu ()
>> +{
>> +  for (int i = 0; i < N; i++)
>> +    ui_c[i] = __builtin_divweu (ui_a[i], ui_b[i]);
>> +}
>> +
>> diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
>> new file mode 100644
>> index 00000000000..4db0085562f
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
>> @@ -0,0 +1,12 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target lp64 } */
>> +/* { dg-require-effective-target power10_ok } */
>> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
>> +
>> +/* Test if signed/unsigned long long extended divisions get vectorized.  */
>> +
>> +#include "dive-vectorize-2.h"
>> +
>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
>> +/* { dg-final { scan-assembler-times {\mvdivesd\M} 1 } } */
>> +/* { dg-final { scan-assembler-times {\mvdiveud\M} 1 } } */
>> diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
>> new file mode 100644
>> index 00000000000..1cab56b2e0b
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
>> @@ -0,0 +1,22 @@
>> +#define N 128
>> +
>> +typedef signed long long sLL;
>> +typedef unsigned long long uLL;
>> +
>> +sLL sll_a[N], sll_b[N], sll_c[N];
>> +uLL ull_a[N], ull_b[N], ull_c[N];
>> +
>> +__attribute__ ((noipa)) void
>> +test_divde ()
>> +{
>> +  for (int i = 0; i < N; i++)
>> +    sll_c[i] = __builtin_divde (sll_a[i], sll_b[i]);
>> +}
>> +
>> +__attribute__ ((noipa)) void
>> +test_divdeu ()
>> +{
>> +  for (int i = 0; i < N; i++)
>> +    ull_c[i] = __builtin_divdeu (ull_a[i], ull_b[i]);
>> +}
>> +
>> diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
>> new file mode 100644
>> index 00000000000..1d5cbaa9f9b
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
>> @@ -0,0 +1,52 @@
>> +/* { dg-do run } */
>> +/* { dg-require-effective-target power10_hw } */
>> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
>> +
>> +#include "dive-vectorize-1.h"
>> +
>> +/* Check if test cases with signed/unsigned int extended division
>> +   vectorization run successfully.  */
>> +
>> +__attribute__ ((optimize (1))) void
>> +check_divwe ()
>> +{
>> +  test_divwe ();
>> +  for (int i = 0; i < N; i++)
>> +    {
>> +      si exp = __builtin_divwe (si_a[i], si_b[i]);
>> +      if (exp != si_c[i])
>> +	__builtin_abort ();
>> +    }
>> +}
>> +
>> +__attribute__ ((optimize (1))) void
>> +check_divweu ()
>> +{
>> +  test_divweu ();
>> +  for (int i = 0; i < N; i++)
>> +    {
>> +      ui exp = __builtin_divweu (ui_a[i], ui_b[i]);
>> +      if (exp != ui_c[i])
>> +	__builtin_abort ();
>> +    }
>> +}
>> +
>> +int
>> +main ()
>> +{
>> +  for (int i = 0; i < N; i++)
>> +    {
>> +      si_a[i] = 0x10 * (i * 3 + 2);
>> +      si_b[i] = 0x7890 * (i * 3 + 1);
>> +      ui_a[i] = 0x234 * (i * 11 + 3) - 0xcd * (i * 5 - 7);
>> +      ui_b[i] = 0x6078 * (i * 7 + 3) + 0xef * (i * 7 - 11);
>> +      if (si_b[i] == 0 || ui_b[i] == 0)
>> +	__builtin_abort ();
>> +    }
>> +
>> +  check_divwe ();
>> +  check_divweu ();
>> +
>> +  return 0;
>> +}
>> +
>> diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
>> new file mode 100644
>> index 00000000000..921b07b3f1b
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
>> @@ -0,0 +1,53 @@
>> +/* { dg-do run } */
>> +/* { dg-require-effective-target lp64 } */
>> +/* { dg-require-effective-target power10_hw } */
>> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
>> +
>> +#include "dive-vectorize-2.h"
>> +
>> +/* Check if test cases with signed/unsigned int extended division
>> +   vectorization run successfully.  */
>> +
>> +__attribute__ ((optimize (1))) void
>> +check_divde ()
>> +{
>> +  test_divde ();
>> +  for (int i = 0; i < N; i++)
>> +    {
>> +      sLL exp = __builtin_divde (sll_a[i], sll_b[i]);
>> +      if (exp != sll_c[i])
>> +	__builtin_abort ();
>> +    }
>> +}
>> +
>> +__attribute__ ((optimize (1))) void
>> +check_divdeu ()
>> +{
>> +  test_divdeu ();
>> +  for (int i = 0; i < N; i++)
>> +    {
>> +      uLL exp = __builtin_divdeu (ull_a[i], ull_b[i]);
>> +      if (exp != ull_c[i])
>> +	__builtin_abort ();
>> +    }
>> +}
>> +
>> +int
>> +main ()
>> +{
>> +  for (int i = 0; i < N; i++)
>> +    {
>> +      sll_a[i] = 0x102 * (i * 3 + 2);
>> +      sll_b[i] = 0x789ab * (i * 3 + 1);
>> +      ull_a[i] = 0x2345 * (i * 11 + 3) - 0xcd1 * (i * 5 - 7);
>> +      ull_b[i] = 0x6078e * (i * 7 + 3) + 0xefa * (i * 7 - 11);
>> +      if (sll_b[i] == 0 || ull_b[i] == 0)
>> +	__builtin_abort ();
>> +    }
>> +
>> +  check_divde ();
>> +  check_divdeu ();
>> +
>> +  return 0;
>> +}
>> +
>> diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
>> new file mode 100644
>> index 00000000000..9b8b3642ead
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
>> @@ -0,0 +1,15 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target lp64 } */
>> +/* { dg-require-effective-target power10_ok } */
>> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
>> +
>> +/* Test if some Power10 built-in functions get vectorized.  */
>> +
>> +#include "p10-bifs-vectorize-1.h"
>> +
>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 5 "vect" } } */
>> +/* { dg-final { scan-assembler-times {\mvcfuged\M} 1 } } */
>> +/* { dg-final { scan-assembler-times {\mvclzdm\M} 1 } } */
>> +/* { dg-final { scan-assembler-times {\mvctzdm\M} 1 } } */
>> +/* { dg-final { scan-assembler-times {\mvpdepd\M} 1 } } */
>> +/* { dg-final { scan-assembler-times {\mvpextd\M} 1 } } */
>> diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
>> new file mode 100644
>> index 00000000000..80b7aacf810
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
>> @@ -0,0 +1,40 @@
>> +#define N 32
>> +
>> +typedef unsigned long long uLL;
>> +uLL ull_a[N], ull_b[N], ull_c[N];
>> +
>> +__attribute__ ((noipa)) void
>> +test_cfuged ()
>> +{
>> +  for (int i = 0; i < N; i++)
>> +    ull_c[i] = __builtin_cfuged (ull_a[i], ull_b[i]);
>> +}
>> +
>> +__attribute__ ((noipa)) void
>> +test_cntlzdm ()
>> +{
>> +  for (int i = 0; i < N; i++)
>> +    ull_c[i] = __builtin_cntlzdm (ull_a[i], ull_b[i]);
>> +}
>> +
>> +__attribute__ ((noipa)) void
>> +test_cnttzdm ()
>> +{
>> +  for (int i = 0; i < N; i++)
>> +    ull_c[i] = __builtin_cnttzdm (ull_a[i], ull_b[i]);
>> +}
>> +
>> +__attribute__ ((noipa)) void
>> +test_pdepd ()
>> +{
>> +  for (int i = 0; i < N; i++)
>> +    ull_c[i] = __builtin_pdepd (ull_a[i], ull_b[i]);
>> +}
>> +
>> +__attribute__ ((noipa)) void
>> +test_pextd ()
>> +{
>> +  for (int i = 0; i < N; i++)
>> +    ull_c[i] = __builtin_pextd (ull_a[i], ull_b[i]);
>> +}
>> +
>> diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
>> new file mode 100644
>> index 00000000000..4b3b1165663
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
>> @@ -0,0 +1,45 @@
>> +/* { dg-do run } */
>> +/* { dg-require-effective-target lp64 } */
>> +/* { dg-require-effective-target power10_hw } */
>> +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
>> +
>> +#include "p10-bifs-vectorize-1.h"
>> +
>> +/* Check if vectorized built-in functions run expectedly.  */
>> +
>> +#define CHECK(name)                                                           \
>> +  __attribute__ ((optimize (1))) void check_##name ()                         \
>> +  {                                                                           \
>> +    test_##name ();                                                           \
>> +    for (int i = 0; i < N; i++)                                               \
>> +      {                                                                       \
>> +	uLL exp = __builtin_##name (ull_a[i], ull_b[i]);                      \
>> +	if (exp != ull_c[i])                                                  \
>> +	  __builtin_abort ();                                                 \
>> +      }                                                                       \
>> +  }
>> +
>> +CHECK (cfuged)
>> +CHECK (cntlzdm)
>> +CHECK (cnttzdm)
>> +CHECK (pdepd)
>> +CHECK (pextd)
>> +
>> +int
>> +main ()
>> +{
>> +  for (int i = 0; i < N; i++)
>> +    {
>> +      ull_a[i] = 0x789a * (i * 11 - 5) - 0xcd1 * (i * 5 - 7);
>> +      ull_b[i] = 0xfedc * (i * 7 + 3) + 0x467 * (i * 7 - 11);
>> +    }
>> +
>> +  check_cfuged ();
>> +  check_cntlzdm ();
>> +  check_cnttzdm ();
>> +  check_pdepd ();
>> +  check_pextd ();
>> +
>> +  return 0;
>> +}
>> +
>> -- 
>> 2.17.1
>>
>>
>
> BR,
> Kewen
Segher Boessenkool Aug. 12, 2021, 3:10 p.m. UTC | #4
Hi!

On Wed, Aug 11, 2021 at 02:56:11PM +0800, Kewen.Lin wrote:
> 	* config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add
> 	support for some built-in functions vectorized on Power10.

Say which, not "some" please?

> +  machine_mode in_vmode = TYPE_MODE (type_in);
> +  machine_mode out_vmode = TYPE_MODE (type_out);
> +
> +  /* Power10 supported vectorized built-in functions.  */
> +  if (TARGET_POWER10
> +      && in_vmode == out_vmode
> +      && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode))
> +    {
> +      machine_mode exp_mode = DImode;
> +      machine_mode exp_vmode = V2DImode;
> +      enum rs6000_builtins vname = RS6000_BUILTIN_COUNT;

"name"?  This should be "bif" or similar?

> +      switch (fn)
> +	{
> +	case MISC_BUILTIN_DIVWE:
> +	case MISC_BUILTIN_DIVWEU:
> +	  exp_mode = SImode;
> +	  exp_vmode = V4SImode;
> +	  if (fn == MISC_BUILTIN_DIVWE)
> +	    vname = P10V_BUILTIN_DIVES_V4SI;
> +	  else
> +	    vname = P10V_BUILTIN_DIVEU_V4SI;
> +	  break;
> +	case MISC_BUILTIN_DIVDE:
> +	case MISC_BUILTIN_DIVDEU:
> +	  if (fn == MISC_BUILTIN_DIVDE)
> +	    vname = P10V_BUILTIN_DIVES_V2DI;
> +	  else
> +	    vname = P10V_BUILTIN_DIVEU_V2DI;
> +	  break;

All of the above should not be builtin functions really, they are all
simple arithmetic :-(  They should not be UNSPECs either, on RTL level.
They can and should be optimised in real code as well.  Oh well.

> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target lp64 } */

Please add a comment what this is needed for?  "We scan for dive*d" is
enough, but without anything, it takes time to figure this out.

> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
> @@ -0,0 +1,53 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target lp64 } */

Same here.  I suppose this uses builtins that do not exist on 32-bit?

> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target lp64 } */

And another.

> +#define CHECK(name)                                                           \
> +  __attribute__ ((optimize (1))) void check_##name ()                         \

What is the attribute for, btw?  It seems fragile, but perhaps I do not
understand the intention.


Okay for trunk with whose lp64 things improved.  Thanks!


Segher
Segher Boessenkool Aug. 12, 2021, 3:51 p.m. UTC | #5
On Thu, Aug 12, 2021 at 10:10:10AM +0800, Kewen.Lin wrote:
> > +      enum rs6000_builtins vname = RS6000_BUILTIN_COUNT;
> > 
> > Using this as a flag value looks unnecessary.  Is this just being done to silence a warning?
> 
> Good question!  I didn't notice there is a warning or not, just get used to initializing variable
> with one suitable value if possible.  If you don't mind, may I still keep it?  Since if some
> future codes use vname in a path where it's not assigned, one explicitly wrong enum (bif) seems
> better than a random one.  Or will this mentioned possibility definitely never happen since the
> current uninitialized variables detection and warning scheme is robust and should not worry about
> that completely?

It is a bad idea to initialise things unnecessary: it hinders many
optimisations, but much more importantly, it silences warnings without
fixing the problem.

> > +      if (vname != RS6000_BUILTIN_COUNT
> > 
> > Check is not necessary, as you will have returned by now in that case.
> 
> Thanks for catching, I put break for "default" initially, didn't noticed the following condition
> need an adjustment after updating it to early return.  Will fix it.

Thanks :-)


Segher
Kewen.Lin Aug. 13, 2021, 2:34 a.m. UTC | #6
Hi Segher,

Thanks for the review!

on 2021/8/12 下午11:10, Segher Boessenkool wrote:
> Hi!
> 
> On Wed, Aug 11, 2021 at 02:56:11PM +0800, Kewen.Lin wrote:
>> 	* config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add
>> 	support for some built-in functions vectorized on Power10.
> 
> Say which, not "some" please?
> 

Done.

>> +  machine_mode in_vmode = TYPE_MODE (type_in);
>> +  machine_mode out_vmode = TYPE_MODE (type_out);
>> +
>> +  /* Power10 supported vectorized built-in functions.  */
>> +  if (TARGET_POWER10
>> +      && in_vmode == out_vmode
>> +      && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode))
>> +    {
>> +      machine_mode exp_mode = DImode;
>> +      machine_mode exp_vmode = V2DImode;
>> +      enum rs6000_builtins vname = RS6000_BUILTIN_COUNT;
> 
> "name"?  This should be "bif" or similar?
> 

Updated with name.

>> +      switch (fn)
>> +	{
>> +	case MISC_BUILTIN_DIVWE:
>> +	case MISC_BUILTIN_DIVWEU:
>> +	  exp_mode = SImode;
>> +	  exp_vmode = V4SImode;
>> +	  if (fn == MISC_BUILTIN_DIVWE)
>> +	    vname = P10V_BUILTIN_DIVES_V4SI;
>> +	  else
>> +	    vname = P10V_BUILTIN_DIVEU_V4SI;
>> +	  break;
>> +	case MISC_BUILTIN_DIVDE:
>> +	case MISC_BUILTIN_DIVDEU:
>> +	  if (fn == MISC_BUILTIN_DIVDE)
>> +	    vname = P10V_BUILTIN_DIVES_V2DI;
>> +	  else
>> +	    vname = P10V_BUILTIN_DIVEU_V2DI;
>> +	  break;
> 
> All of the above should not be builtin functions really, they are all
> simple arithmetic :-(  They should not be UNSPECs either, on RTL level.
> They can and should be optimised in real code as well.  Oh well.
> 
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
>> @@ -0,0 +1,12 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target lp64 } */
> 
> Please add a comment what this is needed for?  "We scan for dive*d" is
> enough, but without anything, it takes time to figure this out.
> 

Done, same for below requests on lp64 commentary.

>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
>> @@ -0,0 +1,53 @@
>> +/* { dg-do run } */
>> +/* { dg-require-effective-target lp64 } */
> 
> Same here.  I suppose this uses builtins that do not exist on 32-bit?
> 

Yeah, those bifs which are guarded with lp64 in their cases are only
supported on 64-bit environment.

>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
>> @@ -0,0 +1,45 @@
>> +/* { dg-do run } */
>> +/* { dg-require-effective-target lp64 } */
> 
> And another.
> 
>> +#define CHECK(name)                                                           \
>> +  __attribute__ ((optimize (1))) void check_##name ()                         \
> 
> What is the attribute for, btw?  It seems fragile, but perhaps I do not
> understand the intention.
> 
> 

It's to stop compiler from optimizing check functions with vectorization,
since the test point is to compare the results between scalar and vectorized
version.

> Okay for trunk with whose lp64 things improved.  Thanks!
> 

Thanks, v2 has been attached by addressing Bill's and your comments.  :)


BR,
Kewen
-----
gcc/ChangeLog:

	* config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add
	support for built-in functions MISC_BUILTIN_DIVWE, MISC_BUILTIN_DIVWEU,
	MISC_BUILTIN_DIVDE, MISC_BUILTIN_DIVDEU, P10_BUILTIN_CFUGED,
	P10_BUILTIN_CNTLZDM, P10_BUILTIN_CNTTZDM, P10_BUILTIN_PDEPD and
	P10_BUILTIN_PEXTD on Power10.
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 279f00cc648..a8b3175ed50 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -5785,6 +5785,59 @@ rs6000_builtin_md_vectorized_function (tree fndecl, tree type_out,
     default:
       break;
     }
+
+  machine_mode in_vmode = TYPE_MODE (type_in);
+  machine_mode out_vmode = TYPE_MODE (type_out);
+
+  /* Power10 supported vectorized built-in functions.  */
+  if (TARGET_POWER10
+      && in_vmode == out_vmode
+      && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode))
+    {
+      machine_mode exp_mode = DImode;
+      machine_mode exp_vmode = V2DImode;
+      enum rs6000_builtins name;
+      switch (fn)
+	{
+	case MISC_BUILTIN_DIVWE:
+	case MISC_BUILTIN_DIVWEU:
+	  exp_mode = SImode;
+	  exp_vmode = V4SImode;
+	  if (fn == MISC_BUILTIN_DIVWE)
+	    name = P10V_BUILTIN_DIVES_V4SI;
+	  else
+	    name = P10V_BUILTIN_DIVEU_V4SI;
+	  break;
+	case MISC_BUILTIN_DIVDE:
+	case MISC_BUILTIN_DIVDEU:
+	  if (fn == MISC_BUILTIN_DIVDE)
+	    name = P10V_BUILTIN_DIVES_V2DI;
+	  else
+	    name = P10V_BUILTIN_DIVEU_V2DI;
+	  break;
+	case P10_BUILTIN_CFUGED:
+	  name = P10V_BUILTIN_VCFUGED;
+	  break;
+	case P10_BUILTIN_CNTLZDM:
+	  name = P10V_BUILTIN_VCLZDM;
+	  break;
+	case P10_BUILTIN_CNTTZDM:
+	  name = P10V_BUILTIN_VCTZDM;
+	  break;
+	case P10_BUILTIN_PDEPD:
+	  name = P10V_BUILTIN_VPDEPD;
+	  break;
+	case P10_BUILTIN_PEXTD:
+	  name = P10V_BUILTIN_VPEXTD;
+	  break;
+	default:
+	  return NULL_TREE;
+	}
+
+      if (in_mode == exp_mode && in_vmode == exp_vmode)
+	return rs6000_builtin_decls[name];
+    }
+
   return NULL_TREE;
 }
 
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
new file mode 100644
index 00000000000..84f1b0a88f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if signed/unsigned int extended divisions get vectorized.  */
+
+#include "dive-vectorize-1.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { scan-assembler-times {\mvdivesw\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvdiveuw\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
new file mode 100644
index 00000000000..119f637b46b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
@@ -0,0 +1,22 @@
+#define N 128
+
+typedef signed int si;
+typedef unsigned int ui;
+
+si si_a[N], si_b[N], si_c[N];
+ui ui_a[N], ui_b[N], ui_c[N];
+
+__attribute__ ((noipa)) void
+test_divwe ()
+{
+  for (int i = 0; i < N; i++)
+    si_c[i] = __builtin_divwe (si_a[i], si_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_divweu ()
+{
+  for (int i = 0; i < N; i++)
+    ui_c[i] = __builtin_divweu (ui_a[i], ui_b[i]);
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
new file mode 100644
index 00000000000..13d768d748c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* We scan for vdive*d which are only supported on 64-bit env.  */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if signed/unsigned long long extended divisions get vectorized.  */
+
+#include "dive-vectorize-2.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { scan-assembler-times {\mvdivesd\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvdiveud\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
new file mode 100644
index 00000000000..1cab56b2e0b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
@@ -0,0 +1,22 @@
+#define N 128
+
+typedef signed long long sLL;
+typedef unsigned long long uLL;
+
+sLL sll_a[N], sll_b[N], sll_c[N];
+uLL ull_a[N], ull_b[N], ull_c[N];
+
+__attribute__ ((noipa)) void
+test_divde ()
+{
+  for (int i = 0; i < N; i++)
+    sll_c[i] = __builtin_divde (sll_a[i], sll_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_divdeu ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_divdeu (ull_a[i], ull_b[i]);
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
new file mode 100644
index 00000000000..1d5cbaa9f9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
@@ -0,0 +1,52 @@
+/* { dg-do run } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "dive-vectorize-1.h"
+
+/* Check if test cases with signed/unsigned int extended division
+   vectorization run successfully.  */
+
+__attribute__ ((optimize (1))) void
+check_divwe ()
+{
+  test_divwe ();
+  for (int i = 0; i < N; i++)
+    {
+      si exp = __builtin_divwe (si_a[i], si_b[i]);
+      if (exp != si_c[i])
+	__builtin_abort ();
+    }
+}
+
+__attribute__ ((optimize (1))) void
+check_divweu ()
+{
+  test_divweu ();
+  for (int i = 0; i < N; i++)
+    {
+      ui exp = __builtin_divweu (ui_a[i], ui_b[i]);
+      if (exp != ui_c[i])
+	__builtin_abort ();
+    }
+}
+
+int
+main ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      si_a[i] = 0x10 * (i * 3 + 2);
+      si_b[i] = 0x7890 * (i * 3 + 1);
+      ui_a[i] = 0x234 * (i * 11 + 3) - 0xcd * (i * 5 - 7);
+      ui_b[i] = 0x6078 * (i * 7 + 3) + 0xef * (i * 7 - 11);
+      if (si_b[i] == 0 || ui_b[i] == 0)
+	__builtin_abort ();
+    }
+
+  check_divwe ();
+  check_divweu ();
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
new file mode 100644
index 00000000000..d25111c5c1f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
@@ -0,0 +1,54 @@
+/* { dg-do run } */
+/* The checked bifs are only supported on 64-bit env.  */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "dive-vectorize-2.h"
+
+/* Check if test cases with signed/unsigned int extended division
+   vectorization run successfully.  */
+
+__attribute__ ((optimize (1))) void
+check_divde ()
+{
+  test_divde ();
+  for (int i = 0; i < N; i++)
+    {
+      sLL exp = __builtin_divde (sll_a[i], sll_b[i]);
+      if (exp != sll_c[i])
+	__builtin_abort ();
+    }
+}
+
+__attribute__ ((optimize (1))) void
+check_divdeu ()
+{
+  test_divdeu ();
+  for (int i = 0; i < N; i++)
+    {
+      uLL exp = __builtin_divdeu (ull_a[i], ull_b[i]);
+      if (exp != ull_c[i])
+	__builtin_abort ();
+    }
+}
+
+int
+main ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      sll_a[i] = 0x102 * (i * 3 + 2);
+      sll_b[i] = 0x789ab * (i * 3 + 1);
+      ull_a[i] = 0x2345 * (i * 11 + 3) - 0xcd1 * (i * 5 - 7);
+      ull_b[i] = 0x6078e * (i * 7 + 3) + 0xefa * (i * 7 - 11);
+      if (sll_b[i] == 0 || ull_b[i] == 0)
+	__builtin_abort ();
+    }
+
+  check_divde ();
+  check_divdeu ();
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
new file mode 100644
index 00000000000..fdbb9ebd61b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* What we scan for are only supported on 64-bit env.  */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if some Power10 built-in functions get vectorized.  */
+
+#include "p10-bifs-vectorize-1.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 5 "vect" } } */
+/* { dg-final { scan-assembler-times {\mvcfuged\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvclzdm\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvctzdm\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvpdepd\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvpextd\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
new file mode 100644
index 00000000000..80b7aacf810
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
@@ -0,0 +1,40 @@
+#define N 32
+
+typedef unsigned long long uLL;
+uLL ull_a[N], ull_b[N], ull_c[N];
+
+__attribute__ ((noipa)) void
+test_cfuged ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_cfuged (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_cntlzdm ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_cntlzdm (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_cnttzdm ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_cnttzdm (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_pdepd ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_pdepd (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_pextd ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_pextd (ull_a[i], ull_b[i]);
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
new file mode 100644
index 00000000000..fbaff14b5cd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* The checked bifs are only supported on 64-bit env.  */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "p10-bifs-vectorize-1.h"
+
+/* Check if vectorized built-in functions run expectedly.  */
+
+#define CHECK(name)                                                           \
+  __attribute__ ((optimize (1))) void check_##name ()                         \
+  {                                                                           \
+    test_##name ();                                                           \
+    for (int i = 0; i < N; i++)                                               \
+      {                                                                       \
+	uLL exp = __builtin_##name (ull_a[i], ull_b[i]);                      \
+	if (exp != ull_c[i])                                                  \
+	  __builtin_abort ();                                                 \
+      }                                                                       \
+  }
+
+CHECK (cfuged)
+CHECK (cntlzdm)
+CHECK (cnttzdm)
+CHECK (pdepd)
+CHECK (pextd)
+
+int
+main ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      ull_a[i] = 0x789a * (i * 11 - 5) - 0xcd1 * (i * 5 - 7);
+      ull_b[i] = 0xfedc * (i * 7 + 3) + 0x467 * (i * 7 - 11);
+    }
+
+  check_cfuged ();
+  check_cntlzdm ();
+  check_cnttzdm ();
+  check_pdepd ();
+  check_pextd ();
+
+  return 0;
+}
+
Kewen.Lin Aug. 13, 2021, 3:18 a.m. UTC | #7
on 2021/8/12 下午11:51, Segher Boessenkool wrote:
> On Thu, Aug 12, 2021 at 10:10:10AM +0800, Kewen.Lin wrote:
>>> +      enum rs6000_builtins vname = RS6000_BUILTIN_COUNT;
>>>
>>> Using this as a flag value looks unnecessary.  Is this just being done to silence a warning?
>>
>> Good question!  I didn't notice there is a warning or not, just get used to initializing variable
>> with one suitable value if possible.  If you don't mind, may I still keep it?  Since if some
>> future codes use vname in a path where it's not assigned, one explicitly wrong enum (bif) seems
>> better than a random one.  Or will this mentioned possibility definitely never happen since the
>> current uninitialized variables detection and warning scheme is robust and should not worry about
>> that completely?
> 
> It is a bad idea to initialise things unnecessary: it hinders many
> optimisations, but much more importantly, it silences warnings without
> fixing the problem.
> 

OK, I've made it uninitialized in v2. :-)  I believe the context here is simple
and the uninit-ed var detector can easily catch and warn the bad thing in future.

Sorry for chasing dead ends, I don't follow how it can hinder optimizations here,
IIUC it would be optimized as a dead store here?  As to the warning, although
there is no warning, I'd expect it causes ICE since the init-ed bif name isn't
reasonable for generation.  Wouldn't it be better than warning?  Sometimes we
don't have a proper value for initialization, I agree it should be better to
just leave it be, but IMHO it isn't the case here.  :)


BR,
Kewen

>>> +      if (vname != RS6000_BUILTIN_COUNT
>>>
>>> Check is not necessary, as you will have returned by now in that case.
>>
>> Thanks for catching, I put break for "default" initially, didn't noticed the following condition
>> need an adjustment after updating it to early return.  Will fix it.
> 
> Thanks :-)
> 
> 
> Segher
>
Li, Pan2 via Gcc-patches Aug. 24, 2021, 7:42 p.m. UTC | #8
Hi Kewen,

Sorry this sat in my queue for so long.  It looks like you addressed all 
of our concerns, so LGTM -- recommend maintainers approve.

Thanks!
Bill

On 8/12/21 9:34 PM, Kewen.Lin wrote:
> Hi Segher,
>
> Thanks for the review!
>
> on 2021/8/12 下午11:10, Segher Boessenkool wrote:
>> Hi!
>>
>> On Wed, Aug 11, 2021 at 02:56:11PM +0800, Kewen.Lin wrote:
>>> 	* config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add
>>> 	support for some built-in functions vectorized on Power10.
>> Say which, not "some" please?
>>
> Done.
>
>>> +  machine_mode in_vmode = TYPE_MODE (type_in);
>>> +  machine_mode out_vmode = TYPE_MODE (type_out);
>>> +
>>> +  /* Power10 supported vectorized built-in functions.  */
>>> +  if (TARGET_POWER10
>>> +      && in_vmode == out_vmode
>>> +      && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode))
>>> +    {
>>> +      machine_mode exp_mode = DImode;
>>> +      machine_mode exp_vmode = V2DImode;
>>> +      enum rs6000_builtins vname = RS6000_BUILTIN_COUNT;
>> "name"?  This should be "bif" or similar?
>>
> Updated with name.
>
>>> +      switch (fn)
>>> +	{
>>> +	case MISC_BUILTIN_DIVWE:
>>> +	case MISC_BUILTIN_DIVWEU:
>>> +	  exp_mode = SImode;
>>> +	  exp_vmode = V4SImode;
>>> +	  if (fn == MISC_BUILTIN_DIVWE)
>>> +	    vname = P10V_BUILTIN_DIVES_V4SI;
>>> +	  else
>>> +	    vname = P10V_BUILTIN_DIVEU_V4SI;
>>> +	  break;
>>> +	case MISC_BUILTIN_DIVDE:
>>> +	case MISC_BUILTIN_DIVDEU:
>>> +	  if (fn == MISC_BUILTIN_DIVDE)
>>> +	    vname = P10V_BUILTIN_DIVES_V2DI;
>>> +	  else
>>> +	    vname = P10V_BUILTIN_DIVEU_V2DI;
>>> +	  break;
>> All of the above should not be builtin functions really, they are all
>> simple arithmetic :-(  They should not be UNSPECs either, on RTL level.
>> They can and should be optimised in real code as well.  Oh well.
>>
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
>>> @@ -0,0 +1,12 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target lp64 } */
>> Please add a comment what this is needed for?  "We scan for dive*d" is
>> enough, but without anything, it takes time to figure this out.
>>
> Done, same for below requests on lp64 commentary.
>
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
>>> @@ -0,0 +1,53 @@
>>> +/* { dg-do run } */
>>> +/* { dg-require-effective-target lp64 } */
>> Same here.  I suppose this uses builtins that do not exist on 32-bit?
>>
> Yeah, those bifs which are guarded with lp64 in their cases are only
> supported on 64-bit environment.
>
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
>>> @@ -0,0 +1,45 @@
>>> +/* { dg-do run } */
>>> +/* { dg-require-effective-target lp64 } */
>> And another.
>>
>>> +#define CHECK(name)                                                           \
>>> +  __attribute__ ((optimize (1))) void check_##name ()                         \
>> What is the attribute for, btw?  It seems fragile, but perhaps I do not
>> understand the intention.
>>
>>
> It's to stop compiler from optimizing check functions with vectorization,
> since the test point is to compare the results between scalar and vectorized
> version.
>
>> Okay for trunk with whose lp64 things improved.  Thanks!
>>
> Thanks, v2 has been attached by addressing Bill's and your comments.  :)
>
>
> BR,
> Kewen
> -----
> gcc/ChangeLog:
>
> 	* config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add
> 	support for built-in functions MISC_BUILTIN_DIVWE, MISC_BUILTIN_DIVWEU,
> 	MISC_BUILTIN_DIVDE, MISC_BUILTIN_DIVDEU, P10_BUILTIN_CFUGED,
> 	P10_BUILTIN_CNTLZDM, P10_BUILTIN_CNTTZDM, P10_BUILTIN_PDEPD and
> 	P10_BUILTIN_PEXTD on Power10.
Segher Boessenkool Aug. 24, 2021, 9:56 p.m. UTC | #9
On Fri, Aug 13, 2021 at 11:18:46AM +0800, Kewen.Lin wrote:
> on 2021/8/12 下午11:51, Segher Boessenkool wrote:
> > It is a bad idea to initialise things unnecessary: it hinders many
> > optimisations, but much more importantly, it silences warnings without
> > fixing the problem.
> 
> OK, I've made it uninitialized in v2. :-)  I believe the context here is simple
> and the uninit-ed var detector can easily catch and warn the bad thing in future.

And those warnings generally are for "MAY BE used uninitialised",
anyway.  They will warn :-)

(When the warning says "IS used uninitialised" the compiler should be
sure about that!)

> Sorry for chasing dead ends, I don't follow how it can hinder optimizations here,
> IIUC it would be optimized as a dead store here?

When the compiler is not sure if something needs initialisation or not
it cannot remove actually superfluous initialisation.  Such cases are
always too complicated code, so that should be fixed, not silenced :-)

> As to the warning, although
> there is no warning, I'd expect it causes ICE since the init-ed bif name isn't
> reasonable for generation.  Wouldn't it be better than warning?  Sometimes we
> don't have a proper value for initialization, I agree it should be better to
> just leave it be, but IMHO it isn't the case here.  :)

ICEing is always wrong.  A user should never see an ICE (not counting
"sorry"s as ICEs here -- not that those are good, but they tell the user
exactly what is going on).


Segher
Segher Boessenkool Aug. 24, 2021, 10:14 p.m. UTC | #10
Hi!

On Fri, Aug 13, 2021 at 10:34:46AM +0800, Kewen.Lin wrote:
> on 2021/8/12 下午11:10, Segher Boessenkool wrote:
> >> +      && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode))
> >> +    {
> >> +      machine_mode exp_mode = DImode;
> >> +      machine_mode exp_vmode = V2DImode;
> >> +      enum rs6000_builtins vname = RS6000_BUILTIN_COUNT;
> > 
> > "name"?  This should be "bif" or similar?
> 
> Updated with name.

No, I meant "name" has no meaning other than it is wrong here :-)

It is an enum for which builtin to use here.  It has nothing to do with
a name.  So it could be "enum rs6000_builtins bif" or whatever you want;
short variable names are *good*, for many reasons, but they should not
egregiously lie :-)

> >> +/* { dg-do run } */
> >> +/* { dg-require-effective-target lp64 } */
> > 
> > Same here.  I suppose this uses builtins that do not exist on 32-bit?
> 
> Yeah, those bifs which are guarded with lp64 in their cases are only
> supported on 64-bit environment.

It is a pity we cannot use "powerpc64" here (that selector does not test
what you would/could/should hope it tests...  Maybe someone can fix it
some day?  The only real blocker to that is fixing up the current users
of it, the rest is easy).

Expanding a bit...  You would expect (well, I do!  I did patches
expecting this several times) this to mean "powerpc64_ok", but it in
fact means "powerpc64_hw".  Maybe we should have selectors with those
two names, and get rid of the current "powerpc64"?

> >> +#define CHECK(name)                                                           \
> >> +  __attribute__ ((optimize (1))) void check_##name ()                         \
> > 
> > What is the attribute for, btw?  It seems fragile, but perhaps I do not
> > understand the intention.
> 
> It's to stop compiler from optimizing check functions with vectorization,
> since the test point is to compare the results between scalar and vectorized
> version.

So, add a comment for this as well please.

In general, in testcases you can do the dirtiest things, no problems at
all, just document what you do why :-)

> Thanks, v2 has been attached by addressing Bill's and your comments.  :)

Looks good.  Just fix that "name" thing, and it is okay for trunk.
Thanks!


Segher
Kewen.Lin Aug. 25, 2021, 4:57 a.m. UTC | #11
on 2021/8/25 上午5:56, Segher Boessenkool wrote:
> On Fri, Aug 13, 2021 at 11:18:46AM +0800, Kewen.Lin wrote:
>> on 2021/8/12 下午11:51, Segher Boessenkool wrote:
>>> It is a bad idea to initialise things unnecessary: it hinders many
>>> optimisations, but much more importantly, it silences warnings without
>>> fixing the problem.
>>
>> OK, I've made it uninitialized in v2. :-)  I believe the context here is simple
>> and the uninit-ed var detector can easily catch and warn the bad thing in future.
> 
> And those warnings generally are for "MAY BE used uninitialised",
> anyway.  They will warn :-)
> 
> (When the warning says "IS used uninitialised" the compiler should be
> sure about that!)
> 
>> Sorry for chasing dead ends, I don't follow how it can hinder optimizations here,
>> IIUC it would be optimized as a dead store here?
> 
> When the compiler is not sure if something needs initialisation or not
> it cannot remove actually superfluous initialisation.  Such cases are
> always too complicated code, so that should be fixed, not silenced :-)
> 

aha, you meant complicated code, got it.  :)

>> As to the warning, although
>> there is no warning, I'd expect it causes ICE since the init-ed bif name isn't
>> reasonable for generation.  Wouldn't it be better than warning?  Sometimes we
>> don't have a proper value for initialization, I agree it should be better to
>> just leave it be, but IMHO it isn't the case here.  :)
> 
> ICEing is always wrong.  A user should never see an ICE (not counting
> "sorry"s as ICEs here -- not that those are good, but they tell the user
> exactly what is going on).
> 

Yeah, but here I was expecting the ICE happens when GCC developers are testing
the newly added bif supports.  :)


BR,
Kewen
Kewen.Lin Aug. 25, 2021, 5:17 a.m. UTC | #12
on 2021/8/25 上午6:14, Segher Boessenkool wrote:
> Hi!
> 
> On Fri, Aug 13, 2021 at 10:34:46AM +0800, Kewen.Lin wrote:
>> on 2021/8/12 下午11:10, Segher Boessenkool wrote:
>>>> +      && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode))
>>>> +    {
>>>> +      machine_mode exp_mode = DImode;
>>>> +      machine_mode exp_vmode = V2DImode;
>>>> +      enum rs6000_builtins vname = RS6000_BUILTIN_COUNT;
>>>
>>> "name"?  This should be "bif" or similar?
>>
>> Updated with name.
> 
> No, I meant "name" has no meaning other than it is wrong here :-)
> 
> It is an enum for which builtin to use here.  It has nothing to do with
> a name.  So it could be "enum rs6000_builtins bif" or whatever you want;
> short variable names are *good*, for many reasons, but they should not
> egregiously lie :-)
> 

Oops, sorry for the misunderstanding, will update it with "bif".

>>>> +/* { dg-do run } */
>>>> +/* { dg-require-effective-target lp64 } */
>>>
>>> Same here.  I suppose this uses builtins that do not exist on 32-bit?
>>
>> Yeah, those bifs which are guarded with lp64 in their cases are only
>> supported on 64-bit environment.
> 
> It is a pity we cannot use "powerpc64" here (that selector does not test
> what you would/could/should hope it tests...  Maybe someone can fix it
> some day?  The only real blocker to that is fixing up the current users
> of it, the rest is easy).
> 

If I got it right, there is only one test case using this selector:

gcc/testsuite/gcc.target/powerpc/darwin-longlong.c

The selector checking looks interesting to me, it has special option with
"-mcpu=G5" and seems to exclude all "aix" (didn't verify it yet).

I guess there still would be some efforts to re-direct those existing
cases which should use new "powerpc64_ok" instead of "lp64"?

> Expanding a bit...  You would expect (well, I do!  I did patches
> expecting this several times) this to mean "powerpc64_ok", but it in
> fact means "powerpc64_hw".  Maybe we should have selectors with those
> two names, and get rid of the current "powerpc64"?
> 

Yeah, it sounds good to have those two names just like some existing.

>>>> +#define CHECK(name)                                                           \
>>>> +  __attribute__ ((optimize (1))) void check_##name ()                         \
>>>
>>> What is the attribute for, btw?  It seems fragile, but perhaps I do not
>>> understand the intention.
>>
>> It's to stop compiler from optimizing check functions with vectorization,
>> since the test point is to compare the results between scalar and vectorized
>> version.
> 
> So, add a comment for this as well please.
> 
> In general, in testcases you can do the dirtiest things, no problems at
> all, just document what you do why :-)
> 

OK, will add.

>> Thanks, v2 has been attached by addressing Bill's and your comments.  :)
> 
> Looks good.  Just fix that "name" thing, and it is okay for trunk.
> Thanks!
> 

Thanks for the review!

BR,
Kewen
diff mbox series

Patch

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 279f00cc648..3eac1d05101 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -5785,6 +5785,61 @@  rs6000_builtin_md_vectorized_function (tree fndecl, tree type_out,
     default:
       break;
     }
+
+  machine_mode in_vmode = TYPE_MODE (type_in);
+  machine_mode out_vmode = TYPE_MODE (type_out);
+
+  /* Power10 supported vectorized built-in functions.  */
+  if (TARGET_POWER10
+      && in_vmode == out_vmode
+      && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode))
+    {
+      machine_mode exp_mode = DImode;
+      machine_mode exp_vmode = V2DImode;
+      enum rs6000_builtins vname = RS6000_BUILTIN_COUNT;
+      switch (fn)
+	{
+	case MISC_BUILTIN_DIVWE:
+	case MISC_BUILTIN_DIVWEU:
+	  exp_mode = SImode;
+	  exp_vmode = V4SImode;
+	  if (fn == MISC_BUILTIN_DIVWE)
+	    vname = P10V_BUILTIN_DIVES_V4SI;
+	  else
+	    vname = P10V_BUILTIN_DIVEU_V4SI;
+	  break;
+	case MISC_BUILTIN_DIVDE:
+	case MISC_BUILTIN_DIVDEU:
+	  if (fn == MISC_BUILTIN_DIVDE)
+	    vname = P10V_BUILTIN_DIVES_V2DI;
+	  else
+	    vname = P10V_BUILTIN_DIVEU_V2DI;
+	  break;
+	case P10_BUILTIN_CFUGED:
+	  vname = P10V_BUILTIN_VCFUGED;
+	  break;
+	case P10_BUILTIN_CNTLZDM:
+	  vname = P10V_BUILTIN_VCLZDM;
+	  break;
+	case P10_BUILTIN_CNTTZDM:
+	  vname = P10V_BUILTIN_VCTZDM;
+	  break;
+	case P10_BUILTIN_PDEPD:
+	  vname = P10V_BUILTIN_VPDEPD;
+	  break;
+	case P10_BUILTIN_PEXTD:
+	  vname = P10V_BUILTIN_VPEXTD;
+	  break;
+	default:
+	  return NULL_TREE;
+	}
+
+      if (vname != RS6000_BUILTIN_COUNT
+	  && in_mode == exp_mode
+	  && in_vmode == exp_vmode)
+	return rs6000_builtin_decls[vname];
+    }
+
   return NULL_TREE;
 }
 
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
new file mode 100644
index 00000000000..84f1b0a88f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if signed/unsigned int extended divisions get vectorized.  */
+
+#include "dive-vectorize-1.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { scan-assembler-times {\mvdivesw\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvdiveuw\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
new file mode 100644
index 00000000000..119f637b46b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h
@@ -0,0 +1,22 @@ 
+#define N 128
+
+typedef signed int si;
+typedef unsigned int ui;
+
+si si_a[N], si_b[N], si_c[N];
+ui ui_a[N], ui_b[N], ui_c[N];
+
+__attribute__ ((noipa)) void
+test_divwe ()
+{
+  for (int i = 0; i < N; i++)
+    si_c[i] = __builtin_divwe (si_a[i], si_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_divweu ()
+{
+  for (int i = 0; i < N; i++)
+    ui_c[i] = __builtin_divweu (ui_a[i], ui_b[i]);
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
new file mode 100644
index 00000000000..4db0085562f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if signed/unsigned long long extended divisions get vectorized.  */
+
+#include "dive-vectorize-2.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { scan-assembler-times {\mvdivesd\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvdiveud\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
new file mode 100644
index 00000000000..1cab56b2e0b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h
@@ -0,0 +1,22 @@ 
+#define N 128
+
+typedef signed long long sLL;
+typedef unsigned long long uLL;
+
+sLL sll_a[N], sll_b[N], sll_c[N];
+uLL ull_a[N], ull_b[N], ull_c[N];
+
+__attribute__ ((noipa)) void
+test_divde ()
+{
+  for (int i = 0; i < N; i++)
+    sll_c[i] = __builtin_divde (sll_a[i], sll_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_divdeu ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_divdeu (ull_a[i], ull_b[i]);
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
new file mode 100644
index 00000000000..1d5cbaa9f9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c
@@ -0,0 +1,52 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "dive-vectorize-1.h"
+
+/* Check if test cases with signed/unsigned int extended division
+   vectorization run successfully.  */
+
+__attribute__ ((optimize (1))) void
+check_divwe ()
+{
+  test_divwe ();
+  for (int i = 0; i < N; i++)
+    {
+      si exp = __builtin_divwe (si_a[i], si_b[i]);
+      if (exp != si_c[i])
+	__builtin_abort ();
+    }
+}
+
+__attribute__ ((optimize (1))) void
+check_divweu ()
+{
+  test_divweu ();
+  for (int i = 0; i < N; i++)
+    {
+      ui exp = __builtin_divweu (ui_a[i], ui_b[i]);
+      if (exp != ui_c[i])
+	__builtin_abort ();
+    }
+}
+
+int
+main ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      si_a[i] = 0x10 * (i * 3 + 2);
+      si_b[i] = 0x7890 * (i * 3 + 1);
+      ui_a[i] = 0x234 * (i * 11 + 3) - 0xcd * (i * 5 - 7);
+      ui_b[i] = 0x6078 * (i * 7 + 3) + 0xef * (i * 7 - 11);
+      if (si_b[i] == 0 || ui_b[i] == 0)
+	__builtin_abort ();
+    }
+
+  check_divwe ();
+  check_divweu ();
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
new file mode 100644
index 00000000000..921b07b3f1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c
@@ -0,0 +1,53 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "dive-vectorize-2.h"
+
+/* Check if test cases with signed/unsigned int extended division
+   vectorization run successfully.  */
+
+__attribute__ ((optimize (1))) void
+check_divde ()
+{
+  test_divde ();
+  for (int i = 0; i < N; i++)
+    {
+      sLL exp = __builtin_divde (sll_a[i], sll_b[i]);
+      if (exp != sll_c[i])
+	__builtin_abort ();
+    }
+}
+
+__attribute__ ((optimize (1))) void
+check_divdeu ()
+{
+  test_divdeu ();
+  for (int i = 0; i < N; i++)
+    {
+      uLL exp = __builtin_divdeu (ull_a[i], ull_b[i]);
+      if (exp != ull_c[i])
+	__builtin_abort ();
+    }
+}
+
+int
+main ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      sll_a[i] = 0x102 * (i * 3 + 2);
+      sll_b[i] = 0x789ab * (i * 3 + 1);
+      ull_a[i] = 0x2345 * (i * 11 + 3) - 0xcd1 * (i * 5 - 7);
+      ull_b[i] = 0x6078e * (i * 7 + 3) + 0xefa * (i * 7 - 11);
+      if (sll_b[i] == 0 || ull_b[i] == 0)
+	__builtin_abort ();
+    }
+
+  check_divde ();
+  check_divdeu ();
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
new file mode 100644
index 00000000000..9b8b3642ead
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
+
+/* Test if some Power10 built-in functions get vectorized.  */
+
+#include "p10-bifs-vectorize-1.h"
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 5 "vect" } } */
+/* { dg-final { scan-assembler-times {\mvcfuged\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvclzdm\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvctzdm\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvpdepd\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mvpextd\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
new file mode 100644
index 00000000000..80b7aacf810
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h
@@ -0,0 +1,40 @@ 
+#define N 32
+
+typedef unsigned long long uLL;
+uLL ull_a[N], ull_b[N], ull_c[N];
+
+__attribute__ ((noipa)) void
+test_cfuged ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_cfuged (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_cntlzdm ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_cntlzdm (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_cnttzdm ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_cnttzdm (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_pdepd ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_pdepd (ull_a[i], ull_b[i]);
+}
+
+__attribute__ ((noipa)) void
+test_pextd ()
+{
+  for (int i = 0; i < N; i++)
+    ull_c[i] = __builtin_pextd (ull_a[i], ull_b[i]);
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
new file mode 100644
index 00000000000..4b3b1165663
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c
@@ -0,0 +1,45 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include "p10-bifs-vectorize-1.h"
+
+/* Check if vectorized built-in functions run expectedly.  */
+
+#define CHECK(name)                                                           \
+  __attribute__ ((optimize (1))) void check_##name ()                         \
+  {                                                                           \
+    test_##name ();                                                           \
+    for (int i = 0; i < N; i++)                                               \
+      {                                                                       \
+	uLL exp = __builtin_##name (ull_a[i], ull_b[i]);                      \
+	if (exp != ull_c[i])                                                  \
+	  __builtin_abort ();                                                 \
+      }                                                                       \
+  }
+
+CHECK (cfuged)
+CHECK (cntlzdm)
+CHECK (cnttzdm)
+CHECK (pdepd)
+CHECK (pextd)
+
+int
+main ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      ull_a[i] = 0x789a * (i * 11 - 5) - 0xcd1 * (i * 5 - 7);
+      ull_b[i] = 0xfedc * (i * 7 + 3) + 0x467 * (i * 7 - 11);
+    }
+
+  check_cfuged ();
+  check_cntlzdm ();
+  check_cnttzdm ();
+  check_pdepd ();
+  check_pextd ();
+
+  return 0;
+}
+