diff mbox

, PR 71805, Fix PowerPC ISA 3.0 xxperm/xxpermr usage

Message ID 20160712042812.GA26679@ibm-tiger.the-meissners.org
State New
Headers show

Commit Message

Michael Meissner July 12, 2016, 4:28 a.m. UTC
When I initially implemented the xxperm support in the ISA 3.0 extensions, I
misread the manual.  The xxperm (and xxpermr) instruction logically should have
4 arguments, like vperm/vpermr (ouput, 2 inputs that provide the bytes, and the
permute register), but due to encoding issues, it only has 3 VSX arguments.
The output register must be the same as the second input register.  However, I
misread the manual as saying that output be the same as the first input
register.

The test gcc.dg/vect/pr45752.c sets the parameter tree-reassoc-width=1, which
causes the test to be vectorized.  If you don't override the reassociation
width or set it to 2, the code is not vectorized.  When the code is vectorized,
it generates a lot of permute instructions.  This means, the register allocator
runs out of Altivec registers, and generates several xxperm instructions.

In this patch, I have fixed the xxperm instructions to expect the second input
argument to be the same as the output argument instead of the first.  I also
fixed the xxpermr instruction, which suffers from the same problem.  I added
the test case, and made it specific to power9, in case the original test case
is changed.

I have done a bootstrap build with no regressions.  Can I install this patch
into both the trunk and GCC 6, which also shows the bug?

[gcc]
2016-07-11  Michael Meissner  <meissner@linux.vnet.ibm.com>

	PR target/71805
	* config/rs6000/altivec.md (altivec_vperm_<mode>_internal):
	The xxperm and xxpermr instructions require that the 2nd input
	operand overlap with the output operand, and not the 1st.
	(altivec_vperm_v8hiv16qi): Likewise.
	(altivec_vperm_<mode>_uns_internal): Likewise.
	(altivec_vpermr_<mode>_internal): Likewise.
	(vperm_v8hiv4si): Likewise.
	(vperm_v16qiv8hi): Likewise.

[gcc/testsuite]
2016-07-11  Michael Meissner  <meissner@linux.vnet.ibm.com>

	PR target/71805
	* gcc.target/powerpc/pr71805.c: New test.

Comments

Segher Boessenkool July 12, 2016, 12:09 p.m. UTC | #1
On Tue, Jul 12, 2016 at 12:28:12AM -0400, Michael Meissner wrote:
> In this patch, I have fixed the xxperm instructions to expect the second input
> argument to be the same as the output argument instead of the first.  I also
> fixed the xxpermr instruction, which suffers from the same problem.  I added
> the test case, and made it specific to power9, in case the original test case
> is changed.
> 
> I have done a bootstrap build with no regressions.  Can I install this patch
> into both the trunk and GCC 6, which also shows the bug?

Okay for both.  Thanks,


Segher
diff mbox

Patch

Index: gcc/config/rs6000/altivec.md
===================================================================
--- gcc/config/rs6000/altivec.md	(revision 238232)
+++ gcc/config/rs6000/altivec.md	(working copy)
@@ -1985,27 +1985,27 @@  (define_expand "altivec_vperm_<mode>"
 ;; Slightly prefer vperm, since the target does not overlap the source
 (define_insn "*altivec_vperm_<mode>_internal"
   [(set (match_operand:VM 0 "register_operand" "=v,?wo")
-	(unspec:VM [(match_operand:VM 1 "register_operand" "v,0")
-		    (match_operand:VM 2 "register_operand" "v,wo")
+	(unspec:VM [(match_operand:VM 1 "register_operand" "v,wo")
+		    (match_operand:VM 2 "register_operand" "v,0")
 		    (match_operand:V16QI 3 "register_operand" "v,wo")]
 		   UNSPEC_VPERM))]
   "TARGET_ALTIVEC"
   "@
    vperm %0,%1,%2,%3
-   xxperm %x0,%x2,%x3"
+   xxperm %x0,%x1,%x3"
   [(set_attr "type" "vecperm")
    (set_attr "length" "4")])
 
 (define_insn "altivec_vperm_v8hiv16qi"
   [(set (match_operand:V16QI 0 "register_operand" "=v,?wo")
-	(unspec:V16QI [(match_operand:V8HI 1 "register_operand" "v,0")
-   	               (match_operand:V8HI 2 "register_operand" "v,wo")
+	(unspec:V16QI [(match_operand:V8HI 1 "register_operand" "v,wo")
+   	               (match_operand:V8HI 2 "register_operand" "v,0")
 		       (match_operand:V16QI 3 "register_operand" "v,wo")]
 		   UNSPEC_VPERM))]
   "TARGET_ALTIVEC"
   "@
    vperm %0,%1,%2,%3
-   xxperm %x0,%x2,%x3"
+   xxperm %x0,%x1,%x3"
   [(set_attr "type" "vecperm")
    (set_attr "length" "4")])
 
@@ -2026,14 +2026,14 @@  (define_expand "altivec_vperm_<mode>_uns
 
 (define_insn "*altivec_vperm_<mode>_uns_internal"
   [(set (match_operand:VM 0 "register_operand" "=v,?wo")
-	(unspec:VM [(match_operand:VM 1 "register_operand" "v,0")
-		    (match_operand:VM 2 "register_operand" "v,wo")
+	(unspec:VM [(match_operand:VM 1 "register_operand" "v,wo")
+		    (match_operand:VM 2 "register_operand" "v,0")
 		    (match_operand:V16QI 3 "register_operand" "v,wo")]
 		   UNSPEC_VPERM_UNS))]
   "TARGET_ALTIVEC"
   "@
    vperm %0,%1,%2,%3
-   xxperm %x0,%x2,%x3"
+   xxperm %x0,%x1,%x3"
   [(set_attr "type" "vecperm")
    (set_attr "length" "4")])
 
@@ -2066,14 +2066,14 @@  (define_expand "vec_perm_constv16qi"
 
 (define_insn "*altivec_vpermr_<mode>_internal"
   [(set (match_operand:VM 0 "register_operand" "=v,?wo")
-	(unspec:VM [(match_operand:VM 1 "register_operand" "v,0")
-		    (match_operand:VM 2 "register_operand" "v,wo")
+	(unspec:VM [(match_operand:VM 1 "register_operand" "v,wo")
+		    (match_operand:VM 2 "register_operand" "v,0")
 		    (match_operand:V16QI 3 "register_operand" "v,wo")]
 		   UNSPEC_VPERMR))]
   "TARGET_P9_VECTOR"
   "@
    vpermr %0,%2,%1,%3
-   xxpermr %x0,%x2,%x3"
+   xxpermr %x0,%x1,%x3"
   [(set_attr "type" "vecperm")
    (set_attr "length" "4")])
 
@@ -2895,27 +2895,27 @@  (define_expand "vec_unpacks_lo_<VP_small
 
 (define_insn "vperm_v8hiv4si"
   [(set (match_operand:V4SI 0 "register_operand" "=v,?wo")
-        (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v,0")
-		      (match_operand:V4SI 2 "register_operand" "v,wo")
+        (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v,wo")
+		      (match_operand:V4SI 2 "register_operand" "v,0")
 		      (match_operand:V16QI 3 "register_operand" "v,wo")]
                   UNSPEC_VPERMSI))]
   "TARGET_ALTIVEC"
   "@
    vperm %0,%1,%2,%3
-   xxperm %x0,%x2,%x3"
+   xxperm %x0,%x1,%x3"
   [(set_attr "type" "vecperm")
    (set_attr "length" "4")])
 
 (define_insn "vperm_v16qiv8hi"
   [(set (match_operand:V8HI 0 "register_operand" "=v,?wo")
-        (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v,0")
-		      (match_operand:V8HI 2 "register_operand" "v,wo")
+        (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v,wo")
+		      (match_operand:V8HI 2 "register_operand" "v,0")
 		      (match_operand:V16QI 3 "register_operand" "v,wo")]
                   UNSPEC_VPERMHI))]
   "TARGET_ALTIVEC"
   "@
    vperm %0,%1,%2,%3
-   xxperm %x0,%x2,%x3"
+   xxperm %x0,%x1,%x3"
   [(set_attr "type" "vecperm")
    (set_attr "length" "4")])
 
Index: gcc/testsuite/gcc.target/powerpc/pr71805.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr71805.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/pr71805.c	(working copy)
@@ -0,0 +1,113 @@ 
+/* { dg-require-effective-target p9vector_hw } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
+/* { dg-options "-mcpu=power9 -O3 --param tree-reassoc-width=1" } */
+
+/* Originally from gcc.dg/vect/pr45752.c.  */
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void abort (void);
+extern void exit (int);
+#ifdef __cplusplus
+}
+#endif
+
+#define M00 100
+#define M10 216
+#define M20 23
+#define M30 237
+#define M40 437
+
+#define M01 1322
+#define M11 13
+#define M21 27271
+#define M31 2280
+#define M41 284
+
+#define M02 74
+#define M12 191
+#define M22 500
+#define M32 111
+#define M42 1114
+
+#define M03 134
+#define M13 117
+#define M23 11
+#define M33 771
+#define M43 71
+
+#define M04 334
+#define M14 147
+#define M24 115
+#define M34 7716
+#define M44 16
+
+#define N 20
+
+void foo (unsigned int *__restrict__ pInput,
+          unsigned int *__restrict__ pOutput,
+          unsigned int *__restrict__ pInput2,
+          unsigned int *__restrict__ pOutput2)
+{
+  unsigned int i, a, b, c, d, e;
+
+  for (i = 0; i < N / 5; i++)
+    {
+       a = *pInput++;
+       b = *pInput++;
+       c = *pInput++;
+       d = *pInput++;
+       e = *pInput++;
+
+       *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
+       *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
+       *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
+       *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
+       *pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
+
+
+       a = *pInput2++;
+       b = *pInput2++;
+       c = *pInput2++;
+       d = *pInput2++;
+       e = *pInput2++;
+
+       *pOutput2++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
+       *pOutput2++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
+       *pOutput2++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
+       *pOutput2++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
+       *pOutput2++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
+
+    }
+}
+
+int main (int argc, const char* argv[])
+{
+  unsigned int input[N], output[N], i, input2[N], output2[N];
+  unsigned int check_results[N]
+    = {3208, 1334, 28764, 35679, 2789, 13028, 4754, 168364, 91254, 12399, 
+    22848, 8174, 307964, 146829, 22009, 32668, 11594, 447564, 202404, 31619 };
+  unsigned int check_results2[N]
+    = {7136, 2702, 84604, 57909, 6633, 16956, 6122, 224204, 113484, 16243, 
+    26776, 9542, 363804, 169059, 25853, 36596, 12962, 503404, 224634, 35463 };
+
+  for (i = 0; i < N; i++)
+    {
+      input[i] = i%256;
+      input2[i] = i + 2;
+      output[i] = 0;
+      output2[i] = 0;
+      __asm__ volatile ("");
+    }
+
+  foo (input, output, input2, output2);
+
+  for (i = 0; i < N; i++)
+    if (output[i] != check_results[i]
+        || output2[i] != check_results2[i])
+      abort ();
+
+  return 0;
+}