diff mbox

Fix V64QImode multiplication with AVX512BW (PR target/70329)

Message ID 20160321201656.GG3017@tucnak.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek March 21, 2016, 8:16 p.m. UTC
Hi!

The ix86_expand_vecop_qihi function has been adjusted for AVX512* just
by changing i < 32 to i < 64 (where both were sometimes wasteful), but
for !full_interleave that is even wrong, swapping the second and third
quarter is something that works to undo AVX256 unpacks only,
where we want
0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62,
permutation.  But, for AVX512 we want
0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126
where the current trunk code has been producing
0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,96,98,100,102,104,106,108,110,80,82,84,86,88,90,92,94,112,114,116,118,120,122,124,126
instead.

Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
trunk?

2016-03-21  Jakub Jelinek  <jakub@redhat.com>

	PR target/70329
	* config/i386/i386.c (ix86_expand_vecop_qihi): Don't bother computing
	d.perm[i] for i >= d.nelt.  If not full_interleave, compute d.perm[i]
	in a way that works also for AVX512BW.

	* gcc.target/i386/avx512bw-pr70329-1.c: New test.
	* gcc.target/i386/avx512bw-pr70329-2.c: New test.


	Jakub

Comments

Kirill Yukhin March 22, 2016, 7:29 a.m. UTC | #1
Hi Jakub!
On 21 Mar 21:16, Jakub Jelinek wrote:
> The ix86_expand_vecop_qihi function has been adjusted for AVX512* just
> by changing i < 32 to i < 64 (where both were sometimes wasteful), but
> for !full_interleave that is even wrong, swapping the second and third
> quarter is something that works to undo AVX256 unpacks only,
> where we want
> 0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62,
> permutation.  But, for AVX512 we want
> 0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126
> where the current trunk code has been producing
> 0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,96,98,100,102,104,106,108,110,80,82,84,86,88,90,92,94,112,114,116,118,120,122,124,126
> instead.
> 
> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
> trunk?

Your putch is OK.
I'd only suggest to add a comment to this calculation:
+	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);

--
Thanks, K
diff mbox

Patch

--- gcc/config/i386/i386.c.jj	2016-03-11 17:37:38.000000000 +0100
+++ gcc/config/i386/i386.c	2016-03-21 15:49:37.322044578 +0100
@@ -51910,7 +51910,7 @@  ix86_expand_vecop_qihi (enum rtx_code co
     {
       /* For SSE2, we used an full interleave, so the desired
 	 results are in the even elements.  */
-      for (i = 0; i < 64; ++i)
+      for (i = 0; i < d.nelt; ++i)
 	d.perm[i] = i * 2;
     }
   else
@@ -51918,8 +51918,8 @@  ix86_expand_vecop_qihi (enum rtx_code co
       /* For AVX, the interleave used above was not cross-lane.  So the
 	 extraction is evens but with the second and third quarter swapped.
 	 Happily, that is even one insn shorter than even extraction.  */
-      for (i = 0; i < 64; ++i)
-	d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
+      for (i = 0; i < d.nelt; ++i)
+	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
     }
 
   ok = ix86_expand_vec_perm_const_1 (&d);
--- gcc/testsuite/gcc.target/i386/avx512bw-pr70329-1.c.jj	2016-03-21 16:06:44.073158614 +0100
+++ gcc/testsuite/gcc.target/i386/avx512bw-pr70329-1.c	2016-03-21 16:18:35.805538887 +0100
@@ -0,0 +1,27 @@ 
+/* PR target/70329 */
+/* { dg-do run } */
+/* { dg-options "-O0 -mavx512bw" } */
+/* { dg-require-effective-target avx512bw } */
+
+#define AVX512BW
+#include "avx512f-helper.h"
+
+typedef unsigned char A __attribute__ ((vector_size (64)));
+typedef unsigned int B __attribute__ ((vector_size (64)));
+
+unsigned __attribute__ ((noinline, noclone))
+foo (A a, A b, B c)
+{
+  a *= b;
+  c[1] += a[8];
+  return c[1];
+}
+
+void
+TEST (void)
+{
+  A a = (A) { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+  unsigned x = foo (a, a, (B) { 1, 2 });
+  if (x != 83)
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx512bw-pr70329-2.c.jj	2016-03-21 16:09:37.667811745 +0100
+++ gcc/testsuite/gcc.target/i386/avx512bw-pr70329-2.c	2016-03-21 16:17:38.726309545 +0100
@@ -0,0 +1,33 @@ 
+/* PR target/70329 */
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-vectorize -mavx512bw" } */
+/* { dg-require-effective-target avx512bw } */
+
+#define AVX512BW
+#include "avx512f-helper.h"
+
+__attribute__((noinline, noclone)) void
+foo (unsigned char *src1, unsigned char *src2, unsigned char *dst)
+{
+  int i;
+
+  for (i = 0; i < 64; i++)
+    dst[i] = (unsigned char) ((int) src1[i] * (int) src2[i]);
+}
+
+void
+TEST (void)
+{
+  unsigned char a[64], b[64], c[64];
+  int i;
+
+  for (i = 0; i < 64; i++)
+    {
+      a[i] = i;
+      b[i] = (i + 1);
+    }
+  foo (a, b, c);
+  for (i = 0; i < 64; i++)
+    if (c[i] != (unsigned char) (i * (i + 1)))
+      abort ();
+}