
[1/2,x86] Add palignr support for AVX2.

Message ID 20141001125618.GT1986@tucnak.redhat.com
State New

Commit Message

Jakub Jelinek Oct. 1, 2014, 12:56 p.m. UTC
On Wed, Oct 01, 2014 at 02:25:01PM +0200, Uros Bizjak wrote:
> OK.

And now the expand_vec_perm_palignr improvement, tested
with GCC_TEST_RUN_EXPENSIVE=1 make check-gcc \
RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'
E.g.
typedef unsigned long long V __attribute__ ((vector_size (32)));
extern void abort (void);
V a, b, c, d;
void test_14 (void)
{
  V mask = { 6, 1, 3, 4 };
  int i;
  c = __builtin_shuffle (a, mask);
  d = __builtin_shuffle (a, b, mask);
}
(distilled from test 15 in vshuf-v4di.c) results in:
-	vmovdqa	a(%rip), %ymm0
-	vpermq	$54, %ymm0, %ymm1
-	vpshufb	.LC1(%rip), %ymm0, %ymm0
-	vmovdqa	%ymm1, c(%rip)
-	vmovdqa	b(%rip), %ymm1
-	vpshufb	.LC0(%rip), %ymm1, %ymm1
-	vpermq	$78, %ymm1, %ymm1
-	vpor	%ymm1, %ymm0, %ymm0
+	vmovdqa	a(%rip), %ymm1
+	vpermq	$54, %ymm1, %ymm0
+	vmovdqa	%ymm0, c(%rip)
+	vmovdqa	b(%rip), %ymm0
+	vpalignr	$8, %ymm1, %ymm0, %ymm0
+	vpermq	$99, %ymm0, %ymm0
 	vmovdqa	%ymm0, d(%rip)
 	vzeroupper
 	ret
change, i.e. two fewer instructions (and two fewer .rodata constants).
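To spell out why the shorter sequence is correct (my reading of the
AVX2 semantics, not part of the patch itself): vpalignr works within
each 128-bit lane, so the 8-byte shift picks the high quadword of each
lane of a and the low quadword of each lane of b, and the final vpermq
reorders the resulting four quadwords.  A hypothetical scalar model:

typedef unsigned long long u64;

/* Indices 0-3 name elements of a, 4-7 elements of b.  */
void
model (const u64 *a, const u64 *b, u64 *d)
{
  /* vpalignr $8: per 128-bit lane, concat (b_lane : a_lane) >> 8 bytes,
     i.e. elements { 1, 4, 3, 6 }.  */
  u64 t[4] = { a[1], b[0], a[3], b[2] };
  /* vpermq $99 == 0b01100011: d[i] = t[(99 >> (2 * i)) & 3],
     i.e. { 6, 1, 3, 4 }, exactly the requested mask.  */
  d[0] = t[3], d[1] = t[0], d[2] = t[2], d[3] = t[1];
}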

Ok for trunk?

2014-10-01  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/i386.c (expand_vec_perm_palignr): Handle
	256-bit vectors for TARGET_AVX2.



	Jakub

Comments

Uros Bizjak Oct. 1, 2014, 1:09 p.m. UTC | #1
On Wed, Oct 1, 2014 at 2:56 PM, Jakub Jelinek <jakub@redhat.com> wrote:

> And now the expand_vec_perm_palignr improvement, tested
> with GCC_TEST_RUN_EXPENSIVE=1 make check-gcc \
> RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'
> E.g.
> typedef unsigned long long V __attribute__ ((vector_size (32)));
> extern void abort (void);
> V a, b, c, d;
> void test_14 (void)
> {
>   V mask = { 6, 1, 3, 4 };
>   int i;
>   c = __builtin_shuffle (a, mask);
>   d = __builtin_shuffle (a, b, mask);
> }
> (distilled from test 15 in vshuf-v4di.c) results in:
> -       vmovdqa a(%rip), %ymm0
> -       vpermq  $54, %ymm0, %ymm1
> -       vpshufb .LC1(%rip), %ymm0, %ymm0
> -       vmovdqa %ymm1, c(%rip)
> -       vmovdqa b(%rip), %ymm1
> -       vpshufb .LC0(%rip), %ymm1, %ymm1
> -       vpermq  $78, %ymm1, %ymm1
> -       vpor    %ymm1, %ymm0, %ymm0
> +       vmovdqa a(%rip), %ymm1
> +       vpermq  $54, %ymm1, %ymm0
> +       vmovdqa %ymm0, c(%rip)
> +       vmovdqa b(%rip), %ymm0
> +       vpalignr        $8, %ymm1, %ymm0, %ymm0
> +       vpermq  $99, %ymm0, %ymm0
>         vmovdqa %ymm0, d(%rip)
>         vzeroupper
>         ret
> change, i.e. two fewer instructions (and two fewer .rodata constants).
>
> Ok for trunk?
>
> 2014-10-01  Jakub Jelinek  <jakub@redhat.com>
>
>         * config/i386/i386.c (expand_vec_perm_palignr): Handle
>         256-bit vectors for TARGET_AVX2.

Please mention PR 62128 and include the testcase from the PR. Also,
please add a version of gcc.target/i386/pr52252-atom.c, compiled with
-mavx2 (perhaps named pr52252-avx2.c).
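Something with the shape below should do (a hypothetical skeleton; the
function body should be copied from pr52252-atom.c, not reproduced
here, and the exact options are a guess):

/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -mavx2" } */

/* ... function body of gcc.target/i386/pr52252-atom.c ... */

/* { dg-final { scan-assembler "palignr" } } */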

OK with a small adjustment below.

Thanks,
Uros.

> --- gcc/config/i386/i386.c.jj   2014-10-01 14:24:16.483138899 +0200
> +++ gcc/config/i386/i386.c      2014-10-01 14:27:53.577222011 +0200
> @@ -43297,44 +43297,75 @@ expand_vec_perm_palignr (struct expand_v
>    rtx shift, target;
>    struct expand_vec_perm_d dcopy;
>
> -  /* Even with AVX, palignr only operates on 128-bit vectors.  */
> -  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
> +  /* Even with AVX, palignr only operates on 128-bit vectors;
> +     in AVX2, palignr operates within each 128-bit lane.  */
> +  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
> +      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))

Please simplify the above condition ...
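For instance (untested, and relying on TARGET_AVX2 implying
TARGET_SSSE3 via the ISA flag sets), something along these lines:

  if (!TARGET_SSSE3
      || (GET_MODE_SIZE (d->vmode) != 16
	  && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32)))
    return false;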

>      return false;
>
> -  min = nelt, max = 0;
> +  min = 2 * nelt, max = 0;
>    for (i = 0; i < nelt; ++i)
>      {
>        unsigned e = d->perm[i];
> +      if (GET_MODE_SIZE (d->vmode) == 32)
> +       e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
>        if (e < min)
>         min = e;
>        if (e > max)
>         max = e;
>      }
> -  if (min == 0 || max - min >= nelt)
> +  if (min == 0
> +      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
>      return false;
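(My reading of the index folding above, not part of the patch: for
256-bit vectors vpalignr shifts both 128-bit lanes by the same amount,
so each index is first folded to "position within a lane, plus which
operand it selects".  A hypothetical check for the V4DImode example,
where nelt == 4:

  unsigned
  fold (unsigned e)
  {
    /* e & (nelt / 2 - 1) is the position within a 128-bit lane,
       (e & nelt) >> 1 records which of the two inputs e selects.  */
    return (e & 1) | ((e & 4) >> 1);
  }

the mask { 6, 1, 3, 4 } folds to { 2, 1, 1, 2 }, so min == 1,
max - min == 1 < nelt / 2, and min == 1 element, i.e. 8 bytes, matches
the "vpalignr $8" in the assembly above.)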
>
>    /* Given that we have SSSE3, we know we'll be able to implement the
> -     single operand permutation after the palignr with pshufb.  */
> -  if (d->testing_p)
> +     single operand permutation after the palignr with pshufb for
> +     128-bit vectors.  */
> +  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16)
>      return true;
>
>    dcopy = *d;
> -  shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
> -  target = gen_reg_rtx (TImode);
> -  emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
> -                                 gen_lowpart (TImode, d->op0), shift));
> -
> -  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
> -  dcopy.one_operand_p = true;
>
>    in_order = true;
>    for (i = 0; i < nelt; ++i)
>      {
> -      unsigned e = dcopy.perm[i] - min;
> +      unsigned e = dcopy.perm[i];
> +      if (GET_MODE_SIZE (d->vmode) == 32
> +         && e >= nelt
> +         && (e & (nelt / 2 - 1)) < min)
> +       e = e - min - (nelt / 2);
> +      else
> +       e = e - min;
>        if (e != i)
>         in_order = false;
>        dcopy.perm[i] = e;
>      }
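(Continuing the worked example, not part of the patch: after the 8-byte
vpalignr the intermediate vector holds elements { 1, 4, 3, 6 }, and
this loop rewrites the mask { 6, 1, 3, 4 } into the one-operand
permutation { 3, 0, 2, 1 } of that intermediate; elements 6 and 4 take
the e - min - nelt / 2 branch because their in-lane offset 0 is below
min == 1.  As a vpermq immediate that is 3 + (0 << 2) + (2 << 4)
+ (1 << 6) == 99, matching the "vpermq $99" above.)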
> +  dcopy.one_operand_p = true;
> +
> +  /* For AVX2, test whether we can permute the result in one instruction.  */
> +  if (d->testing_p)
> +    {
> +      if (in_order)
> +       return true;
> +      dcopy.op1 = dcopy.op0;
> +      return expand_vec_perm_1 (&dcopy);
> +    }
> +
> +  shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
> +  if (GET_MODE_SIZE (d->vmode) == 16)
> +    {
> +      target = gen_reg_rtx (TImode);
> +      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
> +                                     gen_lowpart (TImode, d->op0), shift));
> +    }
> +  else
> +    {
> +      target = gen_reg_rtx (V2TImode);
> +      emit_insn (gen_avx2_palignrv2ti (target, gen_lowpart (V2TImode, d->op1),
> +                                      gen_lowpart (V2TImode, d->op0), shift));
> +    }
> +
> +  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
>
>    /* Test for the degenerate case where the alignment by itself
>       produces the desired permutation.  */
> @@ -43345,7 +43376,7 @@ expand_vec_perm_palignr (struct expand_v
>      }
>
>    ok = expand_vec_perm_1 (&dcopy);
> -  gcc_assert (ok);
> +  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
>
>    return ok;
>  }
>
>
>         Jakub
Uros Bizjak Oct. 1, 2014, 7:03 p.m. UTC | #2
On Wed, Oct 1, 2014 at 2:56 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Wed, Oct 01, 2014 at 02:25:01PM +0200, Uros Bizjak wrote:
>> OK.
>
> And now the expand_vec_perm_palignr improvement, tested
> with GCC_TEST_RUN_EXPENSIVE=1 make check-gcc \
> RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'
> [... testcase and assembly comparison quoted in full above ...]

On a related note, I would like to point out that
gcc.target/i386/pr61403.c also fails to generate a blend insn with
-mavx2. The new insn sequence includes lots of new vpshufb insns with
memory operands.

Uros.

Patch

--- gcc/config/i386/i386.c.jj	2014-10-01 14:24:16.483138899 +0200
+++ gcc/config/i386/i386.c	2014-10-01 14:27:53.577222011 +0200
@@ -43297,44 +43297,75 @@  expand_vec_perm_palignr (struct expand_v
   rtx shift, target;
   struct expand_vec_perm_d dcopy;
 
-  /* Even with AVX, palignr only operates on 128-bit vectors.  */
-  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+  /* Even with AVX, palignr only operates on 128-bit vectors;
+     in AVX2, palignr operates within each 128-bit lane.  */
+  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
     return false;
 
-  min = nelt, max = 0;
+  min = 2 * nelt, max = 0;
   for (i = 0; i < nelt; ++i)
     {
       unsigned e = d->perm[i];
+      if (GET_MODE_SIZE (d->vmode) == 32)
+	e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
       if (e < min)
 	min = e;
       if (e > max)
 	max = e;
     }
-  if (min == 0 || max - min >= nelt)
+  if (min == 0
+      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
     return false;
 
   /* Given that we have SSSE3, we know we'll be able to implement the
-     single operand permutation after the palignr with pshufb.  */
-  if (d->testing_p)
+     single operand permutation after the palignr with pshufb for
+     128-bit vectors.  */
+  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16)
     return true;
 
   dcopy = *d;
-  shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
-  target = gen_reg_rtx (TImode);
-  emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
-				  gen_lowpart (TImode, d->op0), shift));
-
-  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
-  dcopy.one_operand_p = true;
 
   in_order = true;
   for (i = 0; i < nelt; ++i)
     {
-      unsigned e = dcopy.perm[i] - min;
+      unsigned e = dcopy.perm[i];
+      if (GET_MODE_SIZE (d->vmode) == 32
+	  && e >= nelt
+	  && (e & (nelt / 2 - 1)) < min)
+	e = e - min - (nelt / 2);
+      else
+	e = e - min;
       if (e != i)
 	in_order = false;
       dcopy.perm[i] = e;
     }
+  dcopy.one_operand_p = true;
+
+  /* For AVX2, test whether we can permute the result in one instruction.  */
+  if (d->testing_p)
+    {
+      if (in_order)
+	return true;
+      dcopy.op1 = dcopy.op0;
+      return expand_vec_perm_1 (&dcopy);
+    }
+
+  shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      target = gen_reg_rtx (TImode);
+      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
+				      gen_lowpart (TImode, d->op0), shift));
+    }
+  else
+    {
+      target = gen_reg_rtx (V2TImode);
+      emit_insn (gen_avx2_palignrv2ti (target, gen_lowpart (V2TImode, d->op1),
+				       gen_lowpart (V2TImode, d->op0), shift));
+    }
+
+  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
 
   /* Test for the degenerate case where the alignment by itself
      produces the desired permutation.  */
@@ -43345,7 +43376,7 @@  expand_vec_perm_palignr (struct expand_v
     }
 
   ok = expand_vec_perm_1 (&dcopy);
-  gcc_assert (ok);
+  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
 
   return ok;
 }