diff mbox

[1/2,x86] Add palignr support for AVX2.

Message ID 20141001121715.GR1986@tucnak.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek Oct. 1, 2014, 12:17 p.m. UTC
On Wed, Oct 01, 2014 at 01:45:54PM +0200, Uros Bizjak wrote:
> OK.

Thanks.  Second step is a tiny optimization, for the
simplified 122 (now 24) vshuf-v4di.c testcase:
/* 32-byte vector type holding four unsigned long long elements.  */
typedef unsigned long long V __attribute__ ((vector_size (32)));
/* a and b are the shuffle inputs, filled in at run time; c and d receive
   the two shuffle results that are compared at the end.  */
V a, b, c, d;

/* Reduced vshuf-v4di.c test: perform the same two-operand shuffle once on
   the global vectors and once on literal vectors with the same contents,
   and abort if the results differ.  Returns 0 on success.  */
int
main ()
{
  int i;
  /* Fill a with { 2, 3, 4, 5 } and b with { 6, 7, 8, 9 }.  */
  for (i = 0; i < 4; ++i)
    {
      a[i] = i + 2;
      b[i] = 4 + i + 2;
    }
  /* Empty asm with a "memory" clobber, i.e. a compiler-level memory
     barrier.  Presumably here so the contents of a and b are not treated
     as known constants when the first shuffle is compiled.  */
  asm volatile ("" : : : "memory");
  /* Shuffle of the global vectors a and b with mask { 2, 5, 6, 3 }.  */
  c = __builtin_shuffle (a, b, (V) { 2, 5, 6, 3 });
  /* Same mask applied to literal vectors equal to the values stored into
     a and b above; serves as the reference result.  */
  d = __builtin_shuffle ((V) { 2, 3, 4, 5 }, (V) { 6, 7, 8, 9 }, (V) { 2, 5, 6, 3 });
  /* The two results must be byte-for-byte identical.  */
  if (__builtin_memcmp (&c, &d, sizeof (c)))
    __builtin_abort ();
  return 0;
}

this patch allows better code to be generated:
-       vmovdqa b(%rip), %ymm0
+       vpermq  $238, a(%rip), %ymm1
        movl    $32, %edx
-       movl    $d, %esi
-       vmovdqa a(%rip), %ymm1
+       vmovdqa b(%rip), %ymm0
+       movl    $d, %esi
        movl    $c, %edi
-       vperm2i128      $17, %ymm0, %ymm1, %ymm1
        vpblendd        $195, %ymm1, %ymm0, %ymm0
        vmovdqa %ymm0, c(%rip)

That is because vperm2i128 $17 unnecessarily takes two input
operands even though all the data it selects comes from only one of them.
So, by canonicalizing the permutation first, we can emit the
single-operand vpermq $238 instead.  Perhaps more places might benefit
from extra canonicalize_perm calls (two spots already use it, in addition
to the single call at the expansion/testing entry point).

Tested again with
GCC_TEST_RUN_EXPENSIVE=1 make check-gcc \
RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'
on x86_64-linux.  Ok for trunk?

2014-10-01  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/i386.c (expand_vec_perm_vperm2f128): Canonicalize
	dfirst permutation.



	Jakub

Comments

Uros Bizjak Oct. 1, 2014, 12:25 p.m. UTC | #1
On Wed, Oct 1, 2014 at 2:17 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Wed, Oct 01, 2014 at 01:45:54PM +0200, Uros Bizjak wrote:
>> OK.
>
> Thanks.  Second step is a tiny optimization, for the
> simplified 122 (now 24) vshuf-v4di.c testcase:
> typedef unsigned long long V __attribute__ ((vector_size (32)));
> V a, b, c, d;
>
> int
> main ()
> {
>   int i;
>   for (i = 0; i < 4; ++i)
>     {
>       a[i] = i + 2;
>       b[i] = 4 + i + 2;
>     }
>   asm volatile ("" : : : "memory");
>   c = __builtin_shuffle (a, b, (V) { 2, 5, 6, 3 });
>   d = __builtin_shuffle ((V) { 2, 3, 4, 5 }, (V) { 6, 7, 8, 9 }, (V) { 2, 5, 6, 3 });
>   if (__builtin_memcmp (&c, &d, sizeof (c)))
>     __builtin_abort ();
>   return 0;
> }
>
> this patch allows better code to be generated:
> -       vmovdqa b(%rip), %ymm0
> +       vpermq  $238, a(%rip), %ymm1
>         movl    $32, %edx
> -       movl    $d, %esi
> -       vmovdqa a(%rip), %ymm1
> +       vmovdqa b(%rip), %ymm0
> +       movl    $d, %esi
>         movl    $c, %edi
> -       vperm2i128      $17, %ymm0, %ymm1, %ymm1
>         vpblendd        $195, %ymm1, %ymm0, %ymm0
>         vmovdqa %ymm0, c(%rip)
>
> That is because vperm2i128 $17 unnecessarily uses
> two operands when all the data it grabs are from a single one.
> So, by canonicalizing the permutation we can emit
> vpermq $238 instead.  Perhaps more places might benefit from
> extra canonicalize_perm calls (two spots already use that beyond
> the single one on the expansion/testing entry point).
>
> Tested again with
> GCC_TEST_RUN_EXPENSIVE=1 make check-gcc \
> RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'
> on x86_64-linux.  Ok for trunk?
>
> 2014-10-01  Jakub Jelinek  <jakub@redhat.com>
>
>         * config/i386/i386.c (expand_vec_perm_vperm2f128): Canonicalize
>         dfirst permutation.

OK.

Thanks,
Uros.
diff mbox

Patch

--- gcc/config/i386/i386.c.jj	2014-10-01 13:00:30.000000000 +0200
+++ gcc/config/i386/i386.c	2014-10-01 13:59:40.061956852 +0200
@@ -43905,15 +43905,16 @@  expand_vec_perm_vperm2f128 (struct expan
 	    dfirst.perm[i] = (i & (nelt2 - 1))
 			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
 
+	  canonicalize_perm (&dfirst);
 	  ok = expand_vec_perm_1 (&dfirst);
 	  gcc_assert (ok);
 
 	  /* And dsecond is some single insn shuffle, taking
 	     d->op0 and result of vperm2f128 (if perm < 16) or
 	     d->op1 and result of vperm2f128 (otherwise).  */
-	  dsecond.op1 = dfirst.target;
 	  if (perm >= 16)
-	    dsecond.op0 = dfirst.op1;
+	    dsecond.op0 = dsecond.op1;
+	  dsecond.op1 = dfirst.target;
 
 	  ok = expand_vec_perm_1 (&dsecond);
 	  gcc_assert (ok);