diff mbox

[x86,PR60451] Expand even/odd permutation using pack insn.

Message ID CAOvf_xxWXEfgQ2ndJ9Ax+PeURhL2jkCWzu_oNgzHNRxCAwzkbA@mail.gmail.com
State New
Headers show

Commit Message

Evgeny Stupachenko Nov. 20, 2014, 3:03 p.m. UTC
Good point! "gen_shift" also requires only SSE2.
That way we can optimize out the interleave sequence for V16QI mode in
expand_vec_perm_even_odd_1.
Thanks!

Evgeny

Updated patch:


On Thu, Nov 20, 2014 at 5:30 PM, Richard Henderson <rth@redhat.com> wrote:
> On 11/20/2014 12:36 PM, Evgeny Stupachenko wrote:
>> +  /* Required for "pack".  */
>> +  if (!TARGET_SSE4_2 || d->one_operand_p)
>> +    return false;
>
> Why the SSE4_2 check here when...
>
>> +
>> +  /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than general
>> +     shuffles.  */
>> +  if (d->vmode == V8HImode)
>> +    {
>> +      c = 0xffff;
>> +      s = 16;
>> +      half_mode = V4SImode;
>> +      gen_and = gen_andv4si3;
>> +      gen_pack = gen_sse4_1_packusdw;
>
> ... it's SSE4_1 here,
>
>> +      gen_shift = gen_lshrv4si3;
>> +    }
>> +  else if (d->vmode == V16QImode)
>> +    {
>> +      c = 0xff;
>> +      s = 8;
>> +      half_mode = V8HImode;
>> +      gen_and = gen_andv8hi3;
>> +      gen_pack = gen_sse2_packuswb;
>
> ... and SSE2 here?
>
>
>
> r~

Comments

Evgeny Stupachenko Nov. 20, 2014, 4:25 p.m. UTC | #1
Bootstrap / make check passed with updated patch.

Is it still ok?

It looks like we no longer need "expand_vec_perm_vpshufb2_vpermq_even_odd"
with this patch.
However, that cleanup will be done in a separate patch after appropriate testing.

Modified ChangeLog:

2014-11-20  Evgeny Stupachenko  <evstupac@gmail.com>

gcc/testsuite
        PR target/60451
        * gcc.target/i386/pr60451.c: New.

gcc/
        PR target/60451
        * config/i386/i386.c (expand_vec_perm_even_odd_pack): New.
        (expand_vec_perm_even_odd_1): Add new expand for V8HI mode,
        replace for V16QI, V16HI and V32QI modes.
        (ix86_expand_vec_perm_const_1): Add new expand.

On Thu, Nov 20, 2014 at 6:03 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> Good point! "gen_shift" also requires only SSE2.
> That way we can optimize out interleave sequence for V16QI mode in
> expand_vec_perm_even_odd_1.
> Thanks!
>
> Evgeny
>
> Updated patch:
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 085eb54..054089b 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -48322,6 +48322,127 @@ expand_vec_perm_vpshufb2_vpermq_even_odd
> (struct expand_vec_perm_d *d)
>    return true;
>  }
>
> +/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
> +   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
> +   with two "and" and "pack" or two "shift" and "pack" insns.  We should
> +   have already failed all two instruction sequences.  */
> +
> +static bool
> +expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
> +{
> +  rtx op, dop0, dop1, t, rperm[16];
> +  unsigned i, odd, c, s, nelt = d->nelt;
> +  bool end_perm = false;
> +  machine_mode half_mode;
> +  rtx (*gen_and) (rtx, rtx, rtx);
> +  rtx (*gen_pack) (rtx, rtx, rtx);
> +  rtx (*gen_shift) (rtx, rtx, rtx);
> +
> +  if (d->one_operand_p)
> +    return false;
> +
> +  switch (d->vmode)
> +    {
> +    case V8HImode:
> +      /* Required for "pack".  */
> +      if (!TARGET_SSE4_1)
> +        return false;
> +      c = 0xffff;
> +      s = 16;
> +      half_mode = V4SImode;
> +      gen_and = gen_andv4si3;
> +      gen_pack = gen_sse4_1_packusdw;
> +      gen_shift = gen_lshrv4si3;
> +      break;
> +    case V16QImode:
> +      /* No check as all instructions are SSE2.  */
> +      c = 0xff;
> +      s = 8;
> +      half_mode = V8HImode;
> +      gen_and = gen_andv8hi3;
> +      gen_pack = gen_sse2_packuswb;
> +      gen_shift = gen_lshrv8hi3;
> +      break;
> +    case V16HImode:
> +      if (!TARGET_AVX2)
> +        return false;
> +      c = 0xffff;
> +      s = 16;
> +      half_mode = V8SImode;
> +      gen_and = gen_andv8si3;
> +      gen_pack = gen_avx2_packusdw;
> +      gen_shift = gen_lshrv8si3;
> +      end_perm = true;
> +      break;
> +    case V32QImode:
> +      if (!TARGET_AVX2)
> +        return false;
> +      c = 0xff;
> +      s = 8;
> +      half_mode = V16HImode;
> +      gen_and = gen_andv16hi3;
> +      gen_pack = gen_avx2_packuswb;
> +      gen_shift = gen_lshrv16hi3;
> +      end_perm = true;
> +      break;
> +    default:
> +      /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
> +        general shuffles.  */
> +      return false;
> +    }
> +
> +  /* Check that permutation is even or odd.  */
> +  odd = d->perm[0];
> +  if (odd > 1)
> +    return false;
> +
> +  for (i = 1; i < nelt; ++i)
> +    if (d->perm[i] != 2 * i + odd)
> +      return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  dop0 = gen_reg_rtx (half_mode);
> +  dop1 = gen_reg_rtx (half_mode);
> +  if (odd == 0)
> +    {
> +      for (i = 0; i < nelt / 2; i++)
> +       rperm[i] = GEN_INT (c);
> +      t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
> +      t = force_reg (half_mode, t);
> +      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
> +      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
> +    }
> +  else
> +    {
> +      emit_insn (gen_shift (dop0,
> +                           gen_lowpart (half_mode, d->op0),
> +                           GEN_INT (s)));
> +      emit_insn (gen_shift (dop1,
> +                           gen_lowpart (half_mode, d->op1),
> +                           GEN_INT (s)));
> +    }
> +  /* In AVX2 for 256 bit case we need to permute pack result.  */
> +  if (TARGET_AVX2 && end_perm)
> +    {
> +      op = gen_reg_rtx (d->vmode);
> +      t = gen_reg_rtx (V4DImode);
> +      emit_insn (gen_pack (op, dop0, dop1));
> +      emit_insn (gen_avx2_permv4di_1 (t,
> +                                     gen_lowpart (V4DImode, op),
> +                                     const0_rtx,
> +                                     const2_rtx,
> +                                     const1_rtx,
> +                                     GEN_INT (3)));
> +      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
> +    }
> +  else
> +    emit_insn (gen_pack (d->target, dop0, dop1));
> +
> +  return true;
> +}
> +
>  /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
>     and extract-odd permutations.  */
>
> @@ -48393,7 +48514,9 @@ expand_vec_perm_even_odd_1 (struct
> expand_vec_perm_d *d, unsigned odd)
>        gcc_unreachable ();
>
>      case V8HImode:
> -      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
> +      if (TARGET_SSE4_1)
> +       return expand_vec_perm_even_odd_pack (d);
> +      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
>         return expand_vec_perm_pshufb2 (d);
>        else
>         {
> @@ -48416,32 +48539,11 @@ expand_vec_perm_even_odd_1 (struct
> expand_vec_perm_d *d, unsigned odd)
>        break;
>
>      case V16QImode:
> -      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
> -       return expand_vec_perm_pshufb2 (d);
> -      else
> -       {
> -         if (d->testing_p)
> -           break;
> -         t1 = gen_reg_rtx (V16QImode);
> -         t2 = gen_reg_rtx (V16QImode);
> -         t3 = gen_reg_rtx (V16QImode);
> -         emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
> -         emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
> -         emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
> -         emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
> -         emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
> -         emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
> -         if (odd)
> -           t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
> -         else
> -           t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
> -         emit_insn (t3);
> -       }
> -      break;
> +      return expand_vec_perm_even_odd_pack (d);
>
>      case V16HImode:
>      case V32QImode:
> -      return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
> +      return expand_vec_perm_even_odd_pack (d);
>
>      case V4DImode:
>        if (!TARGET_AVX2)
> @@ -48814,6 +48916,9 @@ ix86_expand_vec_perm_const_1 (struct
> expand_vec_perm_d *d)
>
>    /* Try sequences of three instructions.  */
>
> +  if (expand_vec_perm_even_odd_pack (d))
> +    return true;
> +
>    if (expand_vec_perm_2vperm2f128_vshuf (d))
>      return true;
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr60451.c
> b/gcc/testsuite/gcc.target/i386/pr60451.c
> new file mode 100644
> index 0000000..c600f4a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr60451.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target sse2 } */
> +/* { dg-options "-O2 -ftree-vectorize -msse2" } */
> +
> +void
> +foo (unsigned char *a, unsigned char *b, unsigned char *c, int size)
> +{
> +  int i;
> +
> +  for (i = 0; i < size; i++)
> +    a[i] = (unsigned char) ((unsigned int)1 + b[i] * c[i] * 117);
> +}
> +
> +/* { dg-final { scan-assembler "packuswb|vpunpck" } } */
>
> On Thu, Nov 20, 2014 at 5:30 PM, Richard Henderson <rth@redhat.com> wrote:
>> On 11/20/2014 12:36 PM, Evgeny Stupachenko wrote:
>>> +  /* Required for "pack".  */
>>> +  if (!TARGET_SSE4_2 || d->one_operand_p)
>>> +    return false;
>>
>> Why the SSE4_2 check here when...
>>
>>> +
>>> +  /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than general
>>> +     shuffles.  */
>>> +  if (d->vmode == V8HImode)
>>> +    {
>>> +      c = 0xffff;
>>> +      s = 16;
>>> +      half_mode = V4SImode;
>>> +      gen_and = gen_andv4si3;
>>> +      gen_pack = gen_sse4_1_packusdw;
>>
>> ... it's SSE4_1 here,
>>
>>> +      gen_shift = gen_lshrv4si3;
>>> +    }
>>> +  else if (d->vmode == V16QImode)
>>> +    {
>>> +      c = 0xff;
>>> +      s = 8;
>>> +      half_mode = V8HImode;
>>> +      gen_and = gen_andv8hi3;
>>> +      gen_pack = gen_sse2_packuswb;
>>
>> ... and SSE2 here?
>>
>>
>>
>> r~
Uros Bizjak Nov. 21, 2014, 8:01 a.m. UTC | #2
On Thu, Nov 20, 2014 at 5:25 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> Bootstrap / make check passed with updated patch.
>
> Is it still ok?
>
> It looks like we don't need "expand_vec_perm_vpshufb2_vpermq_even_odd"
> any more with the patch.
> However the clean up will be in the separate patch after appropriate testing.
>
> Modified ChangeLog:
>
> 2014-11-20  Evgeny Stupachenko  <evstupac@gmail.com>
>
> gcc/testsuite
>         PR target/60451
>         * gcc.target/i386/pr60451.c: New.
>
> gcc/
>         PR target/60451
>         * config/i386/i386.c (expand_vec_perm_even_odd_pack): New.
>         (expand_vec_perm_even_odd_1): Add new expand for V8HI mode,
>         replace for V16QI, V16HI and V32QI modes.
>         (ix86_expand_vec_perm_const_1): Add new expand.

OK.

Thanks,
Uros.
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 085eb54..054089b 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -48322,6 +48322,127 @@  expand_vec_perm_vpshufb2_vpermq_even_odd
(struct expand_vec_perm_d *d)
   return true;
 }

+/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
+   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
+   with two "and" and "pack" or two "shift" and "pack" insns.  We should
+   have already failed all two instruction sequences.  */
+
+static bool
+expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
+{
+  rtx op, dop0, dop1, t, rperm[16];
+  unsigned i, odd, c, s, nelt = d->nelt;
+  bool end_perm = false;
+  machine_mode half_mode;
+  rtx (*gen_and) (rtx, rtx, rtx);
+  rtx (*gen_pack) (rtx, rtx, rtx);
+  rtx (*gen_shift) (rtx, rtx, rtx);
+
+  if (d->one_operand_p)
+    return false;
+
+  switch (d->vmode)
+    {
+    case V8HImode:
+      /* Required for "pack".  */
+      if (!TARGET_SSE4_1)
+        return false;
+      c = 0xffff;
+      s = 16;
+      half_mode = V4SImode;
+      gen_and = gen_andv4si3;
+      gen_pack = gen_sse4_1_packusdw;
+      gen_shift = gen_lshrv4si3;
+      break;
+    case V16QImode:
+      /* No check as all instructions are SSE2.  */
+      c = 0xff;
+      s = 8;
+      half_mode = V8HImode;
+      gen_and = gen_andv8hi3;
+      gen_pack = gen_sse2_packuswb;
+      gen_shift = gen_lshrv8hi3;
+      break;
+    case V16HImode:
+      if (!TARGET_AVX2)
+        return false;
+      c = 0xffff;
+      s = 16;
+      half_mode = V8SImode;
+      gen_and = gen_andv8si3;
+      gen_pack = gen_avx2_packusdw;
+      gen_shift = gen_lshrv8si3;
+      end_perm = true;
+      break;
+    case V32QImode:
+      if (!TARGET_AVX2)
+        return false;
+      c = 0xff;
+      s = 8;
+      half_mode = V16HImode;
+      gen_and = gen_andv16hi3;
+      gen_pack = gen_avx2_packuswb;
+      gen_shift = gen_lshrv16hi3;
+      end_perm = true;
+      break;
+    default:
+      /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
+        general shuffles.  */
+      return false;
+    }
+
+  /* Check that permutation is even or odd.  */
+  odd = d->perm[0];
+  if (odd > 1)
+    return false;
+
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != 2 * i + odd)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  dop0 = gen_reg_rtx (half_mode);
+  dop1 = gen_reg_rtx (half_mode);
+  if (odd == 0)
+    {
+      for (i = 0; i < nelt / 2; i++)
+       rperm[i] = GEN_INT (c);
+      t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
+      t = force_reg (half_mode, t);
+      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+    }
+  else
+    {
+      emit_insn (gen_shift (dop0,
+                           gen_lowpart (half_mode, d->op0),
+                           GEN_INT (s)));
+      emit_insn (gen_shift (dop1,
+                           gen_lowpart (half_mode, d->op1),
+                           GEN_INT (s)));
+    }
+  /* In AVX2 for 256 bit case we need to permute pack result.  */
+  if (TARGET_AVX2 && end_perm)
+    {
+      op = gen_reg_rtx (d->vmode);
+      t = gen_reg_rtx (V4DImode);
+      emit_insn (gen_pack (op, dop0, dop1));
+      emit_insn (gen_avx2_permv4di_1 (t,
+                                     gen_lowpart (V4DImode, op),
+                                     const0_rtx,
+                                     const2_rtx,
+                                     const1_rtx,
+                                     GEN_INT (3)));
+      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
+    }
+  else
+    emit_insn (gen_pack (d->target, dop0, dop1));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
    and extract-odd permutations.  */

@@ -48393,7 +48514,9 @@  expand_vec_perm_even_odd_1 (struct
expand_vec_perm_d *d, unsigned odd)
       gcc_unreachable ();

     case V8HImode:
-      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
+      if (TARGET_SSE4_1)
+       return expand_vec_perm_even_odd_pack (d);
+      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
        return expand_vec_perm_pshufb2 (d);
       else
        {
@@ -48416,32 +48539,11 @@  expand_vec_perm_even_odd_1 (struct
expand_vec_perm_d *d, unsigned odd)
       break;

     case V16QImode:
-      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
-       return expand_vec_perm_pshufb2 (d);
-      else
-       {
-         if (d->testing_p)
-           break;
-         t1 = gen_reg_rtx (V16QImode);
-         t2 = gen_reg_rtx (V16QImode);
-         t3 = gen_reg_rtx (V16QImode);
-         emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
-         emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
-         emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
-         emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
-         emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
-         emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
-         if (odd)
-           t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
-         else
-           t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
-         emit_insn (t3);
-       }
-      break;
+      return expand_vec_perm_even_odd_pack (d);

     case V16HImode:
     case V32QImode:
-      return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
+      return expand_vec_perm_even_odd_pack (d);

     case V4DImode:
       if (!TARGET_AVX2)
@@ -48814,6 +48916,9 @@  ix86_expand_vec_perm_const_1 (struct
expand_vec_perm_d *d)

   /* Try sequences of three instructions.  */

+  if (expand_vec_perm_even_odd_pack (d))
+    return true;
+
   if (expand_vec_perm_2vperm2f128_vshuf (d))
     return true;

diff --git a/gcc/testsuite/gcc.target/i386/pr60451.c
b/gcc/testsuite/gcc.target/i386/pr60451.c
new file mode 100644
index 0000000..c600f4a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr60451.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-options "-O2 -ftree-vectorize -msse2" } */
+
+void
+foo (unsigned char *a, unsigned char *b, unsigned char *c, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    a[i] = (unsigned char) ((unsigned int)1 + b[i] * c[i] * 117);
+}
+
+/* { dg-final { scan-assembler "packuswb|vpunpck" } } */