diff mbox

[x86] Improves x86 permutation expand

Message ID CAOvf_xwib1Zd_Qp-+Vx3FMeJGk=gm+qSo972WbDo7k=LAUytSg@mail.gmail.com
State New
Headers show

Commit Message

Evgeny Stupachenko June 9, 2014, 7:10 p.m. UTC
Nice catch.
Patch with corresponding changes:

   if (expand_vec_perm_2vperm2f128_vshuf (d))

On Mon, Jun 9, 2014 at 8:30 PM, Richard Henderson <rth@redhat.com> wrote:
> On 06/09/2014 03:13 AM, Evgeny Stupachenko wrote:
>> +  /* First we apply one operand permutation to the part where
>> +     elements stay not in their respective lanes.  */
>> +  dcopy = *d;
>> +  if (which == 2)
>> +    dcopy.op0 = dcopy.op1 = d->op1;
>> +  else
>> +    dcopy.op0 = dcopy.op1 = d->op0;
>> +  dcopy.one_operand_p = true;
>> +
>> +  for (i = 0; i < nelt; ++i)
>> +    {
>> +      unsigned e = d->perm[i];
>> +      if (which == 2)
>> +       dcopy.perm[i] = ((e >= nelt) ? (e - nelt) : e);
>
> This is wrong for which == 1.  For both cases this simplifies to
>
>   dcopy.perm[i] = e & (nelt - 1);
>
>> +
>> +  for (i = 0; i < nelt; ++i)
>> +    {
>> +      unsigned e = d->perm[i];
>> +      if (which == 2)
>> +       dcopy1.perm[i] = ((e >= nelt) ? (nelt + i) : e);
>> +      else
>> +       dcopy1.perm[i] = ((e < nelt) ? i : e);
>> +    }
>
> This is known to be a blend, so you know the value of E.
> Simplifies to
>
>   dcopy1.perm[i] = (e >= nelt ? nelt + i : i);
>
>
> r~

Comments

Richard Henderson June 9, 2014, 7:49 p.m. UTC | #1
On 06/09/2014 12:10 PM, Evgeny Stupachenko wrote:
> Nice catch.
> Patch with corresponding changes:

Looks ok with an appropriate changelog.


r~
H.J. Lu June 9, 2014, 8:19 p.m. UTC | #2
On Mon, Jun 9, 2014 at 12:49 PM, Richard Henderson <rth@redhat.com> wrote:
> On 06/09/2014 12:10 PM, Evgeny Stupachenko wrote:
>> Nice catch.
>> Patch with corresponding changes:
>
> Looks ok with an appropriate changelog.
>

It will be nice to include testcases to cover those changes.
Evgeny Stupachenko June 10, 2014, 10:19 a.m. UTC | #3
The stability of the changes is covered by gcc.dg/vect/pr52252-ld.c.
I'll add a test that scans for "pblend" with this patch:
https://gcc.gnu.org/ml/gcc-patches/2014-06/msg00795.html

On Tue, Jun 10, 2014 at 12:19 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, Jun 9, 2014 at 12:49 PM, Richard Henderson <rth@redhat.com> wrote:
>> On 06/09/2014 12:10 PM, Evgeny Stupachenko wrote:
>>> Nice catch.
>>> Patch with corresponding changes:
>>
>> Looks ok with an appropriate changelog.
>>
>
> It will be nice to include testcases to cover those changes.
>
> --
> H.J.
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 8827256..0b80354 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -43185,6 +43185,80 @@  expand_vec_perm_palignr (struct expand_vec_perm_d *d)
   return ok;
 }

+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
+   the permutation using the SSE4_1 pblendv instruction.  Potentially
+   reduces it from 2 pshufb and an or to 1 pshufb and a pblendv.  */
+
+static bool
+expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
+{
+  unsigned i, which, nelt = d->nelt;
+  struct expand_vec_perm_d dcopy, dcopy1;
+  enum machine_mode vmode = d->vmode;
+  bool ok;
+
+  /* Use the same checks as in expand_vec_perm_blend, but skip AVX2
+     as it requires more than 2 instructions for the general case.  */
+  if (d->one_operand_p)
+    return false;
+  if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
+    ;
+  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+    ;
+  else
+    return false;
+
+  /* Find the elements that are not in their own position (perm[i] != i)
+     and record their source in WHICH: 1 if index < nelt, 2 otherwise.  */
+  for (i = 0, which = 0; i < nelt; ++i)
+    {
+      unsigned e = d->perm[i];
+      if (e != i)
+       which |= (e < nelt ? 1 : 2);
+    }
+  /* We can pblend the part where elements are not in their
+     respective positions only when these elements all come from
+     one half of the permutation index range.
+     {0 1 8 3 4 5 9 7} is ok as 8, 9 are not in their respective
+     positions, but both 8 and 9 are >= 8.
+     {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not in their
+     respective positions and 8 >= 8, but 2 is not.  */
+  if (which != 1 && which != 2)
+    return false;
+  if (d->testing_p)
+    return true;
+
+  /* First apply a one-operand permutation to the operand that supplies
+     the displaced elements: op1 when WHICH is 2, op0 otherwise.  */
+  dcopy = *d;
+  if (which == 2)
+    dcopy.op0 = dcopy.op1 = d->op1;
+  else
+    dcopy.op0 = dcopy.op1 = d->op0;
+  dcopy.one_operand_p = true;
+
+  for (i = 0; i < nelt; ++i)
+    dcopy.perm[i] = d->perm[i] & (nelt - 1);
+
+  ok = expand_vec_perm_1 (&dcopy);
+  gcc_assert (ok);
+
+  /* Next blend the permuted elements into their final positions.  */
+  dcopy1 = *d;
+  if (which == 2)
+    dcopy1.op1 = dcopy.target;
+  else
+    dcopy1.op0 = dcopy.target;
+
+  for (i = 0; i < nelt; ++i)
+    dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
+
+  ok = expand_vec_perm_blend (&dcopy1);
+  gcc_assert (ok);
+
+  return true;
+}
+
 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);

 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
@@ -44557,6 +44631,9 @@  ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_vperm2f128 (d))
     return true;

+  if (expand_vec_perm_pblendv (d))
+    return true;
+
   /* Try sequences of three instructions.  */