diff mbox

Handle vector COND_EXPRs in vector genericization (PR tree-optimization/65427)

Message ID 20150316162102.GR1746@tucnak.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek March 16, 2015, 4:21 p.m. UTC
Hi!

On the following testcase, gimple LIM creates a vector COND_EXPR (scalar
condition, vector lhs, rhs2 and rhs3), but if we don't have corresponding
vector mode for it, we ICE trying to expand the BLKmode COND_EXPR, as it is
unprepared for that.

This patch lowers those (parallel or piecewise).
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2015-03-16  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/65427
	* tree-vect-generic.c (do_cond, expand_vector_scalar_condition): New
	functions.
	(expand_vector_operations_1): Handle BLKmode vector COND_EXPR.

	* gcc.c-torture/execute/pr65427.c: New test.


	Jakub

Comments

Richard Biener March 16, 2015, 6:15 p.m. UTC | #1
On March 16, 2015 5:21:02 PM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote:
>Hi!
>
>On the following testcase, gimple LIM creates a vector COND_EXPR
>(scalar
>condition, vector lhs, rhs2 and rhs3), but if we don't have
>corresponding
>vector mode for it, we ICE trying to expand the BLKmode COND_EXPR, as
>it is
>unprepared for that.
>
>This patch lowers those (parallel or piecewise).
>Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK.  Though maybe LIM should not create these for cost reasons? I also wonder if we should lower them to control flow?

Thanks,
Richard.

>2015-03-16  Jakub Jelinek  <jakub@redhat.com>
>
>	PR tree-optimization/65427
>	* tree-vect-generic.c (do_cond, expand_vector_scalar_condition): New
>	functions.
>	(expand_vector_operations_1): Handle BLKmode vector COND_EXPR.
>
>	* gcc.c-torture/execute/pr65427.c: New test.
>
>--- gcc/tree-vect-generic.c.jj	2015-01-15 20:25:40.000000000 +0100
>+++ gcc/tree-vect-generic.c	2015-03-16 14:25:37.391932269 +0100
>@@ -1417,6 +1417,57 @@ count_type_subparts (tree type)
>   return VECTOR_TYPE_P (type) ? TYPE_VECTOR_SUBPARTS (type) : 1;
> }
> 
>+static tree
>+do_cond (gimple_stmt_iterator *gsi, tree inner_type, tree a, tree b,
>+	 tree bitpos, tree bitsize, enum tree_code code)
>+{
>+  if (TREE_CODE (TREE_TYPE (a)) == VECTOR_TYPE)
>+    a = tree_vec_extract (gsi, inner_type, a, bitsize, bitpos);
>+  if (TREE_CODE (TREE_TYPE (b)) == VECTOR_TYPE)
>+    b = tree_vec_extract (gsi, inner_type, b, bitsize, bitpos);
>+  tree cond = gimple_assign_rhs1 (gsi_stmt (*gsi));
>+  return gimplify_build3 (gsi, code, inner_type, cond, a, b);
>+}
>+
>+/* Expand a vector COND_EXPR to scalars, piecewise.  */
>+static void
>+expand_vector_scalar_condition (gimple_stmt_iterator *gsi)
>+{
>+  gassign *stmt = as_a <gassign *> (gsi_stmt (*gsi));
>+  tree type = gimple_expr_type (stmt);
>+  tree compute_type = get_compute_type (COND_EXPR, mov_optab, type);
>+  machine_mode compute_mode = TYPE_MODE (compute_type);
>+  gcc_assert (compute_mode != BLKmode);
>+  tree lhs = gimple_assign_lhs (stmt);
>+  tree rhs2 = gimple_assign_rhs2 (stmt);
>+  tree rhs3 = gimple_assign_rhs3 (stmt);
>+  tree new_rhs;
>+
>+  /* If the compute mode is not a vector mode (hence we are not
>decomposing
>+     a BLKmode vector to smaller, hardware-supported vectors), we may
>want
>+     to expand the operations in parallel.  */
>+  if (GET_MODE_CLASS (compute_mode) != MODE_VECTOR_INT
>+      && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_FLOAT
>+      && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_FRACT
>+      && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_UFRACT
>+      && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_ACCUM
>+      && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_UACCUM)
>+    new_rhs = expand_vector_parallel (gsi, do_cond, type, rhs2, rhs3,
>+				      COND_EXPR);
>+  else
>+    new_rhs = expand_vector_piecewise (gsi, do_cond, type,
>compute_type,
>+				       rhs2, rhs3, COND_EXPR);
>+  if (!useless_type_conversion_p (TREE_TYPE (lhs), TREE_TYPE
>(new_rhs)))
>+    new_rhs = gimplify_build1 (gsi, VIEW_CONVERT_EXPR, TREE_TYPE
>(lhs),
>+			       new_rhs);
>+
>+  /* NOTE:  We should avoid using gimple_assign_set_rhs_from_tree. One
>+     way to do it is change expand_vector_operation and its callees to
>+     return a tree_code, RHS1 and RHS2 instead of a tree. */
>+  gimple_assign_set_rhs_from_tree (gsi, new_rhs);
>+  update_stmt (gsi_stmt (*gsi));
>+}
>+
>/* Process one statement.  If we identify a vector operation, expand
>it.  */
> 
> static void
>@@ -1449,6 +1500,14 @@ expand_vector_operations_1 (gimple_stmt_
>       return;
>     }
> 
>+  if (code == COND_EXPR
>+      && TREE_CODE (TREE_TYPE (gimple_assign_lhs (stmt))) ==
>VECTOR_TYPE
>+      && TYPE_MODE (TREE_TYPE (gimple_assign_lhs (stmt))) == BLKmode)
>+    {
>+      expand_vector_scalar_condition (gsi);
>+      return;
>+    }
>+
>   if (code == CONSTRUCTOR
>       && TREE_CODE (lhs) == SSA_NAME
>       && VECTOR_MODE_P (TYPE_MODE (TREE_TYPE (lhs)))
>--- gcc/testsuite/gcc.c-torture/execute/pr65427.c.jj	2015-03-16
>14:36:29.489254701 +0100
>+++ gcc/testsuite/gcc.c-torture/execute/pr65427.c	2015-03-16
>14:40:58.789851433 +0100
>@@ -0,0 +1,34 @@
>+/* PR tree-optimization/65427 */
>+
>+typedef int V __attribute__ ((vector_size (8 * sizeof (int))));
>+V a, b, c, d, e, f;
>+
>+__attribute__((noinline, noclone)) void
>+foo (int x, int y)
>+{
>+  do
>+    {
>+      if (x)
>+	d = a ^ c;
>+      else
>+	d = a ^ b;
>+    }
>+  while (y);
>+}
>+
>+int
>+main ()
>+{
>+  a = (V) { 1, 2, 3, 4, 5, 6, 7, 8 };
>+  b = (V) { 0x40, 0x80, 0x40, 0x80, 0x40, 0x80, 0x40, 0x80 };
>+  e = (V) { 0x41, 0x82, 0x43, 0x84, 0x45, 0x86, 0x47, 0x88 };
>+  foo (0, 0);
>+  if (__builtin_memcmp (&d, &e, sizeof (V)) != 0)
>+    __builtin_abort ();
>+  c = (V) { 0x80, 0x40, 0x80, 0x40, 0x80, 0x40, 0x80, 0x40 };
>+  f = (V) { 0x81, 0x42, 0x83, 0x44, 0x85, 0x46, 0x87, 0x48 };
>+  foo (1, 0);
>+  if (__builtin_memcmp (&d, &f, sizeof (V)) != 0)
>+    __builtin_abort ();
>+  return 0;
>+}
>
>	Jakub
Jakub Jelinek March 16, 2015, 6:26 p.m. UTC | #2
On Mon, Mar 16, 2015 at 07:15:59PM +0100, Richard Biener wrote:
> On March 16, 2015 5:21:02 PM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote:
> >On the following testcase, gimple LIM creates a vector COND_EXPR
> >(scalar
> >condition, vector lhs, rhs2 and rhs3), but if we don't have
> >corresponding
> >vector mode for it, we ICE trying to expand the BLKmode COND_EXPR, as
> >it is
> >unprepared for that.
> >
> >This patch lowers those (parallel or piecewise).
> >Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> OK.  Though maybe LIM should not create these for cost reasons? I also wonder if we should lower them to control flow?

Yeah, I've been thinking about teaching LIM not to do that if it is BLKmode.
But then found how many other spots create COND_EXPRs and thought I just
can't catch all of them.  But guess I can change LIM incrementally too.

I've also been thinking about lowering it to control flow, but only if it
couldn't be done in say two halves comparison as in the testcase.  I suppose
doing say 2 V4SImode COND_EXPRs would still be beneficial over control flow,
but if it is more than that (say 4 or 8+) it might be already better to just
expand it as a PHI.  But, as we don't create basic blocks in
tree-vect-generic.c right now, I thought it might be too much for stage4.

	Jakub
Richard Biener March 16, 2015, 6:34 p.m. UTC | #3
On March 16, 2015 7:26:58 PM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote:
>On Mon, Mar 16, 2015 at 07:15:59PM +0100, Richard Biener wrote:
>> On March 16, 2015 5:21:02 PM GMT+01:00, Jakub Jelinek
><jakub@redhat.com> wrote:
>> >On the following testcase, gimple LIM creates a vector COND_EXPR
>> >(scalar
>> >condition, vector lhs, rhs2 and rhs3), but if we don't have
>> >corresponding
>> >vector mode for it, we ICE trying to expand the BLKmode COND_EXPR,
>as
>> >it is
>> >unprepared for that.
>> >
>> >This patch lowers those (parallel or piecewise).
>> >Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>> 
>> OK.  Though maybe LIM should not create these for cost reasons? I
>also wonder if we should lower them to control flow?
>
>Yeah, I've been thinking about teaching LIM not to do that if it is
>BLKmode.
>But then found how many other spots create COND_EXPRs and thought I
>just
>can't catch all of them.  But guess I can change LIM incrementally too.
>
>I've also been thinking about lowering it to control flow, but only if
>it
>couldn't be done in say two halves comparison as in the testcase.  I
>suppose
>doing say 2 V4SImode COND_EXPRs would still be beneficial over control
>flow,
>but if it is more than that (say 4 or 8+) it might be already better to
>just
>expand it as a PHI.  But, as we don't create basic blocks in
>tree-vect-generic.c right now, I thought it might be too much for
>stage4.

Yeah, the patch is fine for stage 4.  I just wrote down my random thoughts.

Richard.

>	Jakub
Kyrylo Tkachov March 18, 2015, 3:10 p.m. UTC | #4
On 16/03/15 18:26, Jakub Jelinek wrote:
> On Mon, Mar 16, 2015 at 07:15:59PM +0100, Richard Biener wrote:
>> On March 16, 2015 5:21:02 PM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote:
>>> On the following testcase, gimple LIM creates a vector COND_EXPR
>>> (scalar
>>> condition, vector lhs, rhs2 and rhs3), but if we don't have
>>> corresponding
>>> vector mode for it, we ICE trying to expand the BLKmode COND_EXPR, as
>>> it is
>>> unprepared for that.
>>>
>>> This patch lowers those (parallel or piecewise).
>>> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>> OK.  Though maybe LIM should not create these for cost reasons? I also wonder if we should lower them to control flow?
> Yeah, I've been thinking about teaching LIM not to do that if it is BLKmode.
> But then found how many other spots create COND_EXPRs and thought I just
> can't catch all of them.  But guess I can change LIM incrementally too.
>
> I've also been thinking about lowering it to control flow, but only if it
> couldn't be done in say two halves comparison as in the testcase.  I suppose
> doing say 2 V4SImode COND_EXPRs would still be beneficial over control flow,
> but if it is more than that (say 4 or 8+) it might be already better to just
> expand it as a PHI.  But, as we don't create basic blocks in
> tree-vect-generic.c right now, I thought it might be too much for stage4.

Hi Jakub,
The testcase ICEs on arm-none-eabi at -Os (only):
0x7e6f11 process_insert_insn
         $TOP/gcc/gcse.c:2174
0x7e8a80 insert_insn_end_basic_block
         $TOP/gcc/gcse.c:2196
0x7eab01 hoist_code
         $TOP/gcc/gcse.c:3492
0x7eab01 one_code_hoisting_pass
         $TOP/gcc/gcse.c:3722
0x7eab01 execute_rtl_hoist
         $TOP/gcc/gcse.c:4212
0x7eab01 execute
         $TOP/gcc/gcse.c:4293

Thanks,
Kyrill

> 	Jakub
>
Jakub Jelinek March 18, 2015, 3:14 p.m. UTC | #5
On Wed, Mar 18, 2015 at 03:10:44PM +0000, Kyrill Tkachov wrote:
> >I've also been thinking about lowering it to control flow, but only if it
> >couldn't be done in say two halves comparison as in the testcase.  I suppose
> >doing say 2 V4SImode COND_EXPRs would still be beneficial over control flow,
> >but if it is more than that (say 4 or 8+) it might be already better to just
> >expand it as a PHI.  But, as we don't create basic blocks in
> >tree-vect-generic.c right now, I thought it might be too much for stage4.

> The testcase ICEs on arm-none-eabi at -Os (only):
> 0x7e6f11 process_insert_insn
>         $TOP/gcc/gcse.c:2174
> 0x7e8a80 insert_insn_end_basic_block
>         $TOP/gcc/gcse.c:2196
> 0x7eab01 hoist_code
>         $TOP/gcc/gcse.c:3492
> 0x7eab01 one_code_hoisting_pass
>         $TOP/gcc/gcse.c:3722
> 0x7eab01 execute_rtl_hoist
>         $TOP/gcc/gcse.c:4212
> 0x7eab01 execute
>         $TOP/gcc/gcse.c:4293

Must be some unrelated latent issue, either in arm backend or in gcse.
I think the lowering I've done could be very easily just present in user
code, so shouldn't be that hard to adjust the testcase so that it ICEs
already before the commit.

	Jakub
Kyrylo Tkachov March 18, 2015, 3:20 p.m. UTC | #6
On 18/03/15 15:14, Jakub Jelinek wrote:
> On Wed, Mar 18, 2015 at 03:10:44PM +0000, Kyrill Tkachov wrote:
>>> I've also been thinking about lowering it to control flow, but only if it
>>> couldn't be done in say two halves comparison as in the testcase.  I suppose
>>> doing say 2 V4SImode COND_EXPRs would still be beneficial over control flow,
>>> but if it is more than that (say 4 or 8+) it might be already better to just
>>> expand it as a PHI.  But, as we don't create basic blocks in
>>> tree-vect-generic.c right now, I thought it might be too much for stage4.
>> The testcase ICEs on arm-none-eabi at -Os (only):
>> 0x7e6f11 process_insert_insn
>>          $TOP/gcc/gcse.c:2174
>> 0x7e8a80 insert_insn_end_basic_block
>>          $TOP/gcc/gcse.c:2196
>> 0x7eab01 hoist_code
>>          $TOP/gcc/gcse.c:3492
>> 0x7eab01 one_code_hoisting_pass
>>          $TOP/gcc/gcse.c:3722
>> 0x7eab01 execute_rtl_hoist
>>          $TOP/gcc/gcse.c:4212
>> 0x7eab01 execute
>>          $TOP/gcc/gcse.c:4293
> Must be some unrelated latent issue, either in arm backend or in gcse.
> I think the lowering I've done could be very easily just present in user
> code, so shouldn't be that hard to adjust the testcase so that it ICEs
> already before the commit.

Yep, I'm seeing the same ICE in the testcsase before the patch.
Kyrill

>
> 	Jakub
>
Kyrylo Tkachov March 18, 2015, 3:53 p.m. UTC | #7
On 18/03/15 15:20, Kyrill Tkachov wrote:
> On 18/03/15 15:14, Jakub Jelinek wrote:
>> On Wed, Mar 18, 2015 at 03:10:44PM +0000, Kyrill Tkachov wrote:
>>>> I've also been thinking about lowering it to control flow, but only if it
>>>> couldn't be done in say two halves comparison as in the testcase.  I suppose
>>>> doing say 2 V4SImode COND_EXPRs would still be beneficial over control flow,
>>>> but if it is more than that (say 4 or 8+) it might be already better to just
>>>> expand it as a PHI.  But, as we don't create basic blocks in
>>>> tree-vect-generic.c right now, I thought it might be too much for stage4.
>>> The testcase ICEs on arm-none-eabi at -Os (only):
>>> 0x7e6f11 process_insert_insn
>>>           $TOP/gcc/gcse.c:2174
>>> 0x7e8a80 insert_insn_end_basic_block
>>>           $TOP/gcc/gcse.c:2196
>>> 0x7eab01 hoist_code
>>>           $TOP/gcc/gcse.c:3492
>>> 0x7eab01 one_code_hoisting_pass
>>>           $TOP/gcc/gcse.c:3722
>>> 0x7eab01 execute_rtl_hoist
>>>           $TOP/gcc/gcse.c:4212
>>> 0x7eab01 execute
>>>           $TOP/gcc/gcse.c:4293
>> Must be some unrelated latent issue, either in arm backend or in gcse.
>> I think the lowering I've done could be very easily just present in user
>> code, so shouldn't be that hard to adjust the testcase so that it ICEs
>> already before the commit.
> Yep, I'm seeing the same ICE in the testcsase before the patch.

With some help from James G, I've got a potential fix to gcse.c in testing.

Kyrill

> Kyrill
>
>> 	Jakub
>>
>
>
diff mbox

Patch

--- gcc/tree-vect-generic.c.jj	2015-01-15 20:25:40.000000000 +0100
+++ gcc/tree-vect-generic.c	2015-03-16 14:25:37.391932269 +0100
@@ -1417,6 +1417,57 @@  count_type_subparts (tree type)
   return VECTOR_TYPE_P (type) ? TYPE_VECTOR_SUBPARTS (type) : 1;
 }
 
+static tree
+do_cond (gimple_stmt_iterator *gsi, tree inner_type, tree a, tree b,
+	 tree bitpos, tree bitsize, enum tree_code code)
+{
+  if (TREE_CODE (TREE_TYPE (a)) == VECTOR_TYPE)
+    a = tree_vec_extract (gsi, inner_type, a, bitsize, bitpos);
+  if (TREE_CODE (TREE_TYPE (b)) == VECTOR_TYPE)
+    b = tree_vec_extract (gsi, inner_type, b, bitsize, bitpos);
+  tree cond = gimple_assign_rhs1 (gsi_stmt (*gsi));
+  return gimplify_build3 (gsi, code, inner_type, cond, a, b);
+}
+
+/* Expand a vector COND_EXPR to scalars, piecewise.  */
+static void
+expand_vector_scalar_condition (gimple_stmt_iterator *gsi)
+{
+  gassign *stmt = as_a <gassign *> (gsi_stmt (*gsi));
+  tree type = gimple_expr_type (stmt);
+  tree compute_type = get_compute_type (COND_EXPR, mov_optab, type);
+  machine_mode compute_mode = TYPE_MODE (compute_type);
+  gcc_assert (compute_mode != BLKmode);
+  tree lhs = gimple_assign_lhs (stmt);
+  tree rhs2 = gimple_assign_rhs2 (stmt);
+  tree rhs3 = gimple_assign_rhs3 (stmt);
+  tree new_rhs;
+
+  /* If the compute mode is not a vector mode (hence we are not decomposing
+     a BLKmode vector to smaller, hardware-supported vectors), we may want
+     to expand the operations in parallel.  */
+  if (GET_MODE_CLASS (compute_mode) != MODE_VECTOR_INT
+      && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_FLOAT
+      && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_FRACT
+      && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_UFRACT
+      && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_ACCUM
+      && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_UACCUM)
+    new_rhs = expand_vector_parallel (gsi, do_cond, type, rhs2, rhs3,
+				      COND_EXPR);
+  else
+    new_rhs = expand_vector_piecewise (gsi, do_cond, type, compute_type,
+				       rhs2, rhs3, COND_EXPR);
+  if (!useless_type_conversion_p (TREE_TYPE (lhs), TREE_TYPE (new_rhs)))
+    new_rhs = gimplify_build1 (gsi, VIEW_CONVERT_EXPR, TREE_TYPE (lhs),
+			       new_rhs);
+
+  /* NOTE:  We should avoid using gimple_assign_set_rhs_from_tree. One
+     way to do it is change expand_vector_operation and its callees to
+     return a tree_code, RHS1 and RHS2 instead of a tree. */
+  gimple_assign_set_rhs_from_tree (gsi, new_rhs);
+  update_stmt (gsi_stmt (*gsi));
+}
+
 /* Process one statement.  If we identify a vector operation, expand it.  */
 
 static void
@@ -1449,6 +1500,14 @@  expand_vector_operations_1 (gimple_stmt_
       return;
     }
 
+  if (code == COND_EXPR
+      && TREE_CODE (TREE_TYPE (gimple_assign_lhs (stmt))) == VECTOR_TYPE
+      && TYPE_MODE (TREE_TYPE (gimple_assign_lhs (stmt))) == BLKmode)
+    {
+      expand_vector_scalar_condition (gsi);
+      return;
+    }
+
   if (code == CONSTRUCTOR
       && TREE_CODE (lhs) == SSA_NAME
       && VECTOR_MODE_P (TYPE_MODE (TREE_TYPE (lhs)))
--- gcc/testsuite/gcc.c-torture/execute/pr65427.c.jj	2015-03-16 14:36:29.489254701 +0100
+++ gcc/testsuite/gcc.c-torture/execute/pr65427.c	2015-03-16 14:40:58.789851433 +0100
@@ -0,0 +1,34 @@ 
+/* PR tree-optimization/65427 */
+
+typedef int V __attribute__ ((vector_size (8 * sizeof (int))));
+V a, b, c, d, e, f;
+
+__attribute__((noinline, noclone)) void
+foo (int x, int y)
+{
+  do
+    {
+      if (x)
+	d = a ^ c;
+      else
+	d = a ^ b;
+    }
+  while (y);
+}
+
+int
+main ()
+{
+  a = (V) { 1, 2, 3, 4, 5, 6, 7, 8 };
+  b = (V) { 0x40, 0x80, 0x40, 0x80, 0x40, 0x80, 0x40, 0x80 };
+  e = (V) { 0x41, 0x82, 0x43, 0x84, 0x45, 0x86, 0x47, 0x88 };
+  foo (0, 0);
+  if (__builtin_memcmp (&d, &e, sizeof (V)) != 0)
+    __builtin_abort ();
+  c = (V) { 0x80, 0x40, 0x80, 0x40, 0x80, 0x40, 0x80, 0x40 };
+  f = (V) { 0x81, 0x42, 0x83, 0x44, 0x85, 0x46, 0x87, 0x48 };
+  foo (1, 0);
+  if (__builtin_memcmp (&d, &f, sizeof (V)) != 0)
+    __builtin_abort ();
+  return 0;
+}