Patchwork Tree tail merging breaks __builtin_unreachable optimization

login
register
mail settings
Submitter Tom de Vries
Date July 5, 2012, 12:49 p.m.
Message ID <4FF58D3E.4060105@mentor.com>
Download mbox | patch
Permalink /patch/169150/
State New
Headers show

Comments

Tom de Vries - July 5, 2012, 12:49 p.m.
On 04/07/12 19:02, Ulrich Weigand wrote:
> Any suggestions how to fix this?  Should tail merging detect
> __builtin_unreachable and not merge such block?  Or else, should
> the CFG optimizer be extended (how?) to handle unreachable blocks
> with multiple predecessors better?

Ulrich,

I extended the example as such:
...
int
foo(int a)
{
    if (a <= 0)
      {
#ifdef MERGED
      L1:
#endif
	__builtin_unreachable();
      }
    if (a > 2)
#ifdef MERGED
      goto L1;
#else
      __builtin_unreachable();
#endif
    return a > 0;
}
...

Indeed I can reproduce the problem:
...
$ gcc unreachable.c -O2 -S -o- -ftree-tail-merge
foo:
	.cfi_startproc
	testl	%edi, %edi
	jle	.L3
	cmpl	$2, %edi
	jg	.L3
	movl	$1, %eax
	ret
.L3:
	.cfi_endproc
...

And I can make the problem go away with -fno-tree-tail-merge:
...
$ gcc unreachable.c -O2 -S -o- -fno-tree-tail-merge
foo:
	.cfi_startproc
	movl	$1, %eax
	ret
	.cfi_endproc
...

But I can also reproduce the problem with -fno-tree-tail-merge by using -DMERGED:
,,,
$ gcc unreachable.c -O2 -S -o- -fno-tree-tail-merge -DMERGED
foo:
	.cfi_startproc
	testl	%edi, %edi
	jle	.L3
	cmpl	$2, %edi
	jg	.L3
	movl	$1, %eax
	ret
.L3:
	.cfi_endproc
,,,

So it seems that this is a problem of a missed optimization, triggered by, but
not exclusively triggered by -ftree-tail-merge.

How to fix the missed optimization, I'm not sure where it should be done.

I think the optimization could be done in tree-vrp. Currently the assert
expressions look like this:
...
.foo (intD.6 aD.1711)
{
  _BoolD.1693 D.1720;
  intD.6 D.1719;

  # BLOCK 2 freq:10000
  # PRED: ENTRY [100.0%]  (fallthru,exec)
  if (aD.1711_1(D) <= 0)
    goto <bb 3>;
  else
    goto <bb 4>;
  # SUCC: 3 [0.0%]  (true,exec) 4 [100.0%]  (false,exec)

  # BLOCK 3 freq:4
  # PRED: 2 [0.0%]  (true,exec)
  # VUSE <.MEMD.1722_4(D)>
  # USE = nonlocal
  # CLB = nonlocal
  __builtin_unreachableD.997 ();
  # SUCC:

  # BLOCK 4 freq:9996
  # PRED: 2 [100.0%]  (false,exec)
  aD.1711_6 = ASSERT_EXPR <aD.1711_1(D), aD.1711_1(D) > 0>;
  if (aD.1711_6 > 2)
    goto <bb 5>;
  else
    goto <bb 6>;
  # SUCC: 5 [0.0%]  (true,exec) 6 [100.0%]  (false,exec)

  # BLOCK 5 freq:4
  # PRED: 4 [0.0%]  (true,exec)
  # VUSE <.MEMD.1722_4(D)>
  # USE = nonlocal
  # CLB = nonlocal
  __builtin_unreachableD.997 ();
  # SUCC:

  # BLOCK 6 freq:9992
  # PRED: 4 [100.0%]  (false,exec)
  aD.1711_5 = ASSERT_EXPR <aD.1711_6, aD.1711_6 <= 2>;
  D.1720_2 = aD.1711_5 > 0;
  D.1719_3 = (intD.6) D.1720_2;
  # VUSE <.MEMD.1722_4(D)>
  return D.1719_3;
  # SUCC: EXIT [100.0%]

}
..

The asserts allow the return result to be optimized, but not the cfg conditions.

AFAIU, we can insert the asserts earlier. F.i., we can insert
  aD.1711_6 = ASSERT_EXPR <aD.1711_1(D), aD.1711_1(D) > 0>
before the GIMPLE_COND in bb2.

Attached proof-of-concept patch implements this, and works for this example both
with and without -DMERGED, independent of -ftree-tail-merge.

The only problem I can see with this approach is that doing this early (vrp1)
basically removes range annotations which might have had an effect later on. It
could be and idea to only do this in vrp2, which is not much earlier than
expand, were the rtl cfg optimization kicks in for the first time.

Thanks,
- Tom
Richard Guenther - July 5, 2012, 1:17 p.m.
On Thu, Jul 5, 2012 at 2:49 PM, Tom de Vries <Tom_deVries@mentor.com> wrote:
> On 04/07/12 19:02, Ulrich Weigand wrote:
>> Any suggestions how to fix this?  Should tail merging detect
>> __builtin_unreachable and not merge such block?  Or else, should
>> the CFG optimizer be extended (how?) to handle unreachable blocks
>> with multiple predecessors better?
>
> Ulrich,
>
> I extended the example as such:
> ...
> int
> foo(int a)
> {
>     if (a <= 0)
>       {
> #ifdef MERGED
>       L1:
> #endif
>         __builtin_unreachable();
>       }
>     if (a > 2)
> #ifdef MERGED
>       goto L1;
> #else
>       __builtin_unreachable();
> #endif
>     return a > 0;
> }
> ...
>
> Indeed I can reproduce the problem:
> ...
> $ gcc unreachable.c -O2 -S -o- -ftree-tail-merge
> foo:
>         .cfi_startproc
>         testl   %edi, %edi
>         jle     .L3
>         cmpl    $2, %edi
>         jg      .L3
>         movl    $1, %eax
>         ret
> .L3:
>         .cfi_endproc
> ...
>
> And I can make the problem go away with -fno-tree-tail-merge:
> ...
> $ gcc unreachable.c -O2 -S -o- -fno-tree-tail-merge
> foo:
>         .cfi_startproc
>         movl    $1, %eax
>         ret
>         .cfi_endproc
> ...
>
> But I can also reproduce the problem with -fno-tree-tail-merge by using -DMERGED:
> ,,,
> $ gcc unreachable.c -O2 -S -o- -fno-tree-tail-merge -DMERGED
> foo:
>         .cfi_startproc
>         testl   %edi, %edi
>         jle     .L3
>         cmpl    $2, %edi
>         jg      .L3
>         movl    $1, %eax
>         ret
> .L3:
>         .cfi_endproc
> ,,,
>
> So it seems that this is a problem of a missed optimization, triggered by, but
> not exclusively triggered by -ftree-tail-merge.
>
> How to fix the missed optimization, I'm not sure where it should be done.
>
> I think the optimization could be done in tree-vrp. Currently the assert
> expressions look like this:
> ...
> .foo (intD.6 aD.1711)
> {
>   _BoolD.1693 D.1720;
>   intD.6 D.1719;
>
>   # BLOCK 2 freq:10000
>   # PRED: ENTRY [100.0%]  (fallthru,exec)
>   if (aD.1711_1(D) <= 0)
>     goto <bb 3>;
>   else
>     goto <bb 4>;
>   # SUCC: 3 [0.0%]  (true,exec) 4 [100.0%]  (false,exec)
>
>   # BLOCK 3 freq:4
>   # PRED: 2 [0.0%]  (true,exec)
>   # VUSE <.MEMD.1722_4(D)>
>   # USE = nonlocal
>   # CLB = nonlocal
>   __builtin_unreachableD.997 ();
>   # SUCC:
>
>   # BLOCK 4 freq:9996
>   # PRED: 2 [100.0%]  (false,exec)
>   aD.1711_6 = ASSERT_EXPR <aD.1711_1(D), aD.1711_1(D) > 0>;
>   if (aD.1711_6 > 2)
>     goto <bb 5>;
>   else
>     goto <bb 6>;
>   # SUCC: 5 [0.0%]  (true,exec) 6 [100.0%]  (false,exec)
>
>   # BLOCK 5 freq:4
>   # PRED: 4 [0.0%]  (true,exec)
>   # VUSE <.MEMD.1722_4(D)>
>   # USE = nonlocal
>   # CLB = nonlocal
>   __builtin_unreachableD.997 ();
>   # SUCC:
>
>   # BLOCK 6 freq:9992
>   # PRED: 4 [100.0%]  (false,exec)
>   aD.1711_5 = ASSERT_EXPR <aD.1711_6, aD.1711_6 <= 2>;
>   D.1720_2 = aD.1711_5 > 0;
>   D.1719_3 = (intD.6) D.1720_2;
>   # VUSE <.MEMD.1722_4(D)>
>   return D.1719_3;
>   # SUCC: EXIT [100.0%]
>
> }
> ..
>
> The asserts allow the return result to be optimized, but not the cfg conditions.
>
> AFAIU, we can insert the asserts earlier. F.i., we can insert
>   aD.1711_6 = ASSERT_EXPR <aD.1711_1(D), aD.1711_1(D) > 0>
> before the GIMPLE_COND in bb2.
>
> Attached proof-of-concept patch implements this, and works for this example both
> with and without -DMERGED, independent of -ftree-tail-merge.

You don't need to do it this way - you can simply remove the CFG path
feeding __builtin_unreachable.  Putting the assert before the if looks
a bit backward, so I'd prefer to not do that.

> The only problem I can see with this approach is that doing this early (vrp1)
> basically removes range annotations which might have had an effect later on. It
> could be and idea to only do this in vrp2, which is not much earlier than
> expand, were the rtl cfg optimization kicks in for the first time.

Indeed.

Richard.

> Thanks,
> - Tom
Michael Matz - July 5, 2012, 1:30 p.m.
Hi,

On Thu, 5 Jul 2012, Tom de Vries wrote:

> The asserts allow the return result to be optimized, but not the cfg 
> conditions.
> 
> AFAIU, we can insert the asserts earlier. F.i., we can insert
>   aD.1711_6 = ASSERT_EXPR <aD.1711_1(D), aD.1711_1(D) > 0>
> before the GIMPLE_COND in bb2.

Nope.  That would require some more checks, in particular that the BB 
containing builtin_unreachable doesn't contain any other side-effects.  
Given this:

if (i < 0)
  { do_something_interesting();
    __builtin_unreachable();
  }

moving the assert before the if would remove the if condition, hence 
the call to do_something_interesting.  You need to retain side-effects if 
there are any.


Ciao,
Michael.

Patch

Index: gcc/tree-vrp.c
===================================================================
--- gcc/tree-vrp.c (revision 189007)
+++ gcc/tree-vrp.c (working copy)
@@ -4222,9 +4222,11 @@  register_new_assert_for (tree name, tree
 
   gcc_checking_assert (bb == NULL || e == NULL);
 
+#if 0
   if (e == NULL)
     gcc_checking_assert (gimple_code (gsi_stmt (si)) != GIMPLE_COND
 			 && gimple_code (gsi_stmt (si)) != GIMPLE_SWITCH);
+#endif
 
   /* Never build an assert comparing against an integer constant with
      TREE_OVERFLOW set.  This confuses our undefined overflow warning
@@ -4449,7 +4451,28 @@  register_edge_assert_for_2 (tree name, e
   if (live_on_edge (e, name)
       && !has_single_use (name))
     {
-      register_new_assert_for (name, name, comp_code, val, NULL, e, bsi);
+      bool only_reachable_edge = true;
+      edge_iterator ei2;
+      edge e2;
+      FOR_EACH_EDGE (e2, ei2, e->src->succs)
+	{
+	  if (e2 == e)
+	    continue;
+	  if (EDGE_COUNT (e2->dest->succs) == 0)
+	    continue;
+	  only_reachable_edge = false;
+	  break;
+	}
+
+      if (only_reachable_edge)
+	{
+	  gimple_stmt_iterator bsi2 = bsi;
+	  gsi_prev (&bsi2);
+	  register_new_assert_for (name, name, comp_code, val, e->src, NULL, bsi2);
+	  return true;
+	}
+      else
+	register_new_assert_for (name, name, comp_code, val, NULL, e, bsi);
       retval = true;
     }
 
@@ -5569,7 +5592,13 @@  process_assert_insertions_for (tree name
   /* Otherwise, we can insert right after LOC->SI iff the
      statement must not be the last statement in the block.  */
   stmt = gsi_stmt (loc->si);
-  if (!stmt_ends_bb_p (stmt))
+  if (stmt == NULL)
+    {
+      gimple_stmt_iterator si2 = gsi_last_bb (gsi_bb (loc->si));
+      gsi_insert_before (&si2, assert_stmt, GSI_SAME_STMT);
+      return false;
+    }
+  else if (!stmt_ends_bb_p (stmt))
     {
       gsi_insert_after (&loc->si, assert_stmt, GSI_SAME_STMT);
       return false;