diff mbox series

[V12] : Improve code sinking pass

Message ID 8344d83c-8e78-491d-85db-76aac47da023@linux.ibm.com
State New
Headers show
Series [V12] : Improve code sinking pass | expand

Commit Message

Ajit Agarwal March 13, 2024, 9:02 a.m. UTC
Hello All:

Currently, code sinking will sink code at the use points with loop having same
nesting depth. The following patch improves code sinking by placing the sunk
code in immediate dominator with same loop nest depth.

Changes since v11:

Reorganization of the code.

For example :

void bar();
int j;
void foo(int a, int b, int c, int d, int e, int f)
{
  int l;
  l = a + b + c + d +e + f;
  if (a != 5)
    {
      bar();
      j = l;
    }
}

Code Sinking does the following:

void bar();
int j;
void foo(int a, int b, int c, int d, int e, int f)
{
  int l;

  if (a != 5)
    {
      l = a + b + c + d +e + f;
      bar();
      j = l;
    }
}

Bootstrapped regtested on powerpc64-linux-gnu.

Thanks & Regards


tree-ssa-sink: Improve code sinking pass

Currently, code sinking will sink code at the use points with loop having same
nesting depth. The following patch improves code sinking by placing the sunk
code in immediate dominator with same loop nest depth.

2024-03-13  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>

gcc/ChangeLog:

        PR tree-optimization/81953
        * tree-ssa-sink.cc (statement_sink_location): Move statements with
        same loop nest depth.
        (select_best_block): Add heuristics to select the best blocks in the
        immediate dominator for same loop nest depth.

gcc/testsuite/ChangeLog:

        PR tree-optimization/81953
        * gcc.dg/tree-ssa/ssa-sink-21.c: New test.
        * gcc.dg/tree-ssa/ssa-sink-22.c: New test.
---
 gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c | 15 ++++++++++++
 gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c | 19 +++++++++++++++
 gcc/tree-ssa-sink.cc                        | 26 +++++++++++++++++----
 3 files changed, 55 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c

Comments

Richard Biener March 13, 2024, 10:22 a.m. UTC | #1
On Wed, Mar 13, 2024 at 10:02 AM Ajit Agarwal <aagarwa1@linux.ibm.com> wrote:
>
> Hello All:
>
> Currently, code sinking will sink code at the use points with loop having same
> nesting depth. The following patch improves code sinking by placing the sunk
> code in immediate dominator with same loop nest depth.
>
> Changes since v11:
>
> Reorganization of the code.
>
> For example :
>
> void bar();
> int j;
> void foo(int a, int b, int c, int d, int e, int f)
> {
>   int l;
>   l = a + b + c + d +e + f;
>   if (a != 5)
>     {
>       bar();
>       j = l;
>     }
> }
>
> Code Sinking does the following:
>
> void bar();
> int j;
> void foo(int a, int b, int c, int d, int e, int f)
> {
>   int l;
>
>   if (a != 5)
>     {
>       l = a + b + c + d +e + f;
>       bar();
>       j = l;
>     }
> }
>
> Bootstrapped regtested on powerpc64-linux-gnu.
>
> Thanks & Regards
>
>
> tree-ssa-sink: Improve code sinking pass
>
> Currently, code sinking will sink code at the use points with loop having same
> nesting depth. The following patch improves code sinking by placing the sunk
> code in immediate dominator with same loop nest depth.
>
> 2024-03-13  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>
>
> gcc/ChangeLog:
>
>         PR tree-optimization/81953
>         * tree-ssa-sink.cc (statement_sink_location): Move statements with
>         same loop nest depth.
>         (select_best_block): Add heuristics to select the best blocks in the
>         immediate dominator for same loop nest depth.
>
> gcc/testsuite/ChangeLog:
>
>         PR tree-optimization/81953
>         * gcc.dg/tree-ssa/ssa-sink-21.c: New test.
>         * gcc.dg/tree-ssa/ssa-sink-22.c: New test.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c | 15 ++++++++++++
>  gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c | 19 +++++++++++++++
>  gcc/tree-ssa-sink.cc                        | 26 +++++++++++++++++----
>  3 files changed, 55 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c
> new file mode 100644
> index 00000000000..d3b79ca5803
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-sink-stats" } */
> +void bar();
> +int j;
> +void foo(int a, int b, int c, int d, int e, int f)
> +{
> +  int l;
> +  l = a + b + c + d +e + f;
> +  if (a != 5)
> +    {
> +      bar();
> +      j = l;
> +    }
> +}
> +/* { dg-final { scan-tree-dump {l_12\s+=\s+_4\s+\+\s+f_11\(D\);\n\s+bar\s+\(\)} sink1 } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c
> new file mode 100644
> index 00000000000..84e7938c54f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-sink-stats" } */
> +void bar();
> +int j, x;
> +void foo(int a, int b, int c, int d, int e, int f)
> +{
> +  int l;
> +  l = a + b + c + d +e + f;
> +  if (a != 5)
> +    {
> +      bar();
> +      if (b != 3)
> +        x = 3;
> +      else
> +        x = 5;
> +      j = l;
> +    }
> +}
> +/* { dg-final { scan-tree-dump {l_13\s+=\s+_4\s+\+\s+f_12\(D\);\n\s+bar\s+\(\)} sink1 } } */
> diff --git a/gcc/tree-ssa-sink.cc b/gcc/tree-ssa-sink.cc
> index 880d6f70a80..40f51e2f3b9 100644
> --- a/gcc/tree-ssa-sink.cc
> +++ b/gcc/tree-ssa-sink.cc
> @@ -176,6 +176,9 @@ nearest_common_dominator_of_uses (def_operand_p def_p, bool *debug_stmts)
>     tree, return the best basic block between them (inclusive) to place
>     statements.
>
> +   The best basic block should be an immediate dominator of
> +   best basic block if we've moved to same loop nest.
> +
>     We want the most control dependent block in the shallowest loop nest.
>
>     If the resulting block is in a shallower loop nest, then use it.  Else
> @@ -209,6 +212,21 @@ select_best_block (basic_block early_bb,
>        temp_bb = get_immediate_dominator (CDI_DOMINATORS, temp_bb);
>      }
>
> +  temp_bb = best_bb;
> +  /* Move sinking to immediate dominator if the statement to be moved
> +     is not memory operand and same loop nest.  */
> +  if (best_bb == late_bb
> +      && !gimple_vuse (stmt))
> +    {
> +      while (temp_bb != early_bb)
> +       {
> +         if (bb_loop_depth (temp_bb) == bb_loop_depth (best_bb))
> +           best_bb = temp_bb;
> +
> +         temp_bb = get_immediate_dominator (CDI_DOMINATORS, temp_bb);
> +       }
> +     }
> +

I don't think this is what sinking should do.  Instead ...

>    /* Placing a statement before a setjmp-like function would be invalid
>       (it cannot be reevaluated when execution follows an abnormal edge).
>       If we selected a block with abnormal predecessors, just punt.  */
> @@ -250,7 +268,7 @@ select_best_block (basic_block early_bb,
>        /* If result of comparsion is unknown, prefer EARLY_BB.
>          Thus use !(...>=..) rather than (...<...)  */
>        && !(best_bb->count * 100 >= early_bb->count * threshold))
> -    return best_bb;
> +     return best_bb;
>
>    /* No better block found, so return EARLY_BB, which happens to be the
>       statement's original block.  */
> @@ -430,6 +448,7 @@ statement_sink_location (gimple *stmt, basic_block frombb,
>             continue;
>           break;
>         }
> +
>        use = USE_STMT (one_use);
>
>        if (gimple_code (use) != GIMPLE_PHI)
> @@ -439,10 +458,7 @@ statement_sink_location (gimple *stmt, basic_block frombb,
>           if (sinkbb == frombb)
>             return false;
>
> -         if (sinkbb == gimple_bb (use))
> -           *togsi = gsi_for_stmt (use);
> -         else
> -           *togsi = gsi_after_labels (sinkbb);
> +         *togsi = gsi_after_labels (sinkbb);

... this hunk is OK (please test and split it out separatley).  In the spirit of
moving the stmt the least amount (in this case not schedule it within the
basic-block).  In the same spirit one would choose an earlier basic-block
but only if the old choosen one post-dominates that, dominance isn't
a good criteria since you'd move it where the computation might not be
needed.  A practical testcase would be

  tem = a + b;
  if (foo)
    bar ();
  tem2 = tem + d;

where we at the moment would sink 'tem = a+ b' to the block containing
'tem2 = tem + d' not reducing the number of evaluations (of course bar()
might not return, but that's a minor detail).  Code motion like that should
be subject to register-pressure considerations which we do not estimate
here at all.  So it could be argued we shouldn't perform any sinking here.

But compared to not scheduling within a BB this requires separate
evaluation as of course this can in some cases reduce register
pressure (you exempted loads for example).  Note that's true for
scheduling within a BB as well but we tend to "break" that easily with TER.

A good first-order heuristic would be to avoid the scheduling
when the number of non-virtual SSA uses on the stmt to be moved is bigger
than one.  For zero we reduce the lifetime of the def.  For one we're not
making things worse.  For more uses it depends on whether we're moving
within the lifetime of the uses and it becomes a global problem (we're
greedily moving dependent statements, so we even get "local global" wrong
then).

That said, changing will cause regressions, given both before and after
is somewhat ad-hoc it's hard to argue one is more correct than the other.

IMO scheduling should be left to a stmt scheduler on GIMPLE
(which we don't have).

Richard.

>           return true;
>         }
> --
> 2.39.3
>
Jeff Law March 13, 2024, 8:27 p.m. UTC | #2
On 3/13/24 4:22 AM, Richard Biener wrote:

> 
> ... this hunk is OK (please test and split it out separatley).  In the spirit of
> moving the stmt the least amount (in this case not schedule it within the
> basic-block).  In the same spirit one would choose an earlier basic-block
> but only if the old choosen one post-dominates that, dominance isn't
> a good criteria since you'd move it where the computation might not be
> needed.  A practical testcase would be
> 
>    tem = a + b;
>    if (foo)
>      bar ();
>    tem2 = tem + d;
> 
> where we at the moment would sink 'tem = a+ b' to the block containing
> 'tem2 = tem + d' not reducing the number of evaluations (of course bar()
> might not return, but that's a minor detail).  Code motion like that should
> be subject to register-pressure considerations which we do not estimate
> here at all.  So it could be argued we shouldn't perform any sinking here.
Agreed.  This looks more like a scheduling and register-pressure issue 
rather than a classic sinking issue.

Sinking is supposed to be moving code to lesser executed points.  In the 
case above, the only way sinking into the tem2 = block would be if bar() 
doesn't return.  It just doesn't make sense to me from a sinking standpoint.

The block execution data generally prevents this kind of gratuitous 
movement.

I actually evaluated our sinking code several years ago against an 
implementation of Click's algorithm.  In general they were quite 
comparable in terms of selecting an "optimal" block from an execution 
standpoint.  There were a couple of fixes that were added to our 
implementation at that time, but again, generally we were picking 
sensible blocks.


> 
> A good first-order heuristic would be to avoid the scheduling
> when the number of non-virtual SSA uses on the stmt to be moved is bigger
> than one.  For zero we reduce the lifetime of the def.  For one we're not
> making things worse.  For more uses it depends on whether we're moving
> within the lifetime of the uses and it becomes a global problem (we're
> greedily moving dependent statements, so we even get "local global" wrong
> then).
> 
> That said, changing will cause regressions, given both before and after
> is somewhat ad-hoc it's hard to argue one is more correct than the other.
> 
> IMO scheduling should be left to a stmt scheduler on GIMPLE
> (which we don't have).
Click's work can function as a statement scheduler, though I'm not 
convinced it's actually a good one.  Essentially most statements are 
conceptually disassociated from their blocks, then re-scheduled by 
visiting defining statements of "pinned" instructions.  That model is 
mostly for driving redundancy elimination.  Scheduling is just a side 
effect.



Bernd had a statement scheduler for gimple years ago, but it was 
somewhat controversial at the time and never moved forward enough to get 
integrated.  IIRC it ran just before or just after TER and its primary 
objective was to avoid some of the pathological cases that ultimately 
result in significant spilling after we're done with the bulk of the RTL 
pipeline.
Richard Biener March 14, 2024, 7:43 a.m. UTC | #3
On Wed, Mar 13, 2024 at 9:27 PM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 3/13/24 4:22 AM, Richard Biener wrote:
>
> >
> > ... this hunk is OK (please test and split it out separatley).  In the spirit of
> > moving the stmt the least amount (in this case not schedule it within the
> > basic-block).  In the same spirit one would choose an earlier basic-block
> > but only if the old choosen one post-dominates that, dominance isn't
> > a good criteria since you'd move it where the computation might not be
> > needed.  A practical testcase would be
> >
> >    tem = a + b;
> >    if (foo)
> >      bar ();
> >    tem2 = tem + d;
> >
> > where we at the moment would sink 'tem = a+ b' to the block containing
> > 'tem2 = tem + d' not reducing the number of evaluations (of course bar()
> > might not return, but that's a minor detail).  Code motion like that should
> > be subject to register-pressure considerations which we do not estimate
> > here at all.  So it could be argued we shouldn't perform any sinking here.
> Agreed.  This looks more like a scheduling and register-pressure issue
> rather than a classic sinking issue.
>
> Sinking is supposed to be moving code to lesser executed points.  In the
> case above, the only way sinking into the tem2 = block would be if bar()
> doesn't return.  It just doesn't make sense to me from a sinking standpoint.
>
> The block execution data generally prevents this kind of gratuitous
> movement.
>
> I actually evaluated our sinking code several years ago against an
> implementation of Click's algorithm.  In general they were quite
> comparable in terms of selecting an "optimal" block from an execution
> standpoint.  There were a couple of fixes that were added to our
> implementation at that time, but again, generally we were picking
> sensible blocks.
>
>
> >
> > A good first-order heuristic would be to avoid the scheduling
> > when the number of non-virtual SSA uses on the stmt to be moved is bigger
> > than one.  For zero we reduce the lifetime of the def.  For one we're not
> > making things worse.  For more uses it depends on whether we're moving
> > within the lifetime of the uses and it becomes a global problem (we're
> > greedily moving dependent statements, so we even get "local global" wrong
> > then).
> >
> > That said, changing will cause regressions, given both before and after
> > is somewhat ad-hoc it's hard to argue one is more correct than the other.
> >
> > IMO scheduling should be left to a stmt scheduler on GIMPLE
> > (which we don't have).
> Click's work can function as a statement scheduler, though I'm not
> convinced it's actually a good one.  Essentially most statements are
> conceptually disassociated from their blocks, then re-scheduled by
> visiting defining statements of "pinned" instructions.  That model is
> mostly for driving redundancy elimination.  Scheduling is just a side
> effect.
>
>
>
> Bernd had a statement scheduler for gimple years ago, but it was
> somewhat controversial at the time and never moved forward enough to get
> integrated.  IIRC it ran just before or just after TER and its primary
> objective was to avoid some of the pathological cases that ultimately
> result in significant spilling after we're done with the bulk of the RTL
> pipeline.

Yeah, the most difficult thing with scheduling on GIMPLE is the interaction
with TER.  TER does have some "scheduling boundaries" it respects
(I'd have to look them up), so GIMPLE scheduling that only looks at
scheduling across such boundaries might be useful.

Richard.
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c
new file mode 100644
index 00000000000..d3b79ca5803
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-sink-stats" } */
+void bar();
+int j;
+void foo(int a, int b, int c, int d, int e, int f)
+{
+  int l;
+  l = a + b + c + d +e + f;
+  if (a != 5)
+    {
+      bar();
+      j = l;
+    }
+}
+/* { dg-final { scan-tree-dump {l_12\s+=\s+_4\s+\+\s+f_11\(D\);\n\s+bar\s+\(\)} sink1 } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c
new file mode 100644
index 00000000000..84e7938c54f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-sink-stats" } */
+void bar();
+int j, x;
+void foo(int a, int b, int c, int d, int e, int f)
+{
+  int l;
+  l = a + b + c + d +e + f;
+  if (a != 5)
+    {
+      bar();
+      if (b != 3)
+        x = 3;
+      else
+        x = 5;
+      j = l;
+    }
+}
+/* { dg-final { scan-tree-dump {l_13\s+=\s+_4\s+\+\s+f_12\(D\);\n\s+bar\s+\(\)} sink1 } } */
diff --git a/gcc/tree-ssa-sink.cc b/gcc/tree-ssa-sink.cc
index 880d6f70a80..40f51e2f3b9 100644
--- a/gcc/tree-ssa-sink.cc
+++ b/gcc/tree-ssa-sink.cc
@@ -176,6 +176,9 @@  nearest_common_dominator_of_uses (def_operand_p def_p, bool *debug_stmts)
    tree, return the best basic block between them (inclusive) to place
    statements.
 
+   The best basic block should be an immediate dominator of
+   best basic block if we've moved to same loop nest.
+
    We want the most control dependent block in the shallowest loop nest.
 
    If the resulting block is in a shallower loop nest, then use it.  Else
@@ -209,6 +212,21 @@  select_best_block (basic_block early_bb,
       temp_bb = get_immediate_dominator (CDI_DOMINATORS, temp_bb);
     }
 
+  temp_bb = best_bb;
+  /* Move sinking to immediate dominator if the statement to be moved
+     is not memory operand and same loop nest.  */
+  if (best_bb == late_bb
+      && !gimple_vuse (stmt))
+    {
+      while (temp_bb != early_bb)
+	{
+	  if (bb_loop_depth (temp_bb) == bb_loop_depth (best_bb))
+	    best_bb = temp_bb;
+
+	  temp_bb = get_immediate_dominator (CDI_DOMINATORS, temp_bb);
+	}
+     }
+
   /* Placing a statement before a setjmp-like function would be invalid
      (it cannot be reevaluated when execution follows an abnormal edge).
      If we selected a block with abnormal predecessors, just punt.  */
@@ -250,7 +268,7 @@  select_best_block (basic_block early_bb,
       /* If result of comparsion is unknown, prefer EARLY_BB.
 	 Thus use !(...>=..) rather than (...<...)  */
       && !(best_bb->count * 100 >= early_bb->count * threshold))
-    return best_bb;
+     return best_bb;
 
   /* No better block found, so return EARLY_BB, which happens to be the
      statement's original block.  */
@@ -430,6 +448,7 @@  statement_sink_location (gimple *stmt, basic_block frombb,
 	    continue;
 	  break;
 	}
+
       use = USE_STMT (one_use);
 
       if (gimple_code (use) != GIMPLE_PHI)
@@ -439,10 +458,7 @@  statement_sink_location (gimple *stmt, basic_block frombb,
 	  if (sinkbb == frombb)
 	    return false;
 
-	  if (sinkbb == gimple_bb (use))
-	    *togsi = gsi_for_stmt (use);
-	  else
-	    *togsi = gsi_after_labels (sinkbb);
+	  *togsi = gsi_after_labels (sinkbb);
 
 	  return true;
 	}