diff mbox

PR80218: Call CDCE fails to update the block profile

Message ID 871sthbme3.fsf@e105548-lin.cambridge.arm.com
State New
Headers show

Commit Message

Richard Sandiford March 28, 2017, 12:45 p.m. UTC
tree-call-cdce.c was updating the edge probabilities and counts but
it wasn't updating the corresponding block information.  Among other
things, this tricked the register allocator into thinking that the
libm call was relatively hot and that it wasn't worth assigning
call-clobbered registers to values that were live across the call.
With correct frequency information, the RA instead keeps x in the
first argument register and spills it only around the call.

Although the problem has been around for a long time, it became more
acute (and would only trigger for the first function in the testcase)
after r230488.  Until that patch, the code was specific to calls that
had no lhs, but that we still had to keep for their effect on errno.
After the patch we also used the code for calls with an lhs, provided
that the hardware could calculate the lhs directly.

This showed up as a significant performance drop in an internal
benchmark from GCC 5 to GCC 6.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK for trunk
and gcc-6-branch?

Thanks,
Richard


gcc/
	PR tree-optimization/80218
	* tree-call-cdce.c (shrink_wrap_one_built_in_call_with_conds):
	Update block frequencies and counts.

gcc/testsuite/
	PR tree-optimization/80218
	* gcc.dg/pr80218.c: New test.

Comments

Jeff Law March 28, 2017, 3:14 p.m. UTC | #1
On 03/28/2017 06:45 AM, Richard Sandiford wrote:
> tree-call-cdce.c was updating the edge probabilities and counts but
> it wasn't updating the corresponding block information.  Among other
> things, this tricked the register allocator into thinking that the
> libm call was relatively hot and that it wasn't worth assigning
> call-clobbered registers to values that were live across the call.
> With correct frequency information, the RA instead keeps x in the
> first argument register and spills it only around the call.
>
> Although the problem has been around for a long time, it became more
> acute (and would only trigger for the first function in the testcase)
> after r230488.  Until that patch, the code was specific to calls that
> had no lhs, but that we still had to keep for their effect on errno.
> After the patch we also used the code for calls with an lhs, provided
> that the hardware could calculate the lhs directly.
>
> This showed up as a significant performance drop in an internal
> benchmark from GCC 5 to GCC 6.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK for trunk
> and gcc-6-branch?
>
> Thanks,
> Richard
>
>
> gcc/
> 	PR tree-optimization/80218
> 	* tree-call-cdce.c (shrink_wrap_one_built_in_call_with_conds):
> 	Update block frequencies and counts.
>
> gcc/testsuite/
> 	PR tree-optimization/80218
> 	* gcc.dg/pr80218.c: New test.

OK.  I went ahead and installed this for you.

Thanks,
Jeff
diff mbox

Patch

diff --git a/gcc/testsuite/gcc.dg/pr80218.c b/gcc/testsuite/gcc.dg/pr80218.c
new file mode 100644
index 0000000..8669ee1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr80218.c
@@ -0,0 +1,28 @@ 
+/* { dg-options "-O2 -fdump-rtl-ira-details-blocks" } */
+/* { dg-require-effective-target c99_runtime } */
+
+#include <math.h>
+
+void foo (float *);
+
+void
+f1 (float *x)
+{
+  x[0] = sqrtf (x[0]);
+}
+
+void
+f2 (float *x)
+{
+  sqrtf (x[0]);
+  foo (x);
+}
+
+void
+f3 (float *x)
+{
+  acosf (x[0]);
+  foo (x);
+}
+
+/* { dg-final { scan-rtl-dump-not "Invalid sum" "ira" } } */
diff --git a/gcc/tree-call-cdce.c b/gcc/tree-call-cdce.c
index 10d2c47..7bd242c 100644
--- a/gcc/tree-call-cdce.c
+++ b/gcc/tree-call-cdce.c
@@ -847,10 +847,12 @@  shrink_wrap_one_built_in_call_with_conds (gcall *bi_call, vec <gimple *> conds,
       gsi_insert_before (&bi_call_bsi, c, GSI_SAME_STMT);
       cond_expr = c;
     }
-  nconds--;
   ci++;
   gcc_assert (cond_expr && gimple_code (cond_expr) == GIMPLE_COND);
 
+  typedef std::pair<edge, edge> edge_pair;
+  auto_vec<edge_pair, 8> edges;
+
   bi_call_in_edge0 = split_block (bi_call_bb, cond_expr);
   bi_call_in_edge0->flags &= ~EDGE_FALLTHRU;
   bi_call_in_edge0->flags |= EDGE_FALSE_VALUE;
@@ -859,17 +861,11 @@  shrink_wrap_one_built_in_call_with_conds (gcall *bi_call, vec <gimple *> conds,
   join_tgt_in_edge_fall_thru = make_edge (guard_bb, join_tgt_bb,
                                           EDGE_TRUE_VALUE);
 
-  bi_call_in_edge0->probability = REG_BR_PROB_BASE * ERR_PROB;
-  bi_call_in_edge0->count =
-      apply_probability (guard_bb->count,
-			 bi_call_in_edge0->probability);
-  join_tgt_in_edge_fall_thru->probability =
-      inverse_probability (bi_call_in_edge0->probability);
-  join_tgt_in_edge_fall_thru->count =
-      guard_bb->count - bi_call_in_edge0->count;
+  edges.reserve (nconds);
+  edges.quick_push (edge_pair (bi_call_in_edge0, join_tgt_in_edge_fall_thru));
 
   /* Code generation for the rest of the conditions  */
-  while (nconds > 0)
+  for (unsigned int i = 1; i < nconds; ++i)
     {
       unsigned ci0;
       edge bi_call_in_edge;
@@ -885,7 +881,6 @@  shrink_wrap_one_built_in_call_with_conds (gcall *bi_call, vec <gimple *> conds,
           gsi_insert_before (&guard_bsi, c, GSI_SAME_STMT);
           cond_expr = c;
         }
-      nconds--;
       ci++;
       gcc_assert (cond_expr && gimple_code (cond_expr) == GIMPLE_COND);
       guard_bb_in_edge = split_block (guard_bb, cond_expr);
@@ -893,14 +888,51 @@  shrink_wrap_one_built_in_call_with_conds (gcall *bi_call, vec <gimple *> conds,
       guard_bb_in_edge->flags |= EDGE_TRUE_VALUE;
 
       bi_call_in_edge = make_edge (guard_bb, bi_call_bb, EDGE_FALSE_VALUE);
+      edges.quick_push (edge_pair (bi_call_in_edge, guard_bb_in_edge));
+    }
+
+  /* Now update the probability and profile information, processing the
+     guards in order of execution.
+
+     There are two approaches we could take here.  On the one hand we
+     could assign a probability of X to the call block and distribute
+     that probability among its incoming edges.  On the other hand we
+     could assign a probability of X to each individual call edge.
+
+     The choice only affects calls that have more than one condition.
+     In those cases, the second approach would give the call block
+     a greater probability than the first.  However, the difference
+     is only small, and our chosen X is a pure guess anyway.
+
+     Here we take the second approach because it's slightly simpler
+     and because it's easy to see that it doesn't lose profile counts.  */
+  bi_call_bb->count = 0;
+  bi_call_bb->frequency = 0;
+  while (!edges.is_empty ())
+    {
+      edge_pair e = edges.pop ();
+      edge call_edge = e.first;
+      edge nocall_edge = e.second;
+      basic_block src_bb = call_edge->src;
+      gcc_assert (src_bb == nocall_edge->src);
+
+      call_edge->probability = REG_BR_PROB_BASE * ERR_PROB;
+      call_edge->count = apply_probability (src_bb->count,
+					    call_edge->probability);
+      nocall_edge->probability = inverse_probability (call_edge->probability);
+      nocall_edge->count = src_bb->count - call_edge->count;
+
+      unsigned int call_frequency = apply_probability (src_bb->frequency,
+						       call_edge->probability);
 
-      bi_call_in_edge->probability = REG_BR_PROB_BASE * ERR_PROB;
-      bi_call_in_edge->count =
-	  apply_probability (guard_bb->count,
-			     bi_call_in_edge->probability);
-      guard_bb_in_edge->probability =
-          inverse_probability (bi_call_in_edge->probability);
-      guard_bb_in_edge->count = guard_bb->count - bi_call_in_edge->count;
+      bi_call_bb->count += call_edge->count;
+      bi_call_bb->frequency += call_frequency;
+
+      if (nocall_edge->dest != join_tgt_bb)
+	{
+	  nocall_edge->dest->count = nocall_edge->count;
+	  nocall_edge->dest->frequency = src_bb->frequency - call_frequency;
+	}
     }
 
   if (dom_info_available_p (CDI_DOMINATORS))