Patchwork Handle more COMDAT profiling issues

login
register
mail settings
Submitter Teresa Johnson
Date March 22, 2014, 1:30 a.m.
Message ID <CAAe5K+UH7Gs6aEV=jqH2UrYhggeuoaE6yHk5skf42O5gUpO-+A@mail.gmail.com>
Download mbox | patch
Permalink /patch/332789/
State New
Headers show

Comments

Teresa Johnson - March 22, 2014, 1:30 a.m.
On Fri, Feb 28, 2014 at 9:13 AM, Teresa Johnson <tejohnson@google.com> wrote:
>>>> Here's the new patch. The only changes from the earlier patch are in
>>>> handle_missing_profiles, where we now get the counts off of the entry
>>>> and call stmt bbs, and in tree_profiling, where we call
>>>> handle_missing_profiles earlier and I have removed the outlined cgraph
>>>> rebuilding code since it doesn't need to be reinvoked.
>>>>
>>>> Honza, does this look ok for trunk when stage 1 reopens? David, I can
>>>> send a similar patch for review to google-4_8 if it looks good.
>>>>
>>>> Thanks,
>>>> Teresa
>>>>
> ...
>>
>> Spec testing of my earlier patch hit an issue with the call to
>> gimple_bb in this routine, since the caller was a thunk and therefore
>> the edge did not have a call_stmt set. I've attached a slightly
>> modified patch that guards the call by a check to
>> cgraph_function_with_gimple_body_p. Regression and spec testing are
>> clean.
>>

I made some more improvements to the patch based on more extensive
testing. Since we now use the bb counts instead of the cgraph edge
counts to determine call counts, it is important to do a topological
walk over the cgraph when dropping profiles. This ensures that the
counts we apply along a chain of calls whose profiles are being
dropped are consistent. I also realized that we may in fact need to
drop profiles on non-COMDATs - when a non-COMDAT was IPA inlined into
a COMDAT during the profile-gen build, its profile counts would also
be lost along with those of the COMDAT it was inlined into. So now the
patch will drop profiles on non-COMDATs if they are low and the caller
had its profile dropped.

Here is a link to the original patch with motivation for the change:
http://gcc.gnu.org/ml/gcc-patches/2014-02/msg00632.html

New patch below. Bootstrapped and tested on x86_64-unknown-linux-gnu.
Also profilebootstrapped and tested. Spec cpu2006 testing shows that
we drop profiles on more routines in 3 benchmarks (447.dealII,
450.soplex, 483.xalancbmk). I am seeing around 1% speedup consistently
on soplex when run on a Westmere.

Ok for stage 1?

Thanks,
Teresa

2014-03-21  Teresa Johnson  <tejohnson@google.com>

        * graphite.c (graphite_finalize): Pass new parameter.
        * params.def (PARAM_MIN_CALLER_REESTIMATE_RATIO): New.
        * predict.c (tree_estimate_probability): New parameter.
        (tree_estimate_probability_worker): Renamed from
        tree_estimate_probability_driver.
        (tree_reestimate_probability): New function.
        (tree_estimate_probability_driver): Invoke
        tree_estimate_probability_worker.
        (freqs_to_counts): Move here from tree-inline.c.
        (drop_profile): New parameter, re-estimate profiles when dropping
        counts.
        (handle_missing_profiles): Drop for some non-zero functions as well,
        get counts from bbs to support invocation before cgraph rebuild,
        and use a single topological walk over cgraph.
        (counts_to_freqs): Remove code obviated by reestimation.
        * predict.h (tree_estimate_probability): Update declaration.
        * tree-inline.c (freqs_to_counts): Move to predict.c.
        (copy_cfg_body): Remove code obviated by reestimation.
        * tree-profile.c (tree_profiling): Invoke handle_missing_profiles
        before cgraph rebuild.

Patch

Index: graphite.c
===================================================================
--- graphite.c  (revision 208492)
+++ graphite.c  (working copy)
@@ -247,7 +247,7 @@  graphite_finalize (bool need_cfg_cleanup_p)
       cleanup_tree_cfg ();
       profile_status_for_fn (cfun) = PROFILE_ABSENT;
       release_recorded_exits ();
-      tree_estimate_probability ();
+      tree_estimate_probability (false);
     }

   cloog_state_free (cloog_state);
Index: params.def
===================================================================
--- params.def  (revision 208492)
+++ params.def  (working copy)
@@ -44,6 +44,12 @@  DEFPARAM (PARAM_PREDICTABLE_BRANCH_OUTCOME,
          "Maximal estimated outcome of branch considered predictable",
          2, 0, 50)

+DEFPARAM (PARAM_MIN_CALLER_REESTIMATE_RATIO,
+         "min-caller-reestimate-ratio",
+         "Minimum caller-to-callee node count ratio to force reestimated branch "
+          "probabilities in callee (where 0 means only when callee count is 0)",
+         10, 0, 0)
+
 DEFPARAM (PARAM_INLINE_MIN_SPEEDUP,
          "inline-min-speedup",
          "The minimal estimated speedup allowing inliner to ignore inline-insns-single and inline-isnsns-auto",
Index: predict.c
===================================================================
--- predict.c   (revision 208492)
+++ predict.c   (working copy)
@@ -68,6 +68,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "tree-pass.h"
 #include "tree-scalar-evolution.h"
 #include "cfgloop.h"
+#include "ipa-utils.h"

 /* real constants: 0, 1, 1-1/REG_BR_PROB_BASE, REG_BR_PROB_BASE,
                   1/REG_BR_PROB_BASE, 0.5, BB_FREQ_MAX.  */
@@ -2379,10 +2380,12 @@  tree_estimate_probability_bb (basic_block bb)

 /* Predict branch probabilities and estimate profile of the tree CFG.
    This function can be called from the loop optimizers to recompute
-   the profile information.  */
+   the profile information.  When REDO is true then we are forcing
+   re-estimation of the probabilities because the profile was deemed
+   insufficient.  */

 void
-tree_estimate_probability (void)
+tree_estimate_probability (bool redo)
 {
   basic_block bb;

@@ -2390,7 +2393,8 @@  void
   connect_infinite_loops_to_exit ();
   /* We use loop_niter_by_eval, which requires that the loops have
      preheaders.  */
-  create_preheaders (CP_SIMPLE_PREHEADERS);
+  if (!redo)
+    create_preheaders (CP_SIMPLE_PREHEADERS);
   calculate_dominance_info (CDI_POST_DOMINATORS);

   bb_predictions = pointer_map_create ();
@@ -2412,16 +2416,16 @@  void
   pointer_map_destroy (bb_predictions);
   bb_predictions = NULL;

-  estimate_bb_frequencies (false);
+  estimate_bb_frequencies (redo);
   free_dominance_info (CDI_POST_DOMINATORS);
   remove_fake_exit_edges ();
 }

 /* Predict branch probabilities and estimate profile of the tree CFG.
-   This is the driver function for PASS_PROFILE.  */
+   When REDO is true, we are forcing reestimation of the probabilities.  */

-static unsigned int
-tree_estimate_probability_driver (void)
+static void
+tree_estimate_probability_worker (bool redo)
 {
   unsigned nb_loops;

@@ -2435,7 +2439,7 @@  void
   if (nb_loops > 1)
     scev_initialize ();

-  tree_estimate_probability ();
+  tree_estimate_probability (redo);

   if (nb_loops > 1)
     scev_finalize ();
@@ -2445,6 +2449,34 @@  void
     gimple_dump_cfg (dump_file, dump_flags);
   if (profile_status_for_fn (cfun) == PROFILE_ABSENT)
     profile_status_for_fn (cfun) = PROFILE_GUESSED;
+}
+
+/* Force re-estimation of the probabilities, because the profile was
+   deemed insufficient.  */
+
+static void
+tree_reestimate_probability (void)
+{
+  basic_block bb;
+  edge e;
+  edge_iterator ei;
+
+  /* Need to reset the counts to ensure probabilities are recomputed.  */
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      bb->count = 0;
+      FOR_EACH_EDGE (e, ei, bb->succs)
+        e->count = 0;
+    }
+  tree_estimate_probability_worker (true);
+}
+
+/* Estimate probabilities.
+   This is the driver function for PASS_PROFILE.  */
+static unsigned int
+tree_estimate_probability_driver (void)
+{
+  tree_estimate_probability_worker (false);
   return 0;
 }
 ^L
@@ -2765,33 +2797,64 @@  estimate_loops (void)
   BITMAP_FREE (tovisit);
 }

+/* Convert estimated frequencies into counts for NODE, scaling COUNT
+   with each bb's frequency. Used when NODE has an entry count that
+   is much lower than the caller edges reaching it.  See the comments
+   for handle_missing_profiles() for when this can happen for COMDATs.  */
+
+void
+freqs_to_counts (struct cgraph_node *node, gcov_type count)
+{
+  basic_block bb;
+  edge_iterator ei;
+  edge e;
+  struct function *fn = DECL_STRUCT_FUNCTION (node->decl);
+
+  FOR_ALL_BB_FN(bb, fn)
+    {
+      bb->count = apply_scale (count,
+                               GCOV_COMPUTE_SCALE (bb->frequency, BB_FREQ_MAX));
+      FOR_EACH_EDGE (e, ei, bb->succs)
+        e->count = apply_probability (e->src->count, e->probability);
+    }
+}
+
 /* Drop the profile for NODE to guessed, and update its frequency based on
-   whether it is expected to be hot given the CALL_COUNT.  */
+   whether it is expected to be hot given the CALL_COUNT. When
+   CALLER_DROPPED is true, then one of NODE's callers already
+   had its profile dropped.  */

 static void
-drop_profile (struct cgraph_node *node, gcov_type call_count)
+drop_profile (struct cgraph_node *node, gcov_type call_count,
+              bool caller_dropped)
 {
   struct function *fn = DECL_STRUCT_FUNCTION (node->decl);
-  /* In the case where this was called by another function with a
-     dropped profile, call_count will be 0. Since there are no
-     non-zero call counts to this function, we don't know for sure
-     whether it is hot, and therefore it will be marked normal below.  */
+
+  if (profile_status_for_fn (fn) != PROFILE_READ)
+    return;
+
   bool hot = maybe_hot_count_p (NULL, call_count);

   if (dump_file)
     fprintf (dump_file,
-             "Dropping 0 profile for %s/%i. %s based on calls.\n",
+             "Dropping %ld profile for %s/%i. %s based on calls.\n",
+             ENTRY_BLOCK_PTR_FOR_FN (fn)->count,
              node->name (), node->order,
              hot ? "Function is hot" : "Function is normal");
   /* We only expect to miss profiles for functions that are reached
-     via non-zero call edges in cases where the function may have
+     via much hotter call edges in cases where the function may have
      been linked from another module or library (COMDATs and extern
-     templates). See the comments below for handle_missing_profiles.
+     templates). However, if this is a function whose caller had
+     a dropped profile, it may not be a COMDAT. That would happen when
+     a non-COMDAT had been IPA inlined into a COMDAT, in which case
+     we could have lost its profile counts along with the COMDAT's.
+     See the comments below for handle_missing_profiles.
      Also, only warn in cases where the missing counts exceed the
      number of training runs. In certain cases with an execv followed
      by a no-return call the profile for the no-return call is not
      dumped and there can be a mismatch.  */
   if (!DECL_COMDAT (node->decl) && !DECL_EXTERNAL (node->decl)
+      && !caller_dropped
       && call_count > profile_info->runs)
     {
       if (flag_profile_correction)
@@ -2806,6 +2869,18 @@  static void
                  node->name (), node->order);
     }

+  /* Re-estimate the probabilities for function and use the estimated
+     frequencies to compute the counts.  */
+  push_cfun (DECL_STRUCT_FUNCTION (node->decl));
+  tree_reestimate_probability ();
+  freqs_to_counts (node, call_count);
+  if (dump_file)
+    {
+      fprintf (dump_file, "After re-estimating probabilies and counts\n");
+      gimple_dump_cfg (dump_file, dump_flags|TDF_DETAILS|TDF_BLOCKS|TDF_LINENO|TDF_STATS);
+    }
+  pop_cfun ();
+
   profile_status_for_fn (fn)
       = (flag_guess_branch_prob ? PROFILE_GUESSED : PROFILE_ABSENT);
   node->frequency
@@ -2815,80 +2890,110 @@  static void
 /* In the case of COMDAT routines, multiple object files will contain the same
    function and the linker will select one for the binary. In that case
    all the other copies from the profile instrument binary will be missing
-   profile counts. Look for cases where this happened, due to non-zero
-   call counts going to 0-count functions, and drop the profile to guessed
+   profile counts. This can confuse downstream optimizations such as
+   function splitting.
+
+   Look for cases where this happened, due to non-zero call counts going
+   to much lower count functions, and drop the profile to guessed
    so that we can use the estimated probabilities and avoid optimizing only
-   for size.
+   for size. In the case where the COMDAT was inlined in some locations
+   within the file and not others, the profile count will be non-zero due
+   to the inlined instances, but may still be significantly smaller than the
+   call edges for the non-inlined instances. Detect that case when requested
+   and reestimate probabilities, since the counts will not necessarily reflect
+   the behavior along the more frequent call paths.

    The other case where the profile may be missing is when the routine
    is not going to be emitted to the object file, e.g. for "extern template"
-   class methods. Those will be marked DECL_EXTERNAL. Emit a warning in
-   all other cases of non-zero calls to 0-count functions.  */
+   class methods. Those will be marked DECL_EXTERNAL.
+
+   Emit a warning in all other cases of hot calls to much lower-count functions,
+   except when we have already dropped the profile on a caller. A non-COMDAT
+   IPA inlined into a COMDAT during the profile-generate compile could
+   lose some of its counts along with the COMDAT it is inlined into.
+
+   This is now invoked before rebuilding the cgraph after reading profile
+   counts, so the cgraph edge and node counts are still 0. Therefore we
+   need to look at the counts on the entry bbs and the call stmt bbs.
+   That way we can avoid needing to rebuild the cgraph again to reflect
+   the nodes with dropped profiles.  */

 void
 handle_missing_profiles (void)
 {
   struct cgraph_node *node;
   int unlikely_count_fraction = PARAM_VALUE (UNLIKELY_BB_COUNT_FRACTION);
-  vec<struct cgraph_node *> worklist;
-  worklist.create (64);
+  int min_reest_ratio = PARAM_VALUE (PARAM_MIN_CALLER_REESTIMATE_RATIO);

-  /* See if 0 count function has non-0 count callers.  In this case we
-     lost some profile.  Drop its function profile to PROFILE_GUESSED.  */
-  FOR_EACH_DEFINED_FUNCTION (node)
+  /* Walk the cgraph nodes in topological order so the dropped counts
+     are propagated in a consistent way when the profile is dropped
+     for multiple routines along a call chain.  */
+  int nnodes;
+  cgraph_node_ptr *order;
+  order = ggc_alloc_vec_cgraph_node_ptr (cgraph_n_nodes);
+  nnodes = ipa_reverse_postorder (order);
+  for (int i = 0; i < nnodes; i++)
     {
+      node = order[i];
       struct cgraph_edge *e;
       gcov_type call_count = 0;
       gcov_type max_tp_first_run = 0;
       struct function *fn = DECL_STRUCT_FUNCTION (node->decl);

-      if (node->count)
-        continue;
+      bool caller_dropped = false;
       for (e = node->callers; e; e = e->next_caller)
-      {
-        call_count += e->count;
+        {
+          if (e->caller->tp_first_run > max_tp_first_run)
+            max_tp_first_run = e->caller->tp_first_run;

-       if (e->caller->tp_first_run > max_tp_first_run)
-         max_tp_first_run = e->caller->tp_first_run;
-      }
+          struct function *caller_fn = DECL_STRUCT_FUNCTION (e->caller->decl);
+          if (!caller_fn || !caller_fn->cfg || !gimple_bb (e->call_stmt))
+            continue;
+          if (profile_status_for_fn (caller_fn) != PROFILE_READ)
+            caller_dropped = true;

+          call_count += cgraph_function_with_gimple_body_p (e->caller)
+                        ? gimple_bb (e->call_stmt)->count : 0;
+        }
+
+      if (!fn || !fn->cfg)
+        continue;
+
+      gcov_type node_count = ENTRY_BLOCK_PTR_FOR_FN (fn)->count;
+
+      /* When the PARAM_MIN_CALLER_REESTIMATE_RATIO is 0, then we only drop
+         profiles for 0-count functions called by non-zero call edges.  */
+      if (!min_reest_ratio && node_count > 0)
+        continue;
+      else if (min_reest_ratio)
+        {
+          /* If all callers still have their profile, see if this function
+             has a significantly smaller count than expected, and if so
+             drop the profile.  */
+          if (!caller_dropped && node_count * min_reest_ratio > call_count)
+            continue;
+
+          /* If we did drop a caller's profile since it was significantly
+             smaller than its call edges, then drop the profile on any callees
+             whose node count is now exceeded by the new caller node count.
+             Since at least some of the bb counts used to compute the
+             aggregate call_count were estimated, don't require that
+             the ratio meet the same threshold.  */
+          else if (caller_dropped && node_count >= call_count)
+            continue;
+        }
+
       /* If time profile is missing, let assign the maximum that comes from
         caller functions.  */
       if (!node->tp_first_run && max_tp_first_run)
        node->tp_first_run = max_tp_first_run + 1;

       if (call_count
-          && fn && fn->cfg
           && (call_count * unlikely_count_fraction >= profile_info->runs))
         {
-          drop_profile (node, call_count);
-          worklist.safe_push (node);
+          drop_profile (node, call_count, caller_dropped);
         }
     }
-
-  /* Propagate the profile dropping to other 0-count COMDATs that are
-     potentially called by COMDATs we already dropped the profile on.  */
-  while (worklist.length () > 0)
-    {
-      struct cgraph_edge *e;
-
-      node = worklist.pop ();
-      for (e = node->callees; e; e = e->next_caller)
-        {
-          struct cgraph_node *callee = e->callee;
-          struct function *fn = DECL_STRUCT_FUNCTION (callee->decl);
-
-          if (callee->count > 0)
-            continue;
-          if (DECL_COMDAT (callee->decl) && fn && fn->cfg
-              && profile_status_for_fn (fn) == PROFILE_READ)
-            {
-              drop_profile (node, 0);
-              worklist.safe_push (callee);
-            }
-        }
-    }
-  worklist.release ();
 }

 /* Convert counts measured by profile driven feedback to frequencies.
@@ -2900,12 +3005,6 @@  counts_to_freqs (void)
   gcov_type count_max, true_count_max = 0;
   basic_block bb;

-  /* Don't overwrite the estimated frequencies when the profile for
-     the function is missing.  We may drop this function PROFILE_GUESSED
-     later in drop_profile ().  */
-  if (!ENTRY_BLOCK_PTR_FOR_FN (cfun)->count)
-    return 0;
-
   FOR_BB_BETWEEN (bb, ENTRY_BLOCK_PTR_FOR_FN (cfun), NULL, next_bb)
     true_count_max = MAX (bb->count, true_count_max);

Index: predict.h
===================================================================
--- predict.h   (revision 208492)
+++ predict.h   (working copy)
@@ -51,7 +51,7 @@  extern void handle_missing_profiles (void);
 extern void estimate_bb_frequencies (bool);
 extern const char *predictor_name (enum br_predictor);
 extern tree build_predict_expr (enum br_predictor, enum prediction);
-extern void tree_estimate_probability (void);
+extern void tree_estimate_probability (bool);
 extern void compute_function_frequency (void);
 extern void rebuild_frequencies (void);

Index: tree-inline.c
===================================================================
--- tree-inline.c       (revision 208492)
+++ tree-inline.c       (working copy)
@@ -2384,29 +2384,6 @@  redirect_all_calls (copy_body_data * id, basic_blo
     }
 }

-/* Convert estimated frequencies into counts for NODE, scaling COUNT
-   with each bb's frequency. Used when NODE has a 0-weight entry
-   but we are about to inline it into a non-zero count call bb.
-   See the comments for handle_missing_profiles() in predict.c for
-   when this can happen for COMDATs.  */
-
-void
-freqs_to_counts (struct cgraph_node *node, gcov_type count)
-{
-  basic_block bb;
-  edge_iterator ei;
-  edge e;
-  struct function *fn = DECL_STRUCT_FUNCTION (node->decl);
-
-  FOR_ALL_BB_FN(bb, fn)
-    {
-      bb->count = apply_scale (count,
-                               GCOV_COMPUTE_SCALE (bb->frequency, BB_FREQ_MAX));
-      FOR_EACH_EDGE (e, ei, bb->succs)
-        e->count = apply_probability (e->src->count, e->probability);
-    }
-}
-
 /* Make a copy of the body of FN so that it can be inserted inline in
    another function.  Walks FN via CFG, returns new fndecl.  */

@@ -2427,24 +2404,6 @@  copy_cfg_body (copy_body_data * id, gcov_type coun
   int incoming_frequency = 0;
   gcov_type incoming_count = 0;

-  /* This can happen for COMDAT routines that end up with 0 counts
-     despite being called (see the comments for handle_missing_profiles()
-     in predict.c as to why). Apply counts to the blocks in the callee
-     before inlining, using the guessed edge frequencies, so that we don't
-     end up with a 0-count inline body which can confuse downstream
-     optimizations such as function splitting.  */
-  if (!ENTRY_BLOCK_PTR_FOR_FN (src_cfun)->count && count)
-    {
-      /* Apply the larger of the call bb count and the total incoming
-         call edge count to the callee.  */
-      gcov_type in_count = 0;
-      struct cgraph_edge *in_edge;
-      for (in_edge = id->src_node->callers; in_edge;
-           in_edge = in_edge->next_caller)
-        in_count += in_edge->count;
-      freqs_to_counts (id->src_node, count > in_count ? count : in_count);
-    }
-
   if (ENTRY_BLOCK_PTR_FOR_FN (src_cfun)->count)
     count_scale
         = GCOV_COMPUTE_SCALE (count,
@@ -2452,6 +2411,13 @@  copy_cfg_body (copy_body_data * id, gcov_type coun
   else
     count_scale = REG_BR_PROB_BASE;

+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file,
+             "Scaling entry count %ld to %ld with scale %ld while inlining "
+             "%s into %s\n",
+             count, ENTRY_BLOCK_PTR_FOR_FN (src_cfun)->count, count_scale,
+             id->src_node->name (), id->dst_node->name ());
+
   /* Register specific tree functions.  */
   gimple_register_cfg_hooks ();

Index: tree-profile.c
===================================================================
--- tree-profile.c      (revision 208492)
+++ tree-profile.c      (working copy)
@@ -621,6 +621,8 @@  tree_profiling (void)
       cgraph_set_pure_flag (node, false, false);
     }

+  handle_missing_profiles ();
+
   /* Update call statements and rebuild the cgraph.  */
   FOR_EACH_DEFINED_FUNCTION (node)
     {
@@ -657,8 +659,6 @@  tree_profiling (void)
       pop_cfun ();
     }

-  handle_missing_profiles ();
-
   del_node_map ();
   return 0;
 }