diff mbox

Sanitize block partitioning under -freorder-blocks-and-partition

Message ID CAAe5K+XM_RCJBmtKSyQCfD86hi2Ls_BCyuZkd2WKSoGUFm84DA@mail.gmail.com
State New
Headers show

Commit Message

Teresa Johnson Aug. 2, 2013, 2:51 p.m. UTC
On Fri, Aug 2, 2013 at 4:22 AM, Bernhard Reutner-Fischer
<rep.dot.nop@gmail.com> wrote:
> On 1 August 2013 18:32, Teresa Johnson <tejohnson@google.com> wrote:
>> Patch 3 of 3 split out from the patch I sent in May that fixes problems with
>> -freorder-blocks-and-partition, with changes/fixes discussed in that thread.
>>
>> See http://gcc.gnu.org/ml/gcc-patches/2013-05/threads.html#00388 for context.
>>
>> This patch sanitizes the partitioning to address issues such as edge
>> weight insanities that sometimes occur due to upstream optimizations,
>> and ensures that hot blocks are not dominated by cold blocks. This
>> needs to be resanitized after certain cfg optimizations that may
>> cause hot blocks previously reached via both hot and cold paths to
>> only be reached by cold paths.
>>
>> The verification code in sanitize_dominator_hotness was contributed by
>> Steven Bosscher.
>>
>> Bootstrapped and tested on x86-64-unknown-linux-gnu. Also ensured that
>> a profiledbootstrap passed with -freorder-blocks-and-partition enabled
>> (and with the dwarf version changed to 2 to work around PR57451).
>>
>> Ok for trunk?
>>
>> (I also included the patch as an attachment since my mailer invariably
>> messes up the formatting in the pasted version.)
>>
>> Thanks,
>> Teresa
>>
>> 2013-08-01  Teresa Johnson  <tejohnson@google.com>
>>             Steven Bosscher  <steven@gcc.gnu.org>
>>
>>         * cfgrtl.c (fixup_bb_partition): New routine.
>>         (commit_edge_insertions): Invoke fixup_partitions.
>>         (find_partition_fixes): New routine.
>>         (fixup_partitions): Ditto.
>>         (verify_hot_cold_block_grouping): Update comments.
>>         (rtl_verify_edges): Invoke find_partition_fixes.
>>         (rtl_verify_bb_pointers): Update comments.
>>         (rtl_verify_bb_layout): Ditto.
>>         * basic-block.h (fixup_partitions): Declare.
>>         * cfgcleanup.c (try_optimize_cfg): Invoke fixup_partitions.
>>         * bb-reorder.c (sanitize_dominator_hotness): New function.
>>         (find_rarely_executed_basic_blocks_and_crossing_edges): Invoke
>>         sanitize_dominator_hotness.
>>
>> Index: cfgrtl.c
>> ===================================================================
>> --- cfgrtl.c (revision 201281)
>> +++ cfgrtl.c (working copy)
>> @@ -1341,6 +1341,34 @@ fixup_partition_crossing (edge e)
>>      }
>>  }
>>
>> +/* Called when block BB has been reassigned to a different partition,
>> +   to ensure that the region crossing attributes are updated.  */
>> +
>> +static void
>> +fixup_bb_partition (basic_block bb)
>> +{
>> +  edge e;
>> +  edge_iterator ei;
>> +
>> +  /* Now need to make bb's pred edges non-region crossing.  */
>> +  FOR_EACH_EDGE (e, ei, bb->preds)
>> +    {
>> +      fixup_partition_crossing (e);
>> +    }
>> +
>> +  /* Possibly need to make bb's successor edges region crossing,
>> +     or remove stale region crossing.  */
>> +  FOR_EACH_EDGE (e, ei, bb->succs)
>> +    {
>> +      if ((e->flags & EDGE_FALLTHRU)
>> +          && BB_PARTITION (bb) != BB_PARTITION (e->dest)
>> +          && e->dest != EXIT_BLOCK_PTR)
>> +        force_nonfallthru (e);
>> +      else
>> +        fixup_partition_crossing (e);
>> +    }
>> +}
>> +
>>  /* Attempt to change code to redirect edge E to TARGET.  Don't do that on
>>     expense of adding new instructions or reordering basic blocks.
>>
>> @@ -1979,6 +2007,14 @@ commit_edge_insertions (void)
>>  {
>>    basic_block bb;
>>
>> +  /* Optimization passes that invoke this routine can cause hot blocks
>> +     previously reached by both hot and cold blocks to become dominated only
>> +     by cold blocks. This will cause the verification below to fail,
>> +     and lead to now cold code in the hot section. In some cases this
>> +     may only be visible after newly unreachable blocks are deleted,
>> +     which will be done by fixup_partitions.  */
>> +  fixup_partitions ();
>> +
>>  #ifdef ENABLE_CHECKING
>>    verify_flow_info ();
>>  #endif
>> @@ -2173,6 +2209,101 @@ get_last_bb_insn (basic_block bb)
>>    return end;
>>  }
>>
>> +/* Sanity check partition hotness to ensure that basic blocks in
>> +   the cold partition don't dominate basic blocks in the hot partition.
>> +   If FLAG_ONLY is true, report violations as errors. Otherwise
>> +   re-mark the dominated blocks as cold, since this is run after
>> +   cfg optimizations that may make hot blocks previously reached
>> +   by both hot and cold blocks now only reachable along cold paths.  */
>> +
>> +vec<basic_block>
>> +find_partition_fixes (bool flag_only)
>> +{
>> +  basic_block bb;
>> +  vec<basic_block> bbs_in_cold_partition = vNULL;
>> +  vec<basic_block> bbs_to_fix = vNULL;
>> +
>> +  if (!crtl->has_bb_partition)
>> +    return vNULL;
>
> I'd push this early return into the callers instead, at most turn it into a
> gcc_checking_assert to be safe.
>
> Both callers, fixup_partitions and rtl_verify_edges, look at
> ctrl->has_bb_partition already before calling this, so the above should
> be dead already.

Right, I was being paranoid - changed to a gcc_checking_assert.

>
> Did my mailer somehow swallow the static from find_partition_fixes?

Oops, I missed that - added the static.

>
>> +
>> +  FOR_EACH_BB (bb)
>> +    if ((BB_PARTITION (bb) == BB_COLD_PARTITION))
>> +      bbs_in_cold_partition.safe_push (bb);
>> +
>> +  if (bbs_in_cold_partition.is_empty ())
>> +    return vNULL;
>> +
>> +  bool dom_calculated_here = !dom_info_available_p (CDI_DOMINATORS);
>> +
>> +  if (dom_calculated_here)
>> +    calculate_dominance_info (CDI_DOMINATORS);
>> +
>> +  while (! bbs_in_cold_partition.is_empty  ())
>> +    {
>> +      bb = bbs_in_cold_partition.pop ();
>> +      /* Any blocks dominated by a block in the cold section
>> +         must also be cold.  */
>> +      basic_block son;
>> +      for (son = first_dom_son (CDI_DOMINATORS, bb);
>> +           son;
>> +           son = next_dom_son (CDI_DOMINATORS, son))
>> +        {
>> +          /* If son is not yet cold, then mark it cold here and
>> +             enqueue it for further processing.  */
>> +          if ((BB_PARTITION (son) != BB_COLD_PARTITION))
>> +            {
>> +              if (flag_only)
>> +                error ("non-cold basic block %d dominated "
>> +                       "by a block in the cold partition", son->index);
>> +              else
>> +                BB_SET_PARTITION (son, BB_COLD_PARTITION);
>> +              bbs_to_fix.safe_push (son);
>> +              bbs_in_cold_partition.safe_push (son);
>> +            }
>> +        }
>> +    }
>> +
>> +  if (dom_calculated_here)
>> +    free_dominance_info (CDI_DOMINATORS);
>> +
>> +  return bbs_to_fix;
>> +}
>> +
>> +/* Perform cleanup on the hot/cold bb partitioning after optimization
>> +   passes that modify the cfg.  */
>> +
>> +void
>> +fixup_partitions (void)
>> +{
>> +  basic_block bb;
>> +
>> +  if (!crtl->has_bb_partition)
>> +    return;
>> +
>> +  /* Delete any blocks that became unreachable and weren't
>> +     already cleaned up, for example during edge forwarding
>> +     and convert_jumps_to_returns. This will expose more
>> +     opportunities for fixing the partition boundaries here.
>> +     Also, the calculation of the dominance graph during verification
>> +     will assert if there are unreachable nodes.  */
>> +  delete_unreachable_blocks ();
>> +
>> +  /* If there are partitions, do a sanity check on them: A basic block in
>> +     a cold partition cannot dominate a basic block in a hot partition.
>> +     Fixup any that now violate this requirement, as a result of edge
>> +     forwarding and unreachable block deletion.  */
>> +  vec<basic_block> bbs_to_fix = find_partition_fixes (false);
>> +
>> +  /* Do the partition fixup after all necessary blocks have been converted to
>> +     cold, so that we only update the region crossings the minimum number of
>> +     places, which can require forcing edges to be non fallthru.  */
>> +  while (! bbs_to_fix.is_empty ())
>> +    {
>> +      bb = bbs_to_fix.pop ();
>> +      fixup_bb_partition (bb);
>> +    }
>> +}
>> +
>>  /* Verify, in the basic block chain, that there is at most one switch
>>     between hot/cold partitions. This condition will not be true until
>>     after reorder_basic_blocks is called.  */
>> @@ -2219,7 +2350,8 @@ verify_hot_cold_block_grouping (void)
>>  /* Perform several checks on the edges out of each block, such as
>>     the consistency of the branch probabilities, the correctness
>>     of hot/cold partition crossing edges, and the number of expected
>> -   successor edges.  */
>> +   successor edges.  Also verify that the dominance relationship
>> +   between hot/cold blocks is sane.  */
>>
>>  static int
>>  rtl_verify_edges (void)
>> @@ -2382,6 +2514,14 @@ rtl_verify_edges (void)
>>   }
>>      }
>>
>> +  /* If there are partitions, do a sanity check on them: A basic block in
>> +     a cold partition cannot dominate a basic block in a hot partition.  */
>> +  if (crtl->has_bb_partition && !err)
>> +    {
>> +      vec<basic_block> bbs_to_fix = find_partition_fixes (true);
>> +      err = !bbs_to_fix.is_empty ();
>> +    }
>> +
>>    /* Clean up.  */
>>    return err;
>>  }
>> @@ -2515,7 +2655,7 @@ rtl_verify_bb_pointers (void)
>>       and NOTE_INSN_BASIC_BLOCK
>>     - verify that no fall_thru edge crosses hot/cold partition boundaries
>>     - verify that there are no pending RTL branch predictions
>> -   - verify that there is a single hot/cold partition boundary after bbro
>> +   - verify that hot blocks are not dominated by cold blocks
>>
>>     In future it can be extended check a lot of other stuff as well
>>     (reachability of basic blocks, life information, etc. etc.).  */
>> @@ -2761,7 +2901,8 @@ rtl_verify_bb_layout (void)
>>     - check that all insns are in the basic blocks
>>       (except the switch handling code, barriers and notes)
>>     - check that all returns are followed by barriers
>> -   - check that all fallthru edge points to the adjacent blocks.  */
>> +   - check that all fallthru edge points to the adjacent blocks
>> +   - verify that there is a single hot/cold partition boundary after bbro  */
>>
>>  static int
>>  rtl_verify_flow_info (void)
>> Index: basic-block.h
>> ===================================================================
>> --- basic-block.h (revision 201281)
>> +++ basic-block.h (working copy)
>> @@ -797,6 +797,7 @@ extern bool contains_no_active_insn_p (const_basic
>>  extern bool forwarder_block_p (const_basic_block);
>>  extern bool can_fallthru (basic_block, basic_block);
>>  extern void emit_barrier_after_bb (basic_block bb);
>> +extern void fixup_partitions (void);
>>
>>  /* In cfgbuild.c.  */
>>  extern void find_many_sub_basic_blocks (sbitmap);
>> Index: cfgcleanup.c
>> ===================================================================
>> --- cfgcleanup.c (revision 201281)
>> +++ cfgcleanup.c (working copy)
>> @@ -2807,10 +2807,21 @@ try_optimize_cfg (int mode)
>>        df_analyze ();
>>      }
>>
>> +  if (changed)
>> +            {
>> +              /* Edge forwarding in particular can cause hot blocks previously
>> +                 reached by both hot and cold blocks to become dominated only
>> +                 by cold blocks. This will cause the verification
>> below to fail,
>> +                 and lead to now cold code in the hot section. This is not easy
>> +                 to detect and fix during edge forwarding, and in some cases
>> +                 is only visible after newly unreachable blocks are deleted,
>> +                 which will be done in fixup_partitions.  */
>> +              fixup_partitions ();
>> +
>>  #ifdef ENABLE_CHECKING
>> -  if (changed)
>> -    verify_flow_info ();
>> +              verify_flow_info ();
>>  #endif
>> +            }
>>
>>    changed_overall |= changed;
>>    first_pass = false;
>> Index: bb-reorder.c
>> ===================================================================
>> --- bb-reorder.c (revision 201281)
>> +++ bb-reorder.c (working copy)
>> @@ -1444,6 +1444,55 @@ fix_up_crossing_landing_pad (eh_landing_pad old_lp
>>        ei_next (&ei);
>>  }
>>
>> +
>> +/* Ensure that no cold bbs dominate hot bbs along the dominance or
>> +   post-dominance DIR, for example as a result of edge weight insanities.
>> +   Returns the updated value of COLD_BB_COUNT and adds newly-hot bbs
>> +   to BBS_IN_HOT_PARTITION.  */
>> +
>> +static unsigned int
>> +sanitize_dominator_hotness (enum cdi_direction dir, unsigned int cold_bb_count,
>> +                            vec<basic_block> *bbs_in_hot_partition)
>> +{
>> +  if (!cold_bb_count)
>> +    return 0;
>
> Same pattern as above. Callers do not invoke us if !cold_bb_count so the above
> check is dead code. Again, remove or checking assert?

Changed to checking assert.

>> +
>> +  bool dom_calculated_here = !dom_info_available_p (dir);
>> +
>> +  if (dom_calculated_here)
>> +    calculate_dominance_info (dir);
>> +
>> +  /* Keep examining hot bbs until we have either checked them all, or
>> +     re-marked all cold bbs as hot.  */
>> +  vec<basic_block> hot_bbs_to_check = bbs_in_hot_partition->copy ();
>> +  while (! hot_bbs_to_check.is_empty ()
>> +         && cold_bb_count)
>
> The comment says "or", which sounds plausible, but the code says "and"?

The comment is describing the conditions for exiting the loop, which
is why it was an "or". I changed the comment to describe the
conditions for staying in the loop to make it more consistent/clearer:

  /* Keep examining hot bbs while we still have some left to check
     and there are remaining cold bbs.  */

>> +    {
>> +      basic_block bb = hot_bbs_to_check.pop ();
>> +      basic_block dom_bb = get_immediate_dominator (dir, bb);
>> +
>> +      /* If bb's immediate dominator is also hot then it is ok.  */
>> +      if (BB_PARTITION (dom_bb) != BB_COLD_PARTITION)
>
> Why not follow the comment here and == BB_HOT_PARTITION instead, for clarity?

The entry/exit blocks are unpartitioned, and we want to skip those. I
have clarified that in the comment:

      /* If bb's immediate dominator is also hot (or unpartitioned,
         e.g. the entry block) then it is ok. If it is cold, it
         needs to be adjusted.  */

>
>> +        continue;
>> +
>> +      /* We have a hot bb with an immediate dominator that is cold.
>> +         The dominator needs to be re-marked hot.  */
>> +      BB_SET_PARTITION (dom_bb, BB_HOT_PARTITION);
>> +      cold_bb_count--;
>> +
>> +      /* Now we need to examine newly-hot dom_bb to see if it is also
>> +         dominated by a cold bb.  */
>> +      bbs_in_hot_partition->safe_push (dom_bb);
>> +      hot_bbs_to_check.safe_push (dom_bb);
>> +    }
>> +
>> +  if (dom_calculated_here)
>> +    free_dominance_info (dir);
>> +
>> +  return cold_bb_count;
>> +}
>> +
>> +
>>  /* Find the basic blocks that are rarely executed and need to be moved to
>>     a separate section of the .o file (to cut down on paging and improve
>>     cache locality).  Return a vector of all edges that cross.  */
>> @@ -1455,16 +1504,42 @@ find_rarely_executed_basic_blocks_and_crossing_edg
>>    basic_block bb;
>>    edge e;
>>    edge_iterator ei;
>> +  unsigned int cold_bb_count = 0;
>> +  vec<basic_block> bbs_in_hot_partition = vNULL;
>>
>>    /* Mark which partition (hot/cold) each basic block belongs in.  */
>>    FOR_EACH_BB (bb)
>>      {
>>        if (probably_never_executed_bb_p (cfun, bb))
>> - BB_SET_PARTITION (bb, BB_COLD_PARTITION);
>> +        {
>> +          BB_SET_PARTITION (bb, BB_COLD_PARTITION);
>> +          cold_bb_count++;
>> +        }
>>        else
>> - BB_SET_PARTITION (bb, BB_HOT_PARTITION);
>> +        {
>> +          BB_SET_PARTITION (bb, BB_HOT_PARTITION);
>> +          bbs_in_hot_partition.safe_push (bb);
>> +        }
>>      }
>>
>> +  /* Ensure that no cold bbs dominate hot bbs. This could happen as a result of
>> +     several different possibilities. One is that there are edge
>> weight insanities
>> +     due to optimization phases that do not properly update basic block profile
>> +     counts. The second is that the entry of the function may not be
>> hot, because
>> +     it is entered fewer times than the number of profile training
>> runs, but there
>> +     is a loop inside the function that causes blocks within the function to be
>> +     above the threshold for hotness. Then do the same along the post-dominator
>> +     tree (which could have additional changes required after fixing up
>> +     dominators).  */
>> +  if (cold_bb_count)
>> +    cold_bb_count = sanitize_dominator_hotness (CDI_DOMINATORS,
>> +                                                cold_bb_count,
>> +                                                &bbs_in_hot_partition);
>> +  if (cold_bb_count)
>> +    cold_bb_count = sanitize_dominator_hotness (CDI_POST_DOMINATORS,
>> +                                                cold_bb_count,
>> +                                                &bbs_in_hot_partition);
>
> I take it this last store to cold_bb_count is eliminated anyway.

Yep, and I went ahead and removed the dead store.

Thanks for the review. New patch below.

Teresa


2013-08-01  Teresa Johnson  <tejohnson@google.com>
            Steven Bosscher  <steven@gcc.gnu.org>

        * cfgrtl.c (fixup_bb_partition): New routine.
        (commit_edge_insertions): Invoke fixup_partitions.
        (find_partition_fixes): New routine.
        (fixup_partitions): Ditto.
        (verify_hot_cold_block_grouping): Update comments.
        (rtl_verify_edges): Invoke find_partition_fixes.
        (rtl_verify_bb_pointers): Update comments.
        (rtl_verify_bb_layout): Ditto.
        * basic-block.h (fixup_partitions): Declare.
        * cfgcleanup.c (try_optimize_cfg): Invoke fixup_partitions.
        * bb-reorder.c (sanitize_dominator_hotness): New function.
        (find_rarely_executed_basic_blocks_and_crossing_edges): Invoke
        sanitize_dominator_hotness.

   /* The format of .gcc_except_table does not allow landing pads to
      be in a different partition as the throw.  Fix this by either
      moving or duplicating the landing pads.  */

Comments

Jan Hubicka Aug. 2, 2013, 3:05 p.m. UTC | #1
> 
> 2013-08-01  Teresa Johnson  <tejohnson@google.com>
>             Steven Bosscher  <steven@gcc.gnu.org>
> 
>         * cfgrtl.c (fixup_bb_partition): New routine.
>         (commit_edge_insertions): Invoke fixup_partitions.
>         (find_partition_fixes): New routine.
>         (fixup_partitions): Ditto.
>         (verify_hot_cold_block_grouping): Update comments.
>         (rtl_verify_edges): Invoke find_partition_fixes.
>         (rtl_verify_bb_pointers): Update comments.
>         (rtl_verify_bb_layout): Ditto.
>         * basic-block.h (fixup_partitions): Declare.
>         * cfgcleanup.c (try_optimize_cfg): Invoke fixup_partitions.
>         * bb-reorder.c (sanitize_dominator_hotness): New function.
>         (find_rarely_executed_basic_blocks_and_crossing_edges): Invoke
>         sanitize_dominator_hotness.
> 
> Index: cfgrtl.c
> ===================================================================
> --- cfgrtl.c    (revision 201281)
> +++ cfgrtl.c    (working copy)
> @@ -1341,6 +1341,34 @@ fixup_partition_crossing (edge e)
>      }
>  }
> 
> +/* Called when block BB has been reassigned to a different partition,
> +   to ensure that the region crossing attributes are updated.  */
> +
> +static void
> +fixup_bb_partition (basic_block bb)
> +{
> +  edge e;
> +  edge_iterator ei;
> +
> +  /* Now need to make bb's pred edges non-region crossing.  */
> +  FOR_EACH_EDGE (e, ei, bb->preds)
> +    {
> +      fixup_partition_crossing (e);
> +    }
> +
> +  /* Possibly need to make bb's successor edges region crossing,
> +     or remove stale region crossing.  */
> +  FOR_EACH_EDGE (e, ei, bb->succs)
> +    {
> +      if ((e->flags & EDGE_FALLTHRU)
> +          && BB_PARTITION (bb) != BB_PARTITION (e->dest)
> +          && e->dest != EXIT_BLOCK_PTR)
> +        force_nonfallthru (e);
> +      else
> +        fixup_partition_crossing (e);
> +    }
> +}

Is there particular reason why preds can not be fallhtrus and why
force_nonfallthru edge does not need partition crossing fixup?
(if so, perhpas it could be mentioned in the description, if not,
I think force_nonfallthru path has to check if new BB was introduced
and do the right thing on the edge.

> +/* Sanity check partition hotness to ensure that basic blocks in
> +   the cold partition don't dominate basic blocks in the hot partition.
> +   If FLAG_ONLY is true, report violations as errors. Otherwise
> +   re-mark the dominated blocks as cold, since this is run after
> +   cfg optimizations that may make hot blocks previously reached
> +   by both hot and cold blocks now only reachable along cold paths.  */

With profile, I suppose we can have cold blocks dominating hot blocks when the
hot blocks is in loop whose trip count is high enough.  Indeed for partitioning
reasons it does not make sense to push those into different section.

I also wonder, if we finally get the pass stable, can we enable it by default
and offline probably cold blocks w/o profile? Primarily blocks reachable only
by EH + blocks leading to a crash or throw().  For C++ those should be common
enough to make a difference...

Honza
Steven Bosscher Aug. 2, 2013, 11:04 p.m. UTC | #2
On Fri, Aug 2, 2013 at 5:05 PM, Jan Hubicka <hubicka@ucw.cz> wrote:
>> +/* Called when block BB has been reassigned to a different partition,
>> +   to ensure that the region crossing attributes are updated.  */
>> +
>> +static void
>> +fixup_bb_partition (basic_block bb)
>> +{
>> +  edge e;
>> +  edge_iterator ei;
>> +
>> +  /* Now need to make bb's pred edges non-region crossing.  */
>> +  FOR_EACH_EDGE (e, ei, bb->preds)
>> +    {
>> +      fixup_partition_crossing (e);
>> +    }
>> +
>> +  /* Possibly need to make bb's successor edges region crossing,
>> +     or remove stale region crossing.  */
>> +  FOR_EACH_EDGE (e, ei, bb->succs)
>> +    {
>> +      if ((e->flags & EDGE_FALLTHRU)
>> +          && BB_PARTITION (bb) != BB_PARTITION (e->dest)
>> +          && e->dest != EXIT_BLOCK_PTR)
>> +        force_nonfallthru (e);
>> +      else
>> +        fixup_partition_crossing (e);
>> +    }
>> +}
>
> Is there particular reason why preds can not be fallhtrus

Yes, by definition a crossing edge cannot fall through. There is
always a control transfer from one section to another.


>> +/* Sanity check partition hotness to ensure that basic blocks in
>> +   the cold partition don't dominate basic blocks in the hot partition.
>> +   If FLAG_ONLY is true, report violations as errors. Otherwise
>> +   re-mark the dominated blocks as cold, since this is run after
>> +   cfg optimizations that may make hot blocks previously reached
>> +   by both hot and cold blocks now only reachable along cold paths.  */
>
> With profile, I suppose we can have cold blocks dominating hot blocks when the
> hot blocks is in loop whose trip count is high enough.

That is the common case, actually.

>  Indeed for partitioning
> reasons it does not make sense to push those into different section.

The partitioning algrorithm makes sure this doesn't happen. The
hottest path from the entry block to a hot basic block is always part
of the hot partition.


> I also wonder, if we finally get the pass stable, can we enable it by default
> and offline probably cold blocks w/o profile?

That is the general idea behind all this work, obviously ;-)

> Primarily blocks reachable only
> by EH + blocks leading to a crash or throw().  For C++ those should be common
> enough to make a difference...

Yup, and IIRC Theresa posted some numbers that showed this.

Ciao!
Steven
Teresa Johnson Aug. 3, 2013, 4:48 a.m. UTC | #3
On Fri, Aug 2, 2013 at 8:05 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
>>
>> 2013-08-01  Teresa Johnson  <tejohnson@google.com>
>>             Steven Bosscher  <steven@gcc.gnu.org>
>>
>>         * cfgrtl.c (fixup_bb_partition): New routine.
>>         (commit_edge_insertions): Invoke fixup_partitions.
>>         (find_partition_fixes): New routine.
>>         (fixup_partitions): Ditto.
>>         (verify_hot_cold_block_grouping): Update comments.
>>         (rtl_verify_edges): Invoke find_partition_fixes.
>>         (rtl_verify_bb_pointers): Update comments.
>>         (rtl_verify_bb_layout): Ditto.
>>         * basic-block.h (fixup_partitions): Declare.
>>         * cfgcleanup.c (try_optimize_cfg): Invoke fixup_partitions.
>>         * bb-reorder.c (sanitize_dominator_hotness): New function.
>>         (find_rarely_executed_basic_blocks_and_crossing_edges): Invoke
>>         sanitize_dominator_hotness.
>>
>> Index: cfgrtl.c
>> ===================================================================
>> --- cfgrtl.c    (revision 201281)
>> +++ cfgrtl.c    (working copy)
>> @@ -1341,6 +1341,34 @@ fixup_partition_crossing (edge e)
>>      }
>>  }
>>
>> +/* Called when block BB has been reassigned to a different partition,
>> +   to ensure that the region crossing attributes are updated.  */
>> +
>> +static void
>> +fixup_bb_partition (basic_block bb)
>> +{
>> +  edge e;
>> +  edge_iterator ei;
>> +
>> +  /* Now need to make bb's pred edges non-region crossing.  */
>> +  FOR_EACH_EDGE (e, ei, bb->preds)
>> +    {
>> +      fixup_partition_crossing (e);
>> +    }
>> +
>> +  /* Possibly need to make bb's successor edges region crossing,
>> +     or remove stale region crossing.  */
>> +  FOR_EACH_EDGE (e, ei, bb->succs)
>> +    {
>> +      if ((e->flags & EDGE_FALLTHRU)
>> +          && BB_PARTITION (bb) != BB_PARTITION (e->dest)
>> +          && e->dest != EXIT_BLOCK_PTR)
>> +        force_nonfallthru (e);
>> +      else
>> +        fixup_partition_crossing (e);
>> +    }
>> +}
>
> Is there particular reason why preds can not be fallhtrus and why
> force_nonfallthru edge does not need partition crossing fixup?
> (if so, perhpas it could be mentioned in the description, if not,
> I think force_nonfallthru path has to check if new BB was introduced
> and do the right thing on the edge.

I need to clarify the comments in this routine, because without the
context of how this is called it isn't clear. This routine is only
called when we detect a hot bb that is now dominated by a cold bb and
needs to become cold. Therefore, its preds will no longer be region
crossing (any non-dominating blocks that were previously hot would
have been marked cold in the caller for the same reason, so we will
not end up adjusting the region crossing-ness or fallthrough-ness of
those pred edges). Any that were region crossing before but aren't any
longer could not have been fall through (as Steven noted, you can't
have a fall through across a partition boundary). I will add some
better comments here.

Regarding the call to force_nonfallthru, that routine calls
fixup_partition_crossing as needed, and I will update the comment to
reflect that too.

>
>> +/* Sanity check partition hotness to ensure that basic blocks in
>> +   the cold partition don't dominate basic blocks in the hot partition.
>> +   If FLAG_ONLY is true, report violations as errors. Otherwise
>> +   re-mark the dominated blocks as cold, since this is run after
>> +   cfg optimizations that may make hot blocks previously reached
>> +   by both hot and cold blocks now only reachable along cold paths.  */
>
> With profile, I suppose we can have cold blocks dominating hot blocks when the
> hot blocks is in loop whose trip count is high enough.  Indeed for partitioning
> reasons it does not make sense to push those into different section.
>
> I also wonder, if we finally get the pass stable, can we enable it by default
> and offline probably cold blocks w/o profile? Primarily blocks reachable only
> by EH + blocks leading to a crash or throw().  For C++ those should be common
> enough to make a difference...

Yep, as soon as PR57451 is fixed, which I hope to get to next week,
then I am going to send a patch to turn this on by default, at least
with profile feedback, which is where I've been doing performance
tuning. But you are right that there are some cases where it should be
beneficial without profile data as well.

Thanks,
Teresa

>
> Honza
Teresa Johnson Aug. 3, 2013, 4:52 a.m. UTC | #4
On Fri, Aug 2, 2013 at 4:04 PM, Steven Bosscher <stevenb.gcc@gmail.com> wrote:
> On Fri, Aug 2, 2013 at 5:05 PM, Jan Hubicka <hubicka@ucw.cz> wrote:
>>> +/* Called when block BB has been reassigned to a different partition,
>>> +   to ensure that the region crossing attributes are updated.  */
>>> +
>>> +static void
>>> +fixup_bb_partition (basic_block bb)
>>> +{
>>> +  edge e;
>>> +  edge_iterator ei;
>>> +
>>> +  /* Now need to make bb's pred edges non-region crossing.  */
>>> +  FOR_EACH_EDGE (e, ei, bb->preds)
>>> +    {
>>> +      fixup_partition_crossing (e);
>>> +    }
>>> +
>>> +  /* Possibly need to make bb's successor edges region crossing,
>>> +     or remove stale region crossing.  */
>>> +  FOR_EACH_EDGE (e, ei, bb->succs)
>>> +    {
>>> +      if ((e->flags & EDGE_FALLTHRU)
>>> +          && BB_PARTITION (bb) != BB_PARTITION (e->dest)
>>> +          && e->dest != EXIT_BLOCK_PTR)
>>> +        force_nonfallthru (e);
>>> +      else
>>> +        fixup_partition_crossing (e);
>>> +    }
>>> +}
>>
>> Is there particular reason why preds can not be fallhtrus
>
> Yes, by definition a crossing edge cannot fall through. There is
> always a control transfer from one section to another.
>
>
>>> +/* Sanity check partition hotness to ensure that basic blocks in
>>> +   the cold partition don't dominate basic blocks in the hot partition.
>>> +   If FLAG_ONLY is true, report violations as errors. Otherwise
>>> +   re-mark the dominated blocks as cold, since this is run after
>>> +   cfg optimizations that may make hot blocks previously reached
>>> +   by both hot and cold blocks now only reachable along cold paths.  */
>>
>> With profile, I suppose we can have cold blocks dominating hot blocks when the
>> hot blocks is in loop whose trip count is high enough.
>
> That is the common case, actually.
>
>>  Indeed for partitioning
>> reasons it does not make sense to push those into different section.
>
> The partitioning algrorithm makes sure this doesn't happen. The
> hottest path from the entry block to a hot basic block is always part
> of the hot partition.

Well, at least with this patch that will be true. The trunk version
just partitions based on the bb's count without regard to paths.

Thanks,
Teresa

>
>
>> I also wonder, if we finally get the pass stable, can we enable it by default
>> and offline probably cold blocks w/o profile?
>
> That is the general idea behind all this work, obviously ;-)
>
>> Primarily blocks reachable only
>> by EH + blocks leading to a crash or throw().  For C++ those should be common
>> enough to make a difference...
>
> Yup, and IIRC Theresa posted some numbers that showed this.
>
> Ciao!
> Steven
diff mbox

Patch

Index: cfgrtl.c
===================================================================
--- cfgrtl.c    (revision 201281)
+++ cfgrtl.c    (working copy)
@@ -1341,6 +1341,34 @@  fixup_partition_crossing (edge e)
     }
 }

+/* Called when block BB has been reassigned to a different partition,
+   to ensure that the region crossing attributes are updated.  */
+
+static void
+fixup_bb_partition (basic_block bb)
+{
+  edge e;
+  edge_iterator ei;
+
+  /* Now need to make bb's pred edges non-region crossing.  */
+  FOR_EACH_EDGE (e, ei, bb->preds)
+    {
+      fixup_partition_crossing (e);
+    }
+
+  /* Possibly need to make bb's successor edges region crossing,
+     or remove stale region crossing.  */
+  FOR_EACH_EDGE (e, ei, bb->succs)
+    {
+      if ((e->flags & EDGE_FALLTHRU)
+          && BB_PARTITION (bb) != BB_PARTITION (e->dest)
+          && e->dest != EXIT_BLOCK_PTR)
+        force_nonfallthru (e);
+      else
+        fixup_partition_crossing (e);
+    }
+}
+
 /* Attempt to change code to redirect edge E to TARGET.  Don't do that on
    expense of adding new instructions or reordering basic blocks.

@@ -1979,6 +2007,14 @@  commit_edge_insertions (void)
 {
   basic_block bb;

+  /* Optimization passes that invoke this routine can cause hot blocks
+     previously reached by both hot and cold blocks to become dominated only
+     by cold blocks. This will cause the verification below to fail,
+     and lead to now cold code in the hot section. In some cases this
+     may only be visible after newly unreachable blocks are deleted,
+     which will be done by fixup_partitions.  */
+  fixup_partitions ();
+
 #ifdef ENABLE_CHECKING
   verify_flow_info ();
 #endif
@@ -2173,6 +2209,101 @@  get_last_bb_insn (basic_block bb)
   return end;
 }

+/* Sanity check partition hotness to ensure that basic blocks in
+   the cold partition don't dominate basic blocks in the hot partition.
+   If FLAG_ONLY is true, report violations as errors. Otherwise
+   re-mark the dominated blocks as cold, since this is run after
+   cfg optimizations that may make hot blocks previously reached
+   by both hot and cold blocks now only reachable along cold paths.  */
+
+static vec<basic_block>
+find_partition_fixes (bool flag_only)
+{
+  basic_block bb;
+  vec<basic_block> bbs_in_cold_partition = vNULL;
+  vec<basic_block> bbs_to_fix = vNULL;
+
+  /* Callers check this.  */
+  gcc_checking_assert (crtl->has_bb_partition);
+
+  FOR_EACH_BB (bb)
+    if ((BB_PARTITION (bb) == BB_COLD_PARTITION))
+      bbs_in_cold_partition.safe_push (bb);
+
+  if (bbs_in_cold_partition.is_empty ())
+    return vNULL;
+
+  bool dom_calculated_here = !dom_info_available_p (CDI_DOMINATORS);
+
+  if (dom_calculated_here)
+    calculate_dominance_info (CDI_DOMINATORS);
+
+  while (! bbs_in_cold_partition.is_empty  ())
+    {
+      bb = bbs_in_cold_partition.pop ();
+      /* Any blocks dominated by a block in the cold section
+         must also be cold.  */
+      basic_block son;
+      for (son = first_dom_son (CDI_DOMINATORS, bb);
+           son;
+           son = next_dom_son (CDI_DOMINATORS, son))
+        {
+          /* If son is not yet cold, then mark it cold here and
+             enqueue it for further processing.  */
+          if ((BB_PARTITION (son) != BB_COLD_PARTITION))
+            {
+              if (flag_only)
+                error ("non-cold basic block %d dominated "
+                       "by a block in the cold partition", son->index);
+              else
+                BB_SET_PARTITION (son, BB_COLD_PARTITION);
+              bbs_to_fix.safe_push (son);
+              bbs_in_cold_partition.safe_push (son);
+            }
+        }
+    }
+
+  if (dom_calculated_here)
+    free_dominance_info (CDI_DOMINATORS);
+
+  return bbs_to_fix;
+}
+
+/* Perform cleanup on the hot/cold bb partitioning after optimization
+   passes that modify the cfg.  */
+
+void
+fixup_partitions (void)
+{
+  basic_block bb;
+
+  if (!crtl->has_bb_partition)
+    return;
+
+  /* Delete any blocks that became unreachable and weren't
+     already cleaned up, for example during edge forwarding
+     and convert_jumps_to_returns. This will expose more
+     opportunities for fixing the partition boundaries here.
+     Also, the calculation of the dominance graph during verification
+     will assert if there are unreachable nodes.  */
+  delete_unreachable_blocks ();
+
+  /* If there are partitions, do a sanity check on them: A basic block in
+     a cold partition cannot dominate a basic block in a hot partition.
+     Fixup any that now violate this requirement, as a result of edge
+     forwarding and unreachable block deletion.  */
+  vec<basic_block> bbs_to_fix = find_partition_fixes (false);
+
+  /* Do the partition fixup after all necessary blocks have been converted to
+     cold, so that we only update the region crossings the minimum number of
+     places, which can require forcing edges to be non fallthru.  */
+  while (! bbs_to_fix.is_empty ())
+    {
+      bb = bbs_to_fix.pop ();
+      fixup_bb_partition (bb);
+    }
+}
+
 /* Verify, in the basic block chain, that there is at most one switch
    between hot/cold partitions. This condition will not be true until
    after reorder_basic_blocks is called.  */
@@ -2219,7 +2350,8 @@  verify_hot_cold_block_grouping (void)
 /* Perform several checks on the edges out of each block, such as
    the consistency of the branch probabilities, the correctness
    of hot/cold partition crossing edges, and the number of expected
-   successor edges.  */
+   successor edges.  Also verify that the dominance relationship
+   between hot/cold blocks is sane.  */

 static int
 rtl_verify_edges (void)
@@ -2382,6 +2514,14 @@  rtl_verify_edges (void)
        }
     }

+  /* If there are partitions, do a sanity check on them: A basic block in
+     a cold partition cannot dominate a basic block in a hot partition.  */
+  if (crtl->has_bb_partition && !err)
+    {
+      vec<basic_block> bbs_to_fix = find_partition_fixes (true);
+      err = !bbs_to_fix.is_empty ();
+    }
+
   /* Clean up.  */
   return err;
 }
@@ -2515,7 +2655,7 @@  rtl_verify_bb_pointers (void)
      and NOTE_INSN_BASIC_BLOCK
    - verify that no fall_thru edge crosses hot/cold partition boundaries
    - verify that there are no pending RTL branch predictions
-   - verify that there is a single hot/cold partition boundary after bbro
+   - verify that hot blocks are not dominated by cold blocks

    In future it can be extended check a lot of other stuff as well
    (reachability of basic blocks, life information, etc. etc.).  */
@@ -2761,7 +2901,8 @@  rtl_verify_bb_layout (void)
    - check that all insns are in the basic blocks
      (except the switch handling code, barriers and notes)
    - check that all returns are followed by barriers
-   - check that all fallthru edge points to the adjacent blocks.  */
+   - check that all fallthru edge points to the adjacent blocks
+   - verify that there is a single hot/cold partition boundary after bbro  */

 static int
 rtl_verify_flow_info (void)
Index: basic-block.h
===================================================================
--- basic-block.h       (revision 201281)
+++ basic-block.h       (working copy)
@@ -797,6 +797,7 @@  extern bool contains_no_active_insn_p (const_basic
 extern bool forwarder_block_p (const_basic_block);
 extern bool can_fallthru (basic_block, basic_block);
 extern void emit_barrier_after_bb (basic_block bb);
+extern void fixup_partitions (void);

 /* In cfgbuild.c.  */
 extern void find_many_sub_basic_blocks (sbitmap);
Index: cfgcleanup.c
===================================================================
--- cfgcleanup.c        (revision 201281)
+++ cfgcleanup.c        (working copy)
@@ -2807,10 +2807,21 @@  try_optimize_cfg (int mode)
              df_analyze ();
            }

+         if (changed)
+            {
+              /* Edge forwarding in particular can cause hot blocks previously
+                 reached by both hot and cold blocks to become dominated only
+                 by cold blocks. This will cause the verification
below to fail,
+                 and lead to now cold code in the hot section. This is not easy
+                 to detect and fix during edge forwarding, and in some cases
+                 is only visible after newly unreachable blocks are deleted,
+                 which will be done in fixup_partitions.  */
+              fixup_partitions ();
+
 #ifdef ENABLE_CHECKING
-         if (changed)
-           verify_flow_info ();
+              verify_flow_info ();
 #endif
+            }

          changed_overall |= changed;
          first_pass = false;
Index: bb-reorder.c
===================================================================
--- bb-reorder.c        (revision 201281)
+++ bb-reorder.c        (working copy)
@@ -1444,6 +1444,57 @@  fix_up_crossing_landing_pad (eh_landing_pad old_lp
       ei_next (&ei);
 }

+
+/* Ensure that no cold bbs dominate hot bbs along the dominance or
+   post-dominance DIR, for example as a result of edge weight insanities.
+   Returns the updated value of COLD_BB_COUNT and adds newly-hot bbs
+   to BBS_IN_HOT_PARTITION.  */
+
+static unsigned int
+sanitize_dominator_hotness (enum cdi_direction dir, unsigned int cold_bb_count,
+                            vec<basic_block> *bbs_in_hot_partition)
+{
+  /* Callers check this.  */
+  gcc_checking_assert (cold_bb_count);
+
+  bool dom_calculated_here = !dom_info_available_p (dir);
+
+  if (dom_calculated_here)
+    calculate_dominance_info (dir);
+
+  /* Keep examining hot bbs while we still have some left to check
+     and there are remaining cold bbs.  */
+  vec<basic_block> hot_bbs_to_check = bbs_in_hot_partition->copy ();
+  while (! hot_bbs_to_check.is_empty ()
+         && cold_bb_count)
+    {
+      basic_block bb = hot_bbs_to_check.pop ();
+      basic_block dom_bb = get_immediate_dominator (dir, bb);
+
+      /* If bb's immediate dominator is also hot (or unpartitioned,
+         e.g. the entry block) then it is ok. If it is cold, it
+         needs to be adjusted.  */
+      if (BB_PARTITION (dom_bb) != BB_COLD_PARTITION)
+        continue;
+
+      /* We have a hot bb with an immediate dominator that is cold.
+         The dominator needs to be re-marked hot.  */
+      BB_SET_PARTITION (dom_bb, BB_HOT_PARTITION);
+      cold_bb_count--;
+
+      /* Now we need to examine newly-hot dom_bb to see if it is also
+         dominated by a cold bb.  */
+      bbs_in_hot_partition->safe_push (dom_bb);
+      hot_bbs_to_check.safe_push (dom_bb);
+    }
+
+  if (dom_calculated_here)
+    free_dominance_info (dir);
+
+  return cold_bb_count;
+}
+
+
 /* Find the basic blocks that are rarely executed and need to be moved to
    a separate section of the .o file (to cut down on paging and improve
    cache locality).  Return a vector of all edges that cross.  */
@@ -1455,16 +1506,42 @@  find_rarely_executed_basic_blocks_and_crossing_edg
   basic_block bb;
   edge e;
   edge_iterator ei;
+  unsigned int cold_bb_count = 0;
+  vec<basic_block> bbs_in_hot_partition = vNULL;

   /* Mark which partition (hot/cold) each basic block belongs in.  */
   FOR_EACH_BB (bb)
     {
       if (probably_never_executed_bb_p (cfun, bb))
-       BB_SET_PARTITION (bb, BB_COLD_PARTITION);
+        {
+          BB_SET_PARTITION (bb, BB_COLD_PARTITION);
+          cold_bb_count++;
+        }
       else
-       BB_SET_PARTITION (bb, BB_HOT_PARTITION);
+        {
+          BB_SET_PARTITION (bb, BB_HOT_PARTITION);
+          bbs_in_hot_partition.safe_push (bb);
+        }
     }

+  /* Ensure that no cold bbs dominate hot bbs. This could happen as a result of
+     several different possibilities. One is that there are edge
weight insanities
+     due to optimization phases that do not properly update basic block profile
+     counts. The second is that the entry of the function may not be
hot, because
+     it is entered fewer times than the number of profile training
runs, but there
+     is a loop inside the function that causes blocks within the function to be
+     above the threshold for hotness. Then do the same along the post-dominator
+     tree (which could have additional changes required after fixing up
+     dominators).  */
+  if (cold_bb_count)
+    cold_bb_count = sanitize_dominator_hotness (CDI_DOMINATORS,
+                                                cold_bb_count,
+                                                &bbs_in_hot_partition);
+  if (cold_bb_count)
+    sanitize_dominator_hotness (CDI_POST_DOMINATORS,
+                                cold_bb_count,
+                                &bbs_in_hot_partition);
+