diff mbox

Optimize df_worklist_dataflow

Message ID 20100612134039.GH3487@kam.mff.cuni.cz
State New
Headers show

Commit Message

Jan Hubicka June 12, 2010, 1:40 p.m. UTC
Hi,
according to oprofile, bitmap_ior_into is one of most commonly called functions:
46060     4.0246  lto1                     htab_find_slot_with_hash
17385     1.5191  lto1                     bitmap_ior_into
15324     1.3390  lto1                     bitmap_set_bit_1
12731     1.1124  lto1                     fast_dce.466329.constprop.608
12663     1.1065  lto1                     df_worklist_dataflow
12400     1.0835  lto1                     df_note_compute.41166.3750
11505     1.0053  lto1                     bitmap_clear_bit
11216     0.9800  lto1                     ggc_internal_alloc_stat
10745     0.9389  lto1                     htab_find_with_hash
9795      0.8559  lto1                     record_reg_classes.103257.constprop.831.3374
9529      0.8326  lto1                     bitmap_ior_and_compl
9395      0.8209  lto1                     walk_tree_1.constprop.798
9223      0.8059  lto1                     bitmap_and
9037      0.7896  lto1                     bitmap_copy
8844      0.7728  lto1                     execute_function_todo.150826.4276
8117      0.7092  lto1                     extract_insn

looking at callgrind, the most common callers are:
104,443,839  < /abuild/jh/trunk/build-new/gcc/../../gcc/df-problems.c:df_rd_confluence_n (53439x) [/abuild/jh/trunk/build-new/stage1-gcc/cc1]
223,397,697  < /abuild/jh/trunk/build-new/gcc/../../gcc/df-problems.c:df_live_confluence_n (480009x) [/abuild/jh/trunk/build-new/stage1-gcc/cc1]
314,987,157  < /abuild/jh/trunk/build-new/gcc/../../gcc/df-problems.c:df_lr_confluence_n (957632x) [/abuild/jh/trunk/build-new/stage1-gcc/cc1]
218,567,697  *  /abuild/jh/trunk/build-new/gcc/../../gcc/bitmap.c:bitmap_ior_into [/abuild/jh/trunk/build-new/stage1-gcc/cc1]

This patch change it into:
 63,254,260  < /abuild/jh/trunk/build-new/gcc/../../gcc/df-problems.c:df_rd_confluence_n (29983x) [/abuild/jh/trunk/build-new/stage1-gcc/cc1]
124,535,655  < /abuild/jh/trunk/build-new/gcc/../../gcc/df-problems.c:df_live_confluence_n (237200x) [/abuild/jh/trunk/build-new/stage1-gcc/cc1]
182,610,443  < /abuild/jh/trunk/build-new/gcc/../../gcc/df-problems.c:df_lr_confluence_n (523018x) [/abuild/jh/trunk/build-new/stage1-gcc/cc1]
123,750,894  *  /abuild/jh/trunk/build-new/gcc/../../gcc/bitmap.c:bitmap_ior_into [/abuild/jh/trunk/build-new/stage1-gcc/cc1]

It is done by making df_worklist_dataflow to track what confluences need to
computed and also to avoid computation when nothing changed (since bitmap
operations readilly give us return value if something has changed or not).
I also noticed that df_worklist_dataflow can be written using bitmap
iterator that is a lot faster than bitmap_clear_bit and find_bit operations.

I am tracking age of basic block.  One age is last time when BB info has
changed and other age is last time it was re-scanned. When scanning we need to
compute confluences only of those basic blocks that changed since last
checking.

Everythign in the main dataflow loop seems performance critical, so one age
is stored in vector and the last change age (that is accessed more times)
in the AUX field of BB structure.

the oprofile now change into:

49333     3.9682  htab_find_slot_with_hash
20376     1.6390  df_worklist_dataflow
17643     1.4192  df_note_compute.41314.3750
17159     1.3802  bitmap_set_bit_1
15097     1.2144  fast_dce.467556.constprop.620
12753     1.0258  bitmap_ior_into
12063     0.9703  ggc_internal_alloc_stat
11887     0.9562  htab_find_with_hash
11782     0.9477  record_reg_classes.103545.constprop.841.3374
10908     0.8774  bitmap_clear_bit
10108     0.8131  bitmap_copy
9918      0.7978  execute_function_todo.151805.4276
9907      0.7969  walk_tree_1.constprop.804
9700      0.7802  extract_insn

Note the increase of df_worklist_dataflow.  I lack much of logical
explanation for that: it is not related to iterator change (without it
situation is even worse) and I don't think the extra array lookup is that
expensive.  So I guess we just collecting cache misses that was orginally
attributed to the confluence functions.

The profile is as follows:

               :/* Helper function for df_worklist_dataflow.
               :   Propagate the dataflow forward.
               :   Given a BB_INDEX, do the dataflow propagation
               :   and set bits on for successors in PENDING
               :   if the out set of the dataflow has changed. */
               :
               :static bool
               :df_worklist_propagate_forward (struct dataflow *dataflow,
               :                               unsigned bb_index,
               :                               unsigned *bbindex_to_postorder,
               :                               bitmap pending,
               :                               sbitmap considered,
               :			       size_t age)
               :{
               :  edge e;
               :  edge_iterator ei;
               :  basic_block bb = BASIC_BLOCK (bb_index);
    69  0.0056 :  bool changed = !age;
               :
               :  /*  Calculate <conf_op> of incoming edges.  */
   232  0.0187 :  if (EDGE_COUNT (bb->preds) > 0)
   127  0.0102 :    FOR_EACH_EDGE (e, ei, bb->preds)
               :      {
   187  0.0150 :        if ((age <= (size_t)e->src->aux)
   329  0.0265 :	     && TEST_BIT (considered, e->src->index))
  1035  0.0833 :          changed |= dataflow->problem->con_fun_n (e);
               :      }
    12 9.7e-04 :  else if (dataflow->problem->con_fun_0)
               :    {
     2 1.6e-04 :      dataflow->problem->con_fun_0 (bb);
               :      changed = true;
               :    }
               :
   421  0.0339 :  if (changed
   111  0.0089 :      && dataflow->problem->trans_fun (bb_index))
               :    {
               :      /* The out set of this block has changed.
               :         Propagate to the outgoing blocks.  */
   114  0.0092 :      FOR_EACH_EDGE (e, ei, bb->succs)
               :        {
   355  0.0286 :          unsigned ob_index = e->dest->index;
               :
   261  0.0210 :          if (TEST_BIT (considered, ob_index))
               :            bitmap_set_bit (pending, bbindex_to_postorder[ob_index]);
               :        }
               :      return true;
               :    }
               :  return false;
               :}
               :
               :
               :/* Helper function for df_worklist_dataflow.
               :   Propagate the dataflow backward.  */
               :
               :static bool
               :df_worklist_propagate_backward (struct dataflow *dataflow,
               :                                unsigned bb_index,
               :                                unsigned *bbindex_to_postorder,
               :                                bitmap pending,
               :                                sbitmap considered,
               :			        size_t age)
               :{
               :  edge e;
               :  edge_iterator ei;
               :  basic_block bb = BASIC_BLOCK (bb_index);
    41  0.0033 :  bool changed = !age;
               :
               :  /*  Calculate <conf_op> of incoming edges.  */
   240  0.0193 :  if (EDGE_COUNT (bb->succs) > 0)
   229  0.0184 :    FOR_EACH_EDGE (e, ei, bb->succs)
               :      {
   147  0.0118 :        if ((age <= (size_t)e->dest->aux)
   417  0.0335 :	     && TEST_BIT (considered, e->dest->index))
  1030  0.0829 :          changed |= dataflow->problem->con_fun_n (e);
               :      }
    21  0.0017 :  else if (dataflow->problem->con_fun_0)
               :    {
               :      dataflow->problem->con_fun_0 (bb);
               :      changed = true;
               :    }
               :
    16  0.0013 :  if (changed
    86  0.0069 :      && dataflow->problem->trans_fun (bb_index))
               :    {
               :      /* The out set of this block has changed.
               :         Propagate to the outgoing blocks.  */
    31  0.0025 :      FOR_EACH_EDGE (e, ei, bb->preds)
               :        {
   719  0.0578 :          unsigned ob_index = e->src->index;
               :
   133  0.0107 :          if (TEST_BIT (considered, ob_index))
    29  0.0023 :            bitmap_set_bit (pending, bbindex_to_postorder[ob_index]);
               :        }
               :      return true;
               :    }
               :  return false;
               :}
               :
   739  0.0594 :DEF_VEC_I(size_t);
     4 3.2e-04 :DEF_VEC_ALLOC_I(size_t,heap);
               :
               :/* This will free "pending". */
               :
               :static void
               :df_worklist_dataflow_doublequeue (struct dataflow *dataflow,
               :			  	  bitmap pending,
               :                                  sbitmap considered,
               :                                  int *blocks_in_postorder,
               :				  unsigned *bbindex_to_postorder,
               :				  int n_blocks)
               :{
               :  enum df_flow_dir dir = dataflow->problem->dir;
     1 8.0e-05 :  int dcount = 0;
     1 8.0e-05 :  bitmap worklist = BITMAP_ALLOC (&df_bitmap_obstack);
     1 8.0e-05 :  size_t age = 0;
               :  bool changed;
               :  VEC(size_t, heap) *last_age = NULL;
               :  size_t prev_age;
               :  basic_block bb;
               :  int i;
               :
               :  VEC_safe_grow_cleared (size_t, heap, last_age, n_blocks);
               :
               :  /* Double-queueing. Worklist is for the current iteration,
               :     and pending is for the next. */
    12 9.7e-04 :  while (!bitmap_empty_p (pending))
               :    {
               :      bitmap_iterator bi;
               :      unsigned int index;
               :
               :      /* Swap pending and worklist. */
               :      bitmap temp = worklist;
               :      worklist = pending;
               :      pending = temp;
               :
               :      EXECUTE_IF_SET_IN_BITMAP (worklist, 0, index, bi)
               :	{
               :	  unsigned bb_index;
   338  0.0272 :	  dcount++;
               :
   957  0.0770 :	  bb_index = blocks_in_postorder[index];
     3 2.4e-04 :	  bb = BASIC_BLOCK (bb_index);
     4 3.2e-04 :	  prev_age = VEC_index (size_t, last_age, index);
   116  0.0093 :	  if (dir == DF_FORWARD)
               :	    changed = df_worklist_propagate_forward (dataflow, bb_index,
               :						     bbindex_to_postorder,
               :						     pending, considered,
               :						     prev_age);
               :	  else
               :	    changed = df_worklist_propagate_backward (dataflow, bb_index,
               :						      bbindex_to_postorder,
               :						      pending, considered,
               :						      prev_age);
               :	  age++;
               :	  if (changed)
   687  0.0553 :	    bb->aux = (void *)age;
               :	  VEC_replace (size_t, last_age, index, age);
               :	  age++;
               :	}
    10 8.0e-04 :      bitmap_clear (worklist);
               :    }
   188  0.0151 :  for (i = 0; i < n_blocks; i++)
   168  0.0135 :    BASIC_BLOCK (blocks_in_postorder[i])->aux = NULL;
               :
               :  BITMAP_FREE (worklist);
               :  BITMAP_FREE (pending);
               :  VEC_free (size_t, heap, last_age);
               :
               :  /* Dump statistics. */
     4 3.2e-04 :  if (dump_file)
               :    fprintf (dump_file, "df_worklist_dataflow_doublequeue:"
               :	     "n_basic_blocks %d n_edges %d"
               :	     " count %d (%5.2g)\n",
               :	     n_basic_blocks, n_edges,
               :	     dcount, dcount / (float)n_basic_blocks);
               :}

Main change is in dispatch to con_fun_n

               :static void
               :df_worklist_propagate_forward (struct dataflow *dataflow,
               :                               unsigned bb_index,
               :                               unsigned *bbindex_to_postorder,
               :                               bitmap pending,
               :                               sbitmap considered)
               :{
               :  edge e;
               :  edge_iterator ei;
    91  0.0069 :  basic_block bb = BASIC_BLOCK (bb_index);
               :
               :  /*  Calculate <conf_op> of incoming edges.  */
   179  0.0136 :  if (EDGE_COUNT (bb->preds) > 0)
               :    FOR_EACH_EDGE (e, ei, bb->preds)
               :      {
   230  0.0175 :        if (TEST_BIT (considered, e->src->index))
   489  0.0372 :          dataflow->problem->con_fun_n (e);
               :      }
     6 4.6e-04 :  else if (dataflow->problem->con_fun_0)
   319  0.0243 :    dataflow->problem->con_fun_0 (bb);
               :
   190  0.0145 :  if (dataflow->problem->trans_fun (bb_index))
               :    {
               :      /* The out set of this block has changed.
               :         Propagate to the outgoing blocks.  */
   170  0.0129 :      FOR_EACH_EDGE (e, ei, bb->succs)
               :        {
   516  0.0392 :          unsigned ob_index = e->dest->index;
               :
    69  0.0052 :          if (TEST_BIT (considered, ob_index))
    38  0.0029 :            bitmap_set_bit (pending, bbindex_to_postorder[ob_index]);
               :        }
               :    }
               :}

We probably should consider using macros (or tempaltes) to avoid indirect calls
here and just spcialize the dataflow function for specific problems.

Overall compile time is improved, from 10m28s to 10m5s. 

Bootstrapped/regtested x86_64-linux, I am running the df checking bootstrap now.
Seems to make sense?

Honza

	* df-problems.c (df_rd_confluence_n, df_lr_confluence_n, df_live_confluence_n,
	df_byte_lr_confluence_n, df_md_confluence_n): Return true if something changed.
	* df.h (df_confluence_function_n): Return bool.
	* df-core.c (df_worklist_propagate_forward, df_worklist_propagate_backward):
	track changes and ages.
	(df_worklist_dataflow_doublequeue): Use bitmap iterator for main walk;
	track ages.
	* dse.c (dse_confluence_n): Return always true.

Comments

Steven Bosscher June 12, 2010, 1:44 p.m. UTC | #1
On Sat, Jun 12, 2010 at 3:40 PM, Jan Hubicka <hubicka@ucw.cz> wrote:
> Index: df.h
> ===================================================================
> --- df.h        (revision 160661)
> +++ df.h        (working copy)
> @@ -223,7 +223,7 @@ typedef void (*df_dataflow_function) (st
>  typedef void (*df_confluence_function_0) (basic_block);
>
>  /* Confluence operator for blocks with 1 or more out (or in) edges.  */
> -typedef void (*df_confluence_function_n) (edge);
> +typedef bool (*df_confluence_function_n) (edge);

Can you document what the return value should be, please?

Ciao!
Steven
Jan Hubicka June 12, 2010, 2:32 p.m. UTC | #2
> On Sat, Jun 12, 2010 at 3:40 PM, Jan Hubicka <hubicka@ucw.cz> wrote:
> > Index: df.h
> > ===================================================================
> > --- df.h        (revision 160661)
> > +++ df.h        (working copy)
> > @@ -223,7 +223,7 @@ typedef void (*df_dataflow_function) (st
> >  typedef void (*df_confluence_function_0) (basic_block);
> >
> >  /* Confluence operator for blocks with 1 or more out (or in) edges.  */
> > -typedef void (*df_confluence_function_n) (edge);
> > +typedef bool (*df_confluence_function_n) (edge);
> 
> Can you document what the return value should be, please?

Sure, it is the same as for df_transfer_function where it is also undocumented.
I updated both in my local copy.

Honza
> 
> Ciao!
> Steven
Paolo Bonzini June 12, 2010, 5:18 p.m. UTC | #3
> I am tracking age of basic block.  One age is last time when BB info has
> changed and other age is last time it was re-scanned. When scanning we need to
> compute confluences only of those basic blocks that changed since last
> checking.

Steven is the one who experimented with all the dataflow solvers so I'll 
gladly defer this review to him.

Paolo
Jan Hubicka June 12, 2010, 5:44 p.m. UTC | #4
>> I am tracking age of basic block.  One age is last time when BB info has
>> changed and other age is last time it was re-scanned. When scanning we need to
>> compute confluences only of those basic blocks that changed since last
>> checking.
>
> Steven is the one who experimented with all the dataflow solvers so I'll  
> gladly defer this review to him.
OK :)
Note that today profile has changed (I believe due to Eric's patch).
The magical slowness of fast_dce I intended to analyze is gone.  I guess problem
was/is that we do not fit the coniditonals for removing pure/const calls on
SSA,du and fast DCEs that might result in scenrario where fast_dce executed
after reload needs many iterations since it is first instance removing some
pure calls.  If the problem returns back, I will double check the theory but
it seems only plausible explanation.

Current profile is:
51198     3.8267  lto1                     htab_find_slot_with_hash
19428     1.4521  lto1                     df_worklist_dataflow
17117     1.2794  lto1                     bitmap_set_bit_1
16122     1.2050  lto1                     df_note_compute
14601     1.0913  lto1                     htab_traverse_noresize
12909     0.9649  lto1                     record_reg_classes.constprop.1
12652     0.9457  lto1                     htab_find_with_hash
11987     0.8960  lto1                     ggc_internal_alloc_stat
11802     0.8821  lto1                     bitmap_clear_bit
10892     0.8141  lto1                     walk_tree_1
10197     0.7622  lto1                     bitmap_copy
8886      0.6642  lto1                     bitmap_bit_p_1
8658      0.6471  lto1                     et_splay

I am wondering if there are any obvious improvements for df_note_compute?
One thing I noticed is that it might be reorganized to walk reverse superblocks
avoid copying the live bitmaps all the time (it is also one of most busy callers
of bitmap_copy).  But I guess benefits of such a trick won't be that grand
and it is bit difficult to reorganize it since it is executed at all_blocks bitmap
(that is I believe always whole CFG) and anything that tries to walk superblocks
would result in random access to it.

Honza
Steven Bosscher June 12, 2010, 5:47 p.m. UTC | #5
On Sat, Jun 12, 2010 at 3:40 PM, Jan Hubicka <hubicka@ucw.cz> wrote:


> Everythign in the main dataflow loop seems performance critical, so one age
> is stored in vector and the last change age (that is accessed more times)
> in the AUX field of BB structure.

Why not store both in a vector?  I really dislike using the void*
pointer aux field for an integer value, casting back-and-forth from
(void*) to (size_t).


>        * df-problems.c (df_rd_confluence_n, df_lr_confluence_n, df_live_confluence_n,
>        df_byte_lr_confluence_n, df_md_confluence_n): Return true if something changed.
>        * df.h (df_confluence_function_n): Return bool.
>        * df-core.c (df_worklist_propagate_forward, df_worklist_propagate_backward):
>        track changes and ages.

Nit: s/track/Track/

You should also explain the algorithmic changes you're making. This
"age" trick is not found in text-book dataflow solvers, so it's not as
obvious as the existing code. Therefore, please add a not-too-small
comment explaining what the algorithm is doing. Probably belongs
somewhere before df_worklist_dataflow().


> -static void
> +static bool
>  df_lr_confluence_n (edge e)
...
> -  bitmap_ior_into (op1, &df->hardware_regs_used);
> +  return bitmap_ior_into (op1, &df->hardware_regs_used) || changed;

For readability I prefer,
  changed |= bitmap_ior_into (op1, &df->hardware_regs_used);
  return changed;


> +static bool
>  df_byte_lr_confluence_n (edge e)
...
> -    bitmap_ior_and_compl_into (op1, op2, &problem_data->invalidated_by_call);
> +    changed = bitmap_ior_and_compl_into (op1, op2, &problem_data->invalidated_by_call);

Long line, please break it.

>   else
> -    bitmap_ior_into (op1, op2);
> +    changed = bitmap_ior_into (op1, op2);
>
> -  bitmap_ior_into (op1, &problem_data->hardware_regs_used);
> +  return bitmap_ior_into (op1, &problem_data->hardware_regs_used) || changed;

Same comment as for df_lr_confluence_n: please "return changed;" on a
separate line.


> +static bool
>  df_md_confluence_n (edge e)
...
>   if (e->flags & EDGE_EH)
> -    bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset);
> +    return bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset);

Another long line.


>  /* Free all storage associated with the problem.  */
> Index: df.h
> ===================================================================
> --- df.h        (revision 160661)
> +++ df.h        (working copy)
> @@ -223,7 +223,7 @@ typedef void (*df_dataflow_function) (st
>  typedef void (*df_confluence_function_0) (basic_block);
>
>  /* Confluence operator for blocks with 1 or more out (or in) edges.  */
> -typedef void (*df_confluence_function_n) (edge);
> +typedef bool (*df_confluence_function_n) (edge);

Already mentioned: Please add a comment what the return value should be.


>  /* Transfer function for blocks.  */
>  typedef bool (*df_transfer_function) (int);
> Index: df-core.c
> ===================================================================
> --- df-core.c   (revision 160661)
> +++ df-core.c   (working copy)
> @@ -858,28 +858,35 @@ struct rtl_opt_pass pass_df_finish =
>    and set bits on for successors in PENDING
>    if the out set of the dataflow has changed. */
>
> -static void
> +static bool

Document return value in the pre-function comment.

>  df_worklist_propagate_forward (struct dataflow *dataflow,
>                                unsigned bb_index,
>                                unsigned *bbindex_to_postorder,
>                                bitmap pending,
> -                               sbitmap considered)
> +                               sbitmap considered,
> +                              size_t age)

Document the new parameter in the pre-function comment.


>  {
>   edge e;
>   edge_iterator ei;
>   basic_block bb = BASIC_BLOCK (bb_index);
> +  bool changed = !age;
>
>   /*  Calculate <conf_op> of incoming edges.  */
>   if (EDGE_COUNT (bb->preds) > 0)
>     FOR_EACH_EDGE (e, ei, bb->preds)
>       {
> -        if (TEST_BIT (considered, e->src->index))
> -          dataflow->problem->con_fun_n (e);
> +        if ((age <= (size_t)e->src->aux)

Yuck!


> +            && TEST_BIT (considered, e->src->index))
> +          changed |= dataflow->problem->con_fun_n (e);
>       }
>   else if (dataflow->problem->con_fun_0)
> -    dataflow->problem->con_fun_0 (bb);
> +    {
> +      dataflow->problem->con_fun_0 (bb);
> +      changed = true;

I guess it's not very important for what you're trying to achieve, but
why do you have to set changed = true always after con_fun_0? It makes
you run the transfer function and propagated "changed-ness" all over.
So IIUC, e.g. for a forward propagation that starts with a maximum set
you always propagate changed to all successors.

Am I missing something?


> -static void
> +static bool
>  df_worklist_propagate_backward (struct dataflow *dataflow,
>                                 unsigned bb_index,
>                                 unsigned *bbindex_to_postorder,
>                                 bitmap pending,
> -                                sbitmap considered)
> +                                sbitmap considered,
> +                               size_t age)

Return value and new age argument should be documented.


> +DEF_VEC_I(size_t);
> +DEF_VEC_ALLOC_I(size_t,heap);

> @@ -941,46 +960,65 @@ df_worklist_dataflow_doublequeue (struct
>                                  bitmap pending,
>                                   sbitmap considered,
>                                   int *blocks_in_postorder,
> -                                 unsigned *bbindex_to_postorder)
> +                                 unsigned *bbindex_to_postorder,
> +                                 int n_blocks)

Document n_blocks please.


Looks OK to me otherwise.

Ciao!
Steven
Steven Bosscher June 12, 2010, 5:49 p.m. UTC | #6
On Sat, Jun 12, 2010 at 7:44 PM, Jan Hubicka <hubicka@ucw.cz> wrote:

> I am wondering if there are any obvious improvements for df_note_compute?

You may get more benefit from avoiding the need to call
df_note_compute in the first place ;-)

Ciao!
Steven
Jan Hubicka June 12, 2010, 6:04 p.m. UTC | #7
> On Sat, Jun 12, 2010 at 3:40 PM, Jan Hubicka <hubicka@ucw.cz> wrote:
> 
> 
> > Everythign in the main dataflow loop seems performance critical, so one age
> > is stored in vector and the last change age (that is accessed more times)
> > in the AUX field of BB structure.
> 
> Why not store both in a vector?  I really dislike using the void*
> pointer aux field for an integer value, casting back-and-forth from
> (void*) to (size_t).

Because I don't want to pack memory cache misses.  We arrive to BB structure
and check it anyway, so bb->aux is in cache. Another vector would be not.
> 
> 
> >        * df-problems.c (df_rd_confluence_n, df_lr_confluence_n, df_live_confluence_n,
> >        df_byte_lr_confluence_n, df_md_confluence_n): Return true if something changed.
> >        * df.h (df_confluence_function_n): Return bool.
> >        * df-core.c (df_worklist_propagate_forward, df_worklist_propagate_backward):
> >        track changes and ages.
> 
> Nit: s/track/Track/
> 
> You should also explain the algorithmic changes you're making. This
> "age" trick is not found in text-book dataflow solvers, so it's not as
> obvious as the existing code. Therefore, please add a not-too-small

Hmm, I tought it is not my invention, but I don not remember seeing it anywhere.

Honza
Jan Hubicka June 12, 2010, 6:06 p.m. UTC | #8
> On Sat, Jun 12, 2010 at 7:44 PM, Jan Hubicka <hubicka@ucw.cz> wrote:
> 
> > I am wondering if there are any obvious improvements for df_note_compute?
> 
> You may get more benefit from avoiding the need to call
> df_note_compute in the first place ;-)

Yep, I wondered about that too.  IRA calls df_note_compute twice for reasons
I do not understand.  Otherwise it is forwprop, combine, regmove and other
passes that seem to make use of it.

You might know if i.e. forwprop can be easilly reorganized to avoid the need.

Honza
> 
> Ciao!
> Steven
Paolo Bonzini June 12, 2010, 6:08 p.m. UTC | #9
On 06/12/2010 08:06 PM, Jan Hubicka wrote:
>> On Sat, Jun 12, 2010 at 7:44 PM, Jan Hubicka<hubicka@ucw.cz>  wrote:
>>
>>> I am wondering if there are any obvious improvements for df_note_compute?
>>
>> You may get more benefit from avoiding the need to call
>> df_note_compute in the first place ;-)
>
> Yep, I wondered about that too.  IRA calls df_note_compute twice for reasons
> I do not understand.  Otherwise it is forwprop, combine, regmove and other
> passes that seem to make use of it.
>
> You might know if i.e. forwprop can be easilly reorganized to avoid the need.

fwprop uses notes to do forward simulation of liveness.  Since its two 
problems (LIVE and MD) are a forwards problem and a backwards problem, 
there's really no way to avoid that.

If you prefer, I can avoid the notes and go back to reaching 
definitions. :-P :-P

Paolo
Steven Bosscher June 12, 2010, 6:09 p.m. UTC | #10
On Sat, Jun 12, 2010 at 8:06 PM, Jan Hubicka <hubicka@ucw.cz> wrote:
>> On Sat, Jun 12, 2010 at 7:44 PM, Jan Hubicka <hubicka@ucw.cz> wrote:
>>
>> > I am wondering if there are any obvious improvements for df_note_compute?
>>
>> You may get more benefit from avoiding the need to call
>> df_note_compute in the first place ;-)
>
> Yep, I wondered about that too.  IRA calls df_note_compute twice for reasons
> I do not understand.  Otherwise it is forwprop, combine, regmove and other
> passes that seem to make use of it.
>
> You might know if i.e. forwprop can be easilly reorganized to avoid the need.

Notes are mostly needed for REG_UNUSED notes, for single_set. So I
think it is more a matter of:
1. creating a simpled version of single_set that looks at DF_INSN_DEFS
for a single non-clobber DEF (i.e. the single set)
2. accepting that regmove/fwprop do not handle multiple_sets insns.

Ciao!
Steven
Jan Hubicka June 12, 2010, 10:50 p.m. UTC | #11
Hi,
here is updated patch.
I noticed one more problem - df_worklist_propagate_forward sets bit for all blocks
to be recomputed into pending.  Some of them are however in worklist and thus
we process some blocks twice (and because we work in postorder it is most of them).

This is why the dataflow code collect so many profile samples for just walking
the control flow graph.  I fixed this by clearing the bit in pending list too.
This is not at all optimal, since setbit/clearbit is quite expensive.  We
should track the bit if the BB is already in one of worklists that I will play
with as followup.  The patch makes situation no-worse since it trades the new
clear_bit for the original clear_bit on worklist.

I think ideally we should set bit in worklist of all basic blocks whose index
is higher than index of current BB - think of function body consisting of
acyclic sequencence closed in loop.  First scan goes well as we process
everything in postorder.  Second scan however does not, we will process only
the first BB and then for each of them we will do swapping of worklist and
re-starting of scan that is slower than just walk of worklist.  This is bit
difficult to do as bitmap iterator does not allow changes in the bitmap it is
walking.

Well, since re-scanning is not that expensive, perhaps we can get around with
only the extra bit to avoid unnecesary bitmap traffic.

Current profile is as follows:
CPU: AMD64 family10, speed 800 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 750000
samples  %        symbol name
46017     4.1503  htab_find_slot_with_hash
14089     1.2707  bitmap_set_bit_1
13438     1.2120  df_note_compute
12353     1.1141  htab_traverse_noresize
11508     1.0379  df_worklist_dataflow
10687     0.9639  htab_find_with_hash
10395     0.9375  record_reg_classes.constprop.1
10368     0.9351  bitmap_clear_bit
9844      0.8878  ggc_internal_alloc_stat
8857      0.7988  walk_tree_1
8353      0.7534  bitmap_copy
7338      0.6618  bitmap_bit_p_1
7275      0.6561  bitmap_ior_into
7082      0.6387  constrain_operands
6887      0.6211  fast_dce

So worklist dataflow is almost twice as fast.

Bootstraped/regtested x86_64-linux, OK?

Honza
	
	* df-problems.c (df_rd_confluence_n, df_lr_confluence_n, df_live_confluence_n,
	df_byte_lr_confluence_n, df_md_confluence_n): Return true if something changed.
	* df.h (df_confluence_function_n): Return bool.
	* df-core.c (df_worklist_propagate_forward, df_worklist_propagate_backward):
	Track changes and ages.
	(df_worklist_dataflow_doublequeue): Use bitmap iterator for main walk;
	track ages.
	* dse.c (dse_confluence_n): Return always true.
Index: df-core.c
===================================================================
*** df-core.c	(revision 160663)
--- df-core.c	(working copy)
*************** struct rtl_opt_pass pass_df_finish =
*** 851,885 ****
     The general data flow analysis engine.
  ----------------------------------------------------------------------------*/
  
  
  /* Helper function for df_worklist_dataflow.
     Propagate the dataflow forward.
     Given a BB_INDEX, do the dataflow propagation
     and set bits on for successors in PENDING
!    if the out set of the dataflow has changed. */
  
! static void
  df_worklist_propagate_forward (struct dataflow *dataflow,
                                 unsigned bb_index,
                                 unsigned *bbindex_to_postorder,
                                 bitmap pending,
!                                sbitmap considered)
  {
    edge e;
    edge_iterator ei;
    basic_block bb = BASIC_BLOCK (bb_index);
  
    /*  Calculate <conf_op> of incoming edges.  */
    if (EDGE_COUNT (bb->preds) > 0)
      FOR_EACH_EDGE (e, ei, bb->preds)
        {
!         if (TEST_BIT (considered, e->src->index))
!           dataflow->problem->con_fun_n (e);
        }
    else if (dataflow->problem->con_fun_0)
      dataflow->problem->con_fun_0 (bb);
  
!   if (dataflow->problem->trans_fun (bb_index))
      {
        /* The out set of this block has changed.
           Propagate to the outgoing blocks.  */
--- 851,902 ----
     The general data flow analysis engine.
  ----------------------------------------------------------------------------*/
  
+ /* Return time BB when it was visited for last time.  */
+ #define BB_LAST_CHANGE_AGE(bb) ((ptrdiff_t)(bb)->aux)
  
  /* Helper function for df_worklist_dataflow.
     Propagate the dataflow forward.
     Given a BB_INDEX, do the dataflow propagation
     and set bits on for successors in PENDING
!    if the out set of the dataflow has changed.
  
!    AGE specify time when BB was visited last time.
!    AGE of 0 means we are visiting for first time and need to
!    compute transfer function to initialize datastructures.
!    Otherwise we re-do transfer function only if something change
!    while computing confluence functions.
!    We need to compute confluence only of basic block that are younger
!    then last visit of the BB.
! 
!    Return true if BB info has changed.  This is always the case
!    in the first visit.  */
! 
! static bool
  df_worklist_propagate_forward (struct dataflow *dataflow,
                                 unsigned bb_index,
                                 unsigned *bbindex_to_postorder,
                                 bitmap pending,
!                                sbitmap considered,
! 			       ptrdiff_t age)
  {
    edge e;
    edge_iterator ei;
    basic_block bb = BASIC_BLOCK (bb_index);
+   bool changed = !age;
  
    /*  Calculate <conf_op> of incoming edges.  */
    if (EDGE_COUNT (bb->preds) > 0)
      FOR_EACH_EDGE (e, ei, bb->preds)
        {
!         if (age <= BB_LAST_CHANGE_AGE (e->src)
! 	    && TEST_BIT (considered, e->src->index))
!           changed |= dataflow->problem->con_fun_n (e);
        }
    else if (dataflow->problem->con_fun_0)
      dataflow->problem->con_fun_0 (bb);
  
!   if (changed
!       && dataflow->problem->trans_fun (bb_index))
      {
        /* The out set of this block has changed.
           Propagate to the outgoing blocks.  */
*************** df_worklist_propagate_forward (struct da
*** 890,924 ****
            if (TEST_BIT (considered, ob_index))
              bitmap_set_bit (pending, bbindex_to_postorder[ob_index]);
          }
      }
  }
  
  
  /* Helper function for df_worklist_dataflow.
     Propagate the dataflow backward.  */
  
! static void
  df_worklist_propagate_backward (struct dataflow *dataflow,
                                  unsigned bb_index,
                                  unsigned *bbindex_to_postorder,
                                  bitmap pending,
!                                 sbitmap considered)
  {
    edge e;
    edge_iterator ei;
    basic_block bb = BASIC_BLOCK (bb_index);
  
    /*  Calculate <conf_op> of incoming edges.  */
    if (EDGE_COUNT (bb->succs) > 0)
      FOR_EACH_EDGE (e, ei, bb->succs)
        {
!         if (TEST_BIT (considered, e->dest->index))
!           dataflow->problem->con_fun_n (e);
        }
    else if (dataflow->problem->con_fun_0)
      dataflow->problem->con_fun_0 (bb);
  
!   if (dataflow->problem->trans_fun (bb_index))
      {
        /* The out set of this block has changed.
           Propagate to the outgoing blocks.  */
--- 907,947 ----
            if (TEST_BIT (considered, ob_index))
              bitmap_set_bit (pending, bbindex_to_postorder[ob_index]);
          }
+       return true;
      }
+   return false;
  }
  
  
  /* Helper function for df_worklist_dataflow.
     Propagate the dataflow backward.  */
  
! static bool
  df_worklist_propagate_backward (struct dataflow *dataflow,
                                  unsigned bb_index,
                                  unsigned *bbindex_to_postorder,
                                  bitmap pending,
!                                 sbitmap considered,
! 			        ptrdiff_t age)
  {
    edge e;
    edge_iterator ei;
    basic_block bb = BASIC_BLOCK (bb_index);
+   bool changed = !age;
  
    /*  Calculate <conf_op> of incoming edges.  */
    if (EDGE_COUNT (bb->succs) > 0)
      FOR_EACH_EDGE (e, ei, bb->succs)
        {
!         if (age <= BB_LAST_CHANGE_AGE (e->dest)
! 	    && TEST_BIT (considered, e->dest->index))
!           changed |= dataflow->problem->con_fun_n (e);
        }
    else if (dataflow->problem->con_fun_0)
      dataflow->problem->con_fun_0 (bb);
  
!   if (changed
!       && dataflow->problem->trans_fun (bb_index))
      {
        /* The out set of this block has changed.
           Propagate to the outgoing blocks.  */
*************** df_worklist_propagate_backward (struct d
*** 929,986 ****
            if (TEST_BIT (considered, ob_index))
              bitmap_set_bit (pending, bbindex_to_postorder[ob_index]);
          }
      }
  }
  
  
  
! /* This will free "pending". */
  
  static void
  df_worklist_dataflow_doublequeue (struct dataflow *dataflow,
  			  	  bitmap pending,
                                    sbitmap considered,
                                    int *blocks_in_postorder,
! 				  unsigned *bbindex_to_postorder)
  {
    enum df_flow_dir dir = dataflow->problem->dir;
    int dcount = 0;
    bitmap worklist = BITMAP_ALLOC (&df_bitmap_obstack);
  
    /* Double-queueing. Worklist is for the current iteration,
       and pending is for the next. */
    while (!bitmap_empty_p (pending))
      {
        /* Swap pending and worklist. */
        bitmap temp = worklist;
        worklist = pending;
        pending = temp;
  
!       do
  	{
- 	  int index;
  	  unsigned bb_index;
  	  dcount++;
  
! 	  index = bitmap_first_set_bit (worklist);
! 	  bitmap_clear_bit (worklist, index);
! 
  	  bb_index = blocks_in_postorder[index];
! 
  	  if (dir == DF_FORWARD)
! 	    df_worklist_propagate_forward (dataflow, bb_index,
! 					   bbindex_to_postorder,
! 					   pending, considered);
  	  else
! 	    df_worklist_propagate_backward (dataflow, bb_index,
! 					    bbindex_to_postorder,
! 					    pending, considered);
  	}
!       while (!bitmap_empty_p (worklist));
      }
  
    BITMAP_FREE (worklist);
    BITMAP_FREE (pending);
  
    /* Dump statistics. */
    if (dump_file)
--- 952,1044 ----
            if (TEST_BIT (considered, ob_index))
              bitmap_set_bit (pending, bbindex_to_postorder[ob_index]);
          }
+       return true;
      }
+   return false;
  }
  
+ /* Main dataflow solver loop.
  
+    DATAFLOW is problem we are solving, PENDING is worklist of basic blocks we
+    need to visit.
+    BLOCK_IN_POSTORDER is array of size N_BLOCKS specifying postorder in BBs and
+    BBINDEX_TO_POSTORDER is array mapping back BB->index to postorder possition.
+    PENDING will be freed.
  
!    The worklists are bitmaps indexed by postorder positions.  
! 
!    The function implements standard algorithm for dataflow solving with two
!    worklists (we are processing WORKLIST and storing new BBs to visit in
!    PENDING).
! 
!    As an optimization we maintain ages when BB was changed (stored in bb->aux)
!    and when it was last visited (stored in last_visit_age).  This avoids need
!    to re-do confluence function for edges to basic blocks whose source
!    did not change since destination was visited last time.  */
  
  static void
  df_worklist_dataflow_doublequeue (struct dataflow *dataflow,
  			  	  bitmap pending,
                                    sbitmap considered,
                                    int *blocks_in_postorder,
! 				  unsigned *bbindex_to_postorder,
! 				  int n_blocks)
  {
    enum df_flow_dir dir = dataflow->problem->dir;
    int dcount = 0;
    bitmap worklist = BITMAP_ALLOC (&df_bitmap_obstack);
+   int age = 0;
+   bool changed;
+   VEC(int, heap) *last_visit_age = NULL;
+   int prev_age;
+   basic_block bb;
+   int i;
+ 
+   VEC_safe_grow_cleared (int, heap, last_visit_age, n_blocks);
  
    /* Double-queueing. Worklist is for the current iteration,
       and pending is for the next. */
    while (!bitmap_empty_p (pending))
      {
+       bitmap_iterator bi;
+       unsigned int index;
+ 
        /* Swap pending and worklist. */
        bitmap temp = worklist;
        worklist = pending;
        pending = temp;
  
!       EXECUTE_IF_SET_IN_BITMAP (worklist, 0, index, bi)
  	{
  	  unsigned bb_index;
  	  dcount++;
  
! 	  bitmap_clear_bit (pending, index);
  	  bb_index = blocks_in_postorder[index];
! 	  bb = BASIC_BLOCK (bb_index);
! 	  prev_age = VEC_index (int, last_visit_age, index);
  	  if (dir == DF_FORWARD)
! 	    changed = df_worklist_propagate_forward (dataflow, bb_index,
! 						     bbindex_to_postorder,
! 						     pending, considered,
! 						     prev_age);
  	  else
! 	    changed = df_worklist_propagate_backward (dataflow, bb_index,
! 						      bbindex_to_postorder,
! 						      pending, considered,
! 						      prev_age);
! 	  VEC_replace (int, last_visit_age, index, ++age);
! 	  if (changed)
! 	    bb->aux = (void *)(ptrdiff_t)age;
  	}
!       bitmap_clear (worklist);
      }
+   for (i = 0; i < n_blocks; i++)
+     BASIC_BLOCK (blocks_in_postorder[i])->aux = NULL;
  
    BITMAP_FREE (worklist);
    BITMAP_FREE (pending);
+   VEC_free (int, heap, last_visit_age);
  
    /* Dump statistics. */
    if (dump_file)
*************** df_worklist_dataflow (struct dataflow *d
*** 1044,1051 ****
    /* Solve it.  */
    df_worklist_dataflow_doublequeue (dataflow, pending, considered,
  				    blocks_in_postorder,
! 				    bbindex_to_postorder);
! 
    sbitmap_free (considered);
    free (bbindex_to_postorder);
  }
--- 1102,1109 ----
    /* Solve it.  */
    df_worklist_dataflow_doublequeue (dataflow, pending, considered,
  				    blocks_in_postorder,
! 				    bbindex_to_postorder,
! 				    n_blocks);
    sbitmap_free (considered);
    free (bbindex_to_postorder);
  }
Index: dse.c
===================================================================
*** dse.c	(revision 160663)
--- dse.c	(working copy)
*************** dse_confluence_0 (basic_block bb)
*** 3396,3402 ****
     out set of the src of E.  If the various in or out sets are not
     there, that means they are all ones.  */
  
! static void
  dse_confluence_n (edge e)
  {
    bb_info_t src_info = bb_table[e->src->index];
--- 3396,3402 ----
     out set of the src of E.  If the various in or out sets are not
     there, that means they are all ones.  */
  
! static bool
  dse_confluence_n (edge e)
  {
    bb_info_t src_info = bb_table[e->src->index];
*************** dse_confluence_n (edge e)
*** 3412,3417 ****
--- 3412,3418 ----
  	  bitmap_copy (src_info->out, dest_info->in);
  	}
      }
+   return true;
  }
  
  
Index: df.h
===================================================================
*** df.h	(revision 160663)
--- df.h	(working copy)
*************** typedef void (*df_dataflow_function) (st
*** 222,231 ****
  /* Confluence operator for blocks with 0 out (or in) edges.  */
  typedef void (*df_confluence_function_0) (basic_block);
  
! /* Confluence operator for blocks with 1 or more out (or in) edges.  */
! typedef void (*df_confluence_function_n) (edge);
  
! /* Transfer function for blocks.  */
  typedef bool (*df_transfer_function) (int);
  
  /* Function to massage the information after the problem solving.  */
--- 222,233 ----
  /* Confluence operator for blocks with 0 out (or in) edges.  */
  typedef void (*df_confluence_function_0) (basic_block);
  
! /* Confluence operator for blocks with 1 or more out (or in) edges.
!    Return true if BB input data has changed.  */
! typedef bool (*df_confluence_function_n) (edge);
  
! /* Transfer function for blocks. 
!    Return true if BB output data has changed.  */
  typedef bool (*df_transfer_function) (int);
  
  /* Function to massage the information after the problem solving.  */
Index: df-problems.c
===================================================================
*** df-problems.c	(revision 160663)
--- df-problems.c	(working copy)
*************** df_rd_init_solution (bitmap all_blocks)
*** 479,492 ****
  
  /* In of target gets or of out of source.  */
  
! static void
  df_rd_confluence_n (edge e)
  {
    bitmap op1 = &df_rd_get_bb_info (e->dest->index)->in;
    bitmap op2 = &df_rd_get_bb_info (e->src->index)->out;
  
    if (e->flags & EDGE_FAKE)
!     return;
  
    if (e->flags & EDGE_EH)
      {
--- 479,493 ----
  
  /* In of target gets or of out of source.  */
  
! static bool
  df_rd_confluence_n (edge e)
  {
    bitmap op1 = &df_rd_get_bb_info (e->dest->index)->in;
    bitmap op2 = &df_rd_get_bb_info (e->src->index)->out;
+   bool changed = false;
  
    if (e->flags & EDGE_FAKE)
!     return false;
  
    if (e->flags & EDGE_EH)
      {
*************** df_rd_confluence_n (edge e)
*** 508,518 ****
   			      DF_DEFS_BEGIN (regno),
   			      DF_DEFS_COUNT (regno));
  	}
!       bitmap_ior_into (op1, &tmp);
        bitmap_clear (&tmp);
      }
    else
!     bitmap_ior_into (op1, op2);
  }
  
  
--- 509,520 ----
   			      DF_DEFS_BEGIN (regno),
   			      DF_DEFS_COUNT (regno));
  	}
!       changed |= bitmap_ior_into (op1, &tmp);
        bitmap_clear (&tmp);
+       return changed;
      }
    else
!     return bitmap_ior_into (op1, op2);
  }
  
  
*************** df_lr_confluence_0 (basic_block bb)
*** 966,986 ****
  
  /* Confluence function that ignores fake edges.  */
  
! static void
  df_lr_confluence_n (edge e)
  {
    bitmap op1 = &df_lr_get_bb_info (e->src->index)->out;
    bitmap op2 = &df_lr_get_bb_info (e->dest->index)->in;
  
    /* Call-clobbered registers die across exception and call edges.  */
    /* ??? Abnormal call edges ignored for the moment, as this gets
       confused by sibling call edges, which crashes reg-stack.  */
    if (e->flags & EDGE_EH)
!     bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset);
    else
!     bitmap_ior_into (op1, op2);
  
!   bitmap_ior_into (op1, &df->hardware_regs_used);
  }
  
  
--- 968,990 ----
  
  /* Confluence function that ignores fake edges.  */
  
! static bool
  df_lr_confluence_n (edge e)
  {
    bitmap op1 = &df_lr_get_bb_info (e->src->index)->out;
    bitmap op2 = &df_lr_get_bb_info (e->dest->index)->in;
+   bool changed = false;
  
    /* Call-clobbered registers die across exception and call edges.  */
    /* ??? Abnormal call edges ignored for the moment, as this gets
       confused by sibling call edges, which crashes reg-stack.  */
    if (e->flags & EDGE_EH)
!     changed = bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset);
    else
!     changed = bitmap_ior_into (op1, op2);
  
!   changed |= bitmap_ior_into (op1, &df->hardware_regs_used);
!   return changed;
  }
  
  
*************** df_live_init (bitmap all_blocks)
*** 1503,1518 ****
  
  /* Forward confluence function that ignores fake edges.  */
  
! static void
  df_live_confluence_n (edge e)
  {
    bitmap op1 = &df_live_get_bb_info (e->dest->index)->in;
    bitmap op2 = &df_live_get_bb_info (e->src->index)->out;
  
    if (e->flags & EDGE_FAKE)
!     return;
  
!   bitmap_ior_into (op1, op2);
  }
  
  
--- 1507,1522 ----
  
  /* Forward confluence function that ignores fake edges.  */
  
! static bool
  df_live_confluence_n (edge e)
  {
    bitmap op1 = &df_live_get_bb_info (e->dest->index)->in;
    bitmap op2 = &df_live_get_bb_info (e->src->index)->out;
  
    if (e->flags & EDGE_FAKE)
!     return false;
  
!   return bitmap_ior_into (op1, op2);
  }
  
  
*************** df_byte_lr_confluence_0 (basic_block bb)
*** 2710,2732 ****
  
  /* Confluence function that ignores fake edges.  */
  
! static void
  df_byte_lr_confluence_n (edge e)
  {
    struct df_byte_lr_problem_data *problem_data
      = (struct df_byte_lr_problem_data *)df_byte_lr->problem_data;
    bitmap op1 = &df_byte_lr_get_bb_info (e->src->index)->out;
    bitmap op2 = &df_byte_lr_get_bb_info (e->dest->index)->in;
  
    /* Call-clobbered registers die across exception and call edges.  */
    /* ??? Abnormal call edges ignored for the moment, as this gets
       confused by sibling call edges, which crashes reg-stack.  */
    if (e->flags & EDGE_EH)
!     bitmap_ior_and_compl_into (op1, op2, &problem_data->invalidated_by_call);
    else
!     bitmap_ior_into (op1, op2);
  
!   bitmap_ior_into (op1, &problem_data->hardware_regs_used);
  }
  
  
--- 2714,2739 ----
  
  /* Confluence function that ignores fake edges.  */
  
! static bool
  df_byte_lr_confluence_n (edge e)
  {
    struct df_byte_lr_problem_data *problem_data
      = (struct df_byte_lr_problem_data *)df_byte_lr->problem_data;
    bitmap op1 = &df_byte_lr_get_bb_info (e->src->index)->out;
    bitmap op2 = &df_byte_lr_get_bb_info (e->dest->index)->in;
+   bool changed = false;
  
    /* Call-clobbered registers die across exception and call edges.  */
    /* ??? Abnormal call edges ignored for the moment, as this gets
       confused by sibling call edges, which crashes reg-stack.  */
    if (e->flags & EDGE_EH)
!     changed = bitmap_ior_and_compl_into (op1, op2,
! 					 &problem_data->invalidated_by_call);
    else
!     changed = bitmap_ior_into (op1, op2);
  
!   changed |= bitmap_ior_into (op1, &problem_data->hardware_regs_used);
!   return changed;
  }
  
  
*************** df_md_confluence_0 (basic_block bb)
*** 4426,4444 ****
  
  /* In of target gets or of out of source.  */
  
! static void
  df_md_confluence_n (edge e)
  {
    bitmap op1 = &df_md_get_bb_info (e->dest->index)->in;
    bitmap op2 = &df_md_get_bb_info (e->src->index)->out;
  
    if (e->flags & EDGE_FAKE)
!     return;
  
    if (e->flags & EDGE_EH)
!     bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset);
    else
!     bitmap_ior_into (op1, op2);
  }
  
  /* Free all storage associated with the problem.  */
--- 4433,4452 ----
  
  /* In of target gets or of out of source.  */
  
! static bool
  df_md_confluence_n (edge e)
  {
    bitmap op1 = &df_md_get_bb_info (e->dest->index)->in;
    bitmap op2 = &df_md_get_bb_info (e->src->index)->out;
  
    if (e->flags & EDGE_FAKE)
!     return false;
  
    if (e->flags & EDGE_EH)
!     return bitmap_ior_and_compl_into (op1, op2,
! 				      regs_invalidated_by_call_regset);
    else
!     return bitmap_ior_into (op1, op2);
  }
  
  /* Free all storage associated with the problem.  */
Jan Hubicka June 17, 2010, 11:49 a.m. UTC | #12
> 	* df-problems.c (df_rd_confluence_n, df_lr_confluence_n, df_live_confluence_n,
> 	df_byte_lr_confluence_n, df_md_confluence_n): Return true if something changed.
> 	* df.h (df_confluence_function_n): Return bool.
> 	* df-core.c (df_worklist_propagate_forward, df_worklist_propagate_backward):
> 	Track changes and ages.
> 	(df_worklist_dataflow_doublequeue): Use bitmap iterator for main walk;
> 	track ages.
> 	* dse.c (dse_confluence_n): Return always true.

Ping...

Honza
Jan Hubicka June 22, 2010, 11 a.m. UTC | #13
> > 	* df-problems.c (df_rd_confluence_n, df_lr_confluence_n, df_live_confluence_n,
> > 	df_byte_lr_confluence_n, df_md_confluence_n): Return true if something changed.
> > 	* df.h (df_confluence_function_n): Return bool.
> > 	* df-core.c (df_worklist_propagate_forward, df_worklist_propagate_backward):
> > 	Track changes and ages.
> > 	(df_worklist_dataflow_doublequeue): Use bitmap iterator for main walk;
> > 	track ages.
> > 	* dse.c (dse_confluence_n): Return always true.
> 
> Ping...

Ping^2 :)

Honza
> 
> Honza
Paolo Bonzini June 22, 2010, 12:47 p.m. UTC | #14
On 06/22/2010 01:00 PM, Jan Hubicka wrote:
>>> 	* df-problems.c (df_rd_confluence_n, df_lr_confluence_n, df_live_confluence_n,
>>> 	df_byte_lr_confluence_n, df_md_confluence_n): Return true if something changed.
>>> 	* df.h (df_confluence_function_n): Return bool.
>>> 	* df-core.c (df_worklist_propagate_forward, df_worklist_propagate_backward):
>>> 	Track changes and ages.
>>> 	(df_worklist_dataflow_doublequeue): Use bitmap iterator for main walk;
>>> 	track ages.
>>> 	* dse.c (dse_confluence_n): Return always true.
>>
>> Ping...
>
> Ping^2 :)

If Steven doesn't complain, go ahead.

Paolo
Jan Hubicka June 22, 2010, 3:56 p.m. UTC | #15
>
> If Steven doesn't complain, go ahead.

Hi,
thanks, I've commited it now.  Can track Steven's comments incrementally, if any.
I would like to experiment with optimizing the dataflow worklist implementation now
(i.e. stealing a bit from age to make it cheap to test if the BB is already in queue
and perhaps switch from bitmap to array as worklist implementation).

The WHOPR build time went down to 9m36s now, 10s is due to Jakub's genattrtab
change, 16% improvmenet since the first WHOPR builds.

With inlining bitset/test I can get to 9m32s but I would first like to figure
out if some of most abusive bitmap users can't be better switched to something
else.

The following are passes taking over 1% of time on CC1 LTO link:

 garbage collection    :  11.32 ( 2%) usr   0.24 ( 3%) sys  11.59 ( 2%) wall       0 kB ( 0%) ggc
 ipa lto gimple in     :   7.23 ( 1%) usr   0.77 ( 9%) sys   8.68 ( 2%) wall  878440 kB (29%) ggc
 ipa lto decl in       :   4.77 ( 1%) usr   0.21 ( 3%) sys   4.98 ( 1%) wall  248056 kB ( 8%) ggc
 cfg cleanup           :   9.17 ( 2%) usr   0.02 ( 0%) sys   9.42 ( 2%) wall   30639 kB ( 1%) ggc
 trivially dead code   :   3.19 ( 1%) usr   0.00 ( 0%) sys   2.96 ( 1%) wall       0 kB ( 0%) ggc
 df reaching defs      :   4.64 ( 1%) usr   0.03 ( 0%) sys   4.75 ( 1%) wall       0 kB ( 0%) ggc
 df live regs          :  24.17 ( 5%) usr   0.02 ( 0%) sys  24.22 ( 5%) wall       0 kB ( 0%) ggc
 df live&initialized regs:  13.53 ( 3%) usr   0.03 ( 0%) sys  13.73 ( 3%) wall       0 kB ( 0%) ggc
 df use-def / def-use chains:   2.70 ( 1%) usr   0.02 ( 0%) sys   2.50 ( 0%) wall       0 kB ( 0%) ggc
 df reg dead/unused notes:   9.41 ( 2%) usr   0.05 ( 1%) sys   9.44 ( 2%) wall   68255 kB ( 2%) ggc
 register information  :   3.12 ( 1%) usr   0.01 ( 0%) sys   3.04 ( 1%) wall       0 kB ( 0%) ggc
 alias analysis        :   9.25 ( 2%) usr   0.03 ( 0%) sys   9.24 ( 2%) wall  197016 kB ( 7%) ggc
 alias stmt walking    :   6.75 ( 1%) usr   0.79 (10%) sys   7.36 ( 1%) wall   18244 kB ( 1%) ggc
 integration           :   7.86 ( 2%) usr   0.55 ( 7%) sys   8.41 ( 2%) wall  722881 kB (24%) ggc
 tree CFG cleanup      :   6.69 ( 1%) usr   0.05 ( 1%) sys   6.71 ( 1%) wall   17282 kB ( 1%) ggc
 tree VRP              :  10.64 ( 2%) usr   0.32 ( 4%) sys  10.85 ( 2%) wall  295619 kB (10%) ggc
 tree PTA              :   4.34 ( 1%) usr   0.01 ( 0%) sys   4.84 ( 1%) wall   40842 kB ( 1%) ggc
 tree SSA rewrite      :   2.94 ( 1%) usr   0.04 ( 0%) sys   2.95 ( 1%) wall   52184 kB ( 2%) ggc
 tree SSA incremental  :   6.18 ( 1%) usr   0.30 ( 4%) sys   6.08 ( 1%) wall   53161 kB ( 2%) ggc
 tree operand scan     :   2.97 ( 1%) usr   1.33 (16%) sys   4.11 ( 1%) wall  442136 kB (15%) ggc
 dominator optimization:   5.28 ( 1%) usr   0.06 ( 1%) sys   5.43 ( 1%) wall  116917 kB ( 4%) ggc
 tree PRE              :  26.72 ( 5%) usr   0.32 ( 4%) sys  26.66 ( 5%) wall  211068 kB ( 7%) ggc
 tree FRE              :   5.33 ( 1%) usr   0.25 ( 3%) sys   6.19 ( 1%) wall   20183 kB ( 1%) ggc
 tree slp vectorization:   3.44 ( 1%) usr   0.06 ( 1%) sys   3.83 ( 1%) wall  277483 kB ( 9%) ggc
 dominance computation :   5.51 ( 1%) usr   0.04 ( 0%) sys   5.75 ( 1%) wall       0 kB ( 0%) ggc
 expand                :  42.65 ( 9%) usr   0.37 ( 5%) sys  43.32 ( 9%) wall  915669 kB (31%) ggc
 forward prop          :   4.34 ( 1%) usr   0.03 ( 0%) sys   4.95 ( 1%) wall   64467 kB ( 2%) ggc
 CSE                   :  11.46 ( 2%) usr   0.02 ( 0%) sys  11.53 ( 2%) wall   18427 kB ( 1%) ggc
 dead store elim1      :   3.67 ( 1%) usr   0.04 ( 0%) sys   4.05 ( 1%) wall   41282 kB ( 1%) ggc
 dead store elim2      :   3.68 ( 1%) usr   0.03 ( 0%) sys   3.82 ( 1%) wall   48489 kB ( 2%) ggc
 CPROP                 :   9.85 ( 2%) usr   0.03 ( 0%) sys   9.98 ( 2%) wall   90860 kB ( 3%) ggc
 PRE                   :  11.78 ( 2%) usr   0.04 ( 0%) sys  11.37 ( 2%) wall   14087 kB ( 0%) ggc
 CSE 2                 :   6.38 ( 1%) usr   0.00 ( 0%) sys   6.25 ( 1%) wall   11012 kB ( 0%) ggc
 combiner              :  14.21 ( 3%) usr   0.03 ( 0%) sys  14.32 ( 3%) wall  234301 kB ( 8%) ggc
 if-conversion         :   3.39 ( 1%) usr   0.01 ( 0%) sys   3.13 ( 1%) wall   29919 kB ( 1%) ggc
 integrated RA         :  27.45 ( 6%) usr   0.02 ( 0%) sys  27.35 ( 5%) wall  119575 kB ( 4%) ggc
 reload                :  12.18 ( 2%) usr   0.02 ( 0%) sys  12.24 ( 2%) wall   39450 kB ( 1%) ggc
 reload CSE regs       :   8.64 ( 2%) usr   0.05 ( 1%) sys   8.59 ( 2%) wall  106855 kB ( 4%) ggc
 hard reg cprop        :   3.14 ( 1%) usr   0.01 ( 0%) sys   3.17 ( 1%) wall    2226 kB ( 0%) ggc
 scheduling 2          :  14.65 ( 3%) usr   0.02 ( 0%) sys  13.78 ( 3%) wall    5666 kB ( 0%) ggc
 final                 :   9.01 ( 2%) usr   0.34 ( 4%) sys  11.21 ( 2%) wall  140923 kB ( 5%) ggc
 symout                :   6.34 ( 1%) usr   0.34 ( 4%) sys   6.52 ( 1%) wall  390733 kB (13%) ggc
 variable tracking     :  51.66 (10%) usr   0.05 ( 1%) sys  52.09 (10%) wall  360699 kB (12%) ggc
 TOTAL                 : 494.52             8.22           505.99            2984350 kB

It seems that df is still one of lowest hanging fruits.  Just liveness related
stuff is about 10% of compile time.

Honza
diff mbox

Patch

Index: df-problems.c
===================================================================
--- df-problems.c	(revision 160661)
+++ df-problems.c	(working copy)
@@ -479,14 +480,15 @@  df_rd_init_solution (bitmap all_blocks)
 
 /* In of target gets or of out of source.  */
 
-static void
+static bool
 df_rd_confluence_n (edge e)
 {
   bitmap op1 = &df_rd_get_bb_info (e->dest->index)->in;
   bitmap op2 = &df_rd_get_bb_info (e->src->index)->out;
+  bool changed = false;
 
   if (e->flags & EDGE_FAKE)
-    return;
+    return false;
 
   if (e->flags & EDGE_EH)
     {
@@ -508,11 +510,12 @@  df_rd_confluence_n (edge e)
  			      DF_DEFS_BEGIN (regno),
  			      DF_DEFS_COUNT (regno));
 	}
-      bitmap_ior_into (op1, &tmp);
+      changed |= bitmap_ior_into (op1, &tmp);
       bitmap_clear (&tmp);
+      return changed;
     }
   else
-    bitmap_ior_into (op1, op2);
+    return bitmap_ior_into (op1, op2);
 }
 
 
@@ -966,21 +969,22 @@  df_lr_confluence_0 (basic_block bb)
 
 /* Confluence function that ignores fake edges.  */
 
-static void
+static bool
 df_lr_confluence_n (edge e)
 {
   bitmap op1 = &df_lr_get_bb_info (e->src->index)->out;
   bitmap op2 = &df_lr_get_bb_info (e->dest->index)->in;
+  bool changed = false;
 
   /* Call-clobbered registers die across exception and call edges.  */
   /* ??? Abnormal call edges ignored for the moment, as this gets
      confused by sibling call edges, which crashes reg-stack.  */
   if (e->flags & EDGE_EH)
-    bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset);
+    changed = bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset);
   else
-    bitmap_ior_into (op1, op2);
+    changed = bitmap_ior_into (op1, op2);
 
-  bitmap_ior_into (op1, &df->hardware_regs_used);
+  return bitmap_ior_into (op1, &df->hardware_regs_used) || changed;
 }
 
 
@@ -1503,16 +1507,16 @@  df_live_init (bitmap all_blocks)
 
 /* Forward confluence function that ignores fake edges.  */
 
-static void
+static bool
 df_live_confluence_n (edge e)
 {
   bitmap op1 = &df_live_get_bb_info (e->dest->index)->in;
   bitmap op2 = &df_live_get_bb_info (e->src->index)->out;
 
   if (e->flags & EDGE_FAKE)
-    return;
+    return false;
 
-  bitmap_ior_into (op1, op2);
+  return bitmap_ior_into (op1, op2);
 }
 
 
@@ -2710,23 +2714,24 @@  df_byte_lr_confluence_0 (basic_block bb)
 
 /* Confluence function that ignores fake edges.  */
 
-static void
+static bool
 df_byte_lr_confluence_n (edge e)
 {
   struct df_byte_lr_problem_data *problem_data
     = (struct df_byte_lr_problem_data *)df_byte_lr->problem_data;
   bitmap op1 = &df_byte_lr_get_bb_info (e->src->index)->out;
   bitmap op2 = &df_byte_lr_get_bb_info (e->dest->index)->in;
+  bool changed = false;
 
   /* Call-clobbered registers die across exception and call edges.  */
   /* ??? Abnormal call edges ignored for the moment, as this gets
      confused by sibling call edges, which crashes reg-stack.  */
   if (e->flags & EDGE_EH)
-    bitmap_ior_and_compl_into (op1, op2, &problem_data->invalidated_by_call);
+    changed = bitmap_ior_and_compl_into (op1, op2, &problem_data->invalidated_by_call);
   else
-    bitmap_ior_into (op1, op2);
+    changed = bitmap_ior_into (op1, op2);
 
-  bitmap_ior_into (op1, &problem_data->hardware_regs_used);
+  return bitmap_ior_into (op1, &problem_data->hardware_regs_used) || changed;
 }
 
 
@@ -4426,19 +4379,19 @@  df_md_confluence_0 (basic_block bb)
 
 /* In of target gets or of out of source.  */
 
-static void
+static bool
 df_md_confluence_n (edge e)
 {
   bitmap op1 = &df_md_get_bb_info (e->dest->index)->in;
   bitmap op2 = &df_md_get_bb_info (e->src->index)->out;
 
   if (e->flags & EDGE_FAKE)
-    return;
+    return false;
 
   if (e->flags & EDGE_EH)
-    bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset);
+    return bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset);
   else
-    bitmap_ior_into (op1, op2);
+    return bitmap_ior_into (op1, op2);
 }
 
 /* Free all storage associated with the problem.  */
Index: df.h
===================================================================
--- df.h	(revision 160661)
+++ df.h	(working copy)
@@ -223,7 +223,7 @@  typedef void (*df_dataflow_function) (st
 typedef void (*df_confluence_function_0) (basic_block);
 
 /* Confluence operator for blocks with 1 or more out (or in) edges.  */
-typedef void (*df_confluence_function_n) (edge);
+typedef bool (*df_confluence_function_n) (edge);
 
 /* Transfer function for blocks.  */
 typedef bool (*df_transfer_function) (int);
Index: df-core.c
===================================================================
--- df-core.c	(revision 160661)
+++ df-core.c	(working copy)
@@ -858,28 +858,35 @@  struct rtl_opt_pass pass_df_finish =
    and set bits on for successors in PENDING
    if the out set of the dataflow has changed. */
 
-static void
+static bool
 df_worklist_propagate_forward (struct dataflow *dataflow,
                                unsigned bb_index,
                                unsigned *bbindex_to_postorder,
                                bitmap pending,
-                               sbitmap considered)
+                               sbitmap considered,
+			       size_t age)
 {
   edge e;
   edge_iterator ei;
   basic_block bb = BASIC_BLOCK (bb_index);
+  bool changed = !age;
 
   /*  Calculate <conf_op> of incoming edges.  */
   if (EDGE_COUNT (bb->preds) > 0)
     FOR_EACH_EDGE (e, ei, bb->preds)
       {
-        if (TEST_BIT (considered, e->src->index))
-          dataflow->problem->con_fun_n (e);
+        if ((age <= (size_t)e->src->aux)
+	     && TEST_BIT (considered, e->src->index))
+          changed |= dataflow->problem->con_fun_n (e);
       }
   else if (dataflow->problem->con_fun_0)
-    dataflow->problem->con_fun_0 (bb);
+    {
+      dataflow->problem->con_fun_0 (bb);
+      changed = true;
+    }
 
-  if (dataflow->problem->trans_fun (bb_index))
+  if (changed
+      && dataflow->problem->trans_fun (bb_index))
     {
       /* The out set of this block has changed.
          Propagate to the outgoing blocks.  */
@@ -890,35 +897,44 @@  df_worklist_propagate_forward (struct da
           if (TEST_BIT (considered, ob_index))
             bitmap_set_bit (pending, bbindex_to_postorder[ob_index]);
         }
+      return true;
     }
+  return false;
 }
 
 
 /* Helper function for df_worklist_dataflow.
    Propagate the dataflow backward.  */
 
-static void
+static bool
 df_worklist_propagate_backward (struct dataflow *dataflow,
                                 unsigned bb_index,
                                 unsigned *bbindex_to_postorder,
                                 bitmap pending,
-                                sbitmap considered)
+                                sbitmap considered,
+			        size_t age)
 {
   edge e;
   edge_iterator ei;
   basic_block bb = BASIC_BLOCK (bb_index);
+  bool changed = !age;
 
   /*  Calculate <conf_op> of incoming edges.  */
   if (EDGE_COUNT (bb->succs) > 0)
     FOR_EACH_EDGE (e, ei, bb->succs)
       {
-        if (TEST_BIT (considered, e->dest->index))
-          dataflow->problem->con_fun_n (e);
+        if ((age <= (size_t)e->dest->aux)
+	     && TEST_BIT (considered, e->dest->index))
+          changed |= dataflow->problem->con_fun_n (e);
       }
   else if (dataflow->problem->con_fun_0)
-    dataflow->problem->con_fun_0 (bb);
+    {
+      dataflow->problem->con_fun_0 (bb);
+      changed = true;
+    }
 
-  if (dataflow->problem->trans_fun (bb_index))
+  if (changed
+      && dataflow->problem->trans_fun (bb_index))
     {
       /* The out set of this block has changed.
          Propagate to the outgoing blocks.  */
@@ -929,10 +945,13 @@  df_worklist_propagate_backward (struct d
           if (TEST_BIT (considered, ob_index))
             bitmap_set_bit (pending, bbindex_to_postorder[ob_index]);
         }
+      return true;
     }
+  return false;
 }
 
-
+DEF_VEC_I(size_t);
+DEF_VEC_ALLOC_I(size_t,heap);
 
 /* This will free "pending". */
 
@@ -941,46 +960,65 @@  df_worklist_dataflow_doublequeue (struct
 			  	  bitmap pending,
                                   sbitmap considered,
                                   int *blocks_in_postorder,
-				  unsigned *bbindex_to_postorder)
+				  unsigned *bbindex_to_postorder,
+				  int n_blocks)
 {
   enum df_flow_dir dir = dataflow->problem->dir;
   int dcount = 0;
   bitmap worklist = BITMAP_ALLOC (&df_bitmap_obstack);
+  size_t age = 0;
+  bool changed;
+  VEC(size_t, heap) *last_age = NULL;
+  size_t prev_age;
+  basic_block bb;
+  int i;
+
+  VEC_safe_grow_cleared (size_t, heap, last_age, n_blocks);
 
   /* Double-queueing. Worklist is for the current iteration,
      and pending is for the next. */
   while (!bitmap_empty_p (pending))
     {
+      bitmap_iterator bi;
+      unsigned int index;
+
       /* Swap pending and worklist. */
       bitmap temp = worklist;
       worklist = pending;
       pending = temp;
 
-      do
+      EXECUTE_IF_SET_IN_BITMAP (worklist, 0, index, bi)
 	{
-	  int index;
 	  unsigned bb_index;
 	  dcount++;
 
-	  index = bitmap_first_set_bit (worklist);
-	  bitmap_clear_bit (worklist, index);
-
 	  bb_index = blocks_in_postorder[index];
-
+	  bb = BASIC_BLOCK (bb_index);
+	  prev_age = VEC_index (size_t, last_age, index);
 	  if (dir == DF_FORWARD)
-	    df_worklist_propagate_forward (dataflow, bb_index,
-					   bbindex_to_postorder,
-					   pending, considered);
+	    changed = df_worklist_propagate_forward (dataflow, bb_index,
+						     bbindex_to_postorder,
+						     pending, considered,
+						     prev_age);
 	  else
-	    df_worklist_propagate_backward (dataflow, bb_index,
-					    bbindex_to_postorder,
-					    pending, considered);
+	    changed = df_worklist_propagate_backward (dataflow, bb_index,
+						      bbindex_to_postorder,
+						      pending, considered,
+						      prev_age);
+	  age++;
+	  if (changed)
+	    bb->aux = (void *)age;
+	  VEC_replace (size_t, last_age, index, age);
+	  age++;
 	}
-      while (!bitmap_empty_p (worklist));
+      bitmap_clear (worklist);
     }
+  for (i = 0; i < n_blocks; i++)
+    BASIC_BLOCK (blocks_in_postorder[i])->aux = NULL;
 
   BITMAP_FREE (worklist);
   BITMAP_FREE (pending);
+  VEC_free (size_t, heap, last_age);
 
   /* Dump statistics. */
   if (dump_file)
@@ -1044,8 +1082,8 @@  df_worklist_dataflow (struct dataflow *d
   /* Solve it.  */
   df_worklist_dataflow_doublequeue (dataflow, pending, considered,
 				    blocks_in_postorder,
-				    bbindex_to_postorder);
-
+				    bbindex_to_postorder,
+				    n_blocks);
   sbitmap_free (considered);
   free (bbindex_to_postorder);
 }
Index: dse.c
===================================================================
--- dse.c	(revision 160661)
+++ dse.c	(working copy)
@@ -3396,7 +3396,7 @@  dse_confluence_0 (basic_block bb)
    out set of the src of E.  If the various in or out sets are not
    there, that means they are all ones.  */
 
-static void
+static bool
 dse_confluence_n (edge e)
 {
   bb_info_t src_info = bb_table[e->src->index];
@@ -3412,6 +3412,7 @@  dse_confluence_n (edge e)
 	  bitmap_copy (src_info->out, dest_info->in);
 	}
     }
+  return true;
 }