diff mbox

Time profiler - phase 2

Message ID CAObPJ3OBCKZthHZQFM-jMQjiNvD-xYXqSJKD7-f+1ptHtGyNLg@mail.gmail.com
State New
Headers show

Commit Message

Martin Liška Dec. 15, 2013, 11:07 p.m. UTC
There's latest version of the patch.
Could you please approve the patch?

Martin

On 5 December 2013 22:32, Jan Hubicka <hubicka@ucw.cz> wrote:
>> Hello,
>>    thank you for the trick in ipa-split.c. It really helped! I
>
> Good!, this patch is pre-approved after testing.
>> prepared 2 tests for Inkscape, first was just with my function
>> reordering pass. And for the second, I enable also
>> -freorder-blocks-and-partition (note: files ending with _blocks.xxx in
>> attached tar).
>>
>> Touched pages:
>> just reorder functions: 1108
>> with -freorder-blocks-and-partition: 1120
>>
>> Please see dumps at:
>> https://drive.google.com/file/d/0B0pisUJ80pO1R19zSXR6U1Q4NmM/edit?usp=sharing
>> Note: I put all function to a single section (for easier layout orientation).
>
> I think for -freorder-blocks-and-partition you will need to split the sections
> again, otherwise we will not see the effect of the pass. Can you, please, make
> one extra systemtap with this (it would be good to have both
> -fno-reorder-blocks-and-partition and -freorder-blocks-and-partition so we can
> compare size of both)?
> But overall it looks good!
>
> Honza
>>
>> If I'm correct, there is a small chunk of about 10 functions after the
>> boundary of untouched functions and a single miss at the end of
>> binary: __libc_csu_init.
>> If you look at the graph line, it looks really well.
>>
>> I will prepare patch for mailing list as a phase 2.
>>
>> Martin
>>
>> On 5 December 2013 14:38, Jan Hubicka <hubicka@ucw.cz> wrote:
>> >> Can you, please, send me the -flto systemtaps for gimp and/or inkscape so we can decide
>> >> on the patch? We should enable it earlier in stage3 rather than later.
>> >
>> > I see, the PDF was included within the tar file.  Was this with
>> > -freorder-blocks-and-partition?  If so, the patch is OK.
>> > I still think we should put cold parts of hot/normal function into a subsection different
>> > from unlikely section, but lets handle that incrementally.
>> >
>> > Thanks,
>> > Honza
>> >>
>> >> Honza

Comments

Jan Hubicka Dec. 15, 2013, 11:21 p.m. UTC | #1
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 93e857df..d5a0ac8 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,14 @@
> +2013-12-15  Martin Liska  <marxin.liska@gmail.com>
> +	    Jan Hubicka  <jh@suse.cz>
> +
> +	* cgraphunit.c (node_cmp): New function.
> +	(expand_all_functions): Function ordering added.
> +	* common.opt: New profile based function reordering flag introduced.
> +	* lto-partition.c: Support for time profile added.
> +	* lto.c: Likewise.
> +	* predict.c (handle_missing_profiles): Time profile handled in
> +	  missing profiles.
> +

OK, thanks, with the changes below!
(I thought this patch was already in! Also please be careful about
applying the changes - it seems that in the previous commit you
omitted some)
> @@ -1842,11 +1859,14 @@ expand_function (struct cgraph_node *node)
>     to use subsections to make the output functions appear in top-down
>     order).  */
>  
> +
Bogus whitespace
>  static void
>  expand_all_functions (void)
>  {
>    struct cgraph_node *node;
>    struct cgraph_node **order = XCNEWVEC (struct cgraph_node *, cgraph_n_nodes);
> +
> +  unsigned int expanded_func_count = 0, profiled_func_count = 0;
>    int order_pos, new_order_pos = 0;
>    int i;
>  
> @@ -1859,20 +1879,39 @@ expand_all_functions (void)
>      if (order[i]->process)
>        order[new_order_pos++] = order[i];
>  
> +  if (flag_profile_reorder_functions)
> +    qsort (order, new_order_pos, sizeof (struct cgraph_node *), node_cmp);
> +
>    for (i = new_order_pos - 1; i >= 0; i--)
>      {
>        node = order[i];
> +
>        if (node->process)
>  	{
> +     expanded_func_count++;
> +     if(node->tp_first_run)
> +       profiled_func_count++;
> +
> +    if (cgraph_dump_file)
> +      fprintf (cgraph_dump_file, "Time profile order in expand_all_functions:%s:%d\n", node->asm_name (), node->tp_first_run);
> +
>  	  node->process = 0;
>  	  expand_function (node);
>  	}
>      }
> +
> +    if (in_lto_p && dump_file)
> +      fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
> +               main_input_filename, profiled_func_count, expanded_func_count);
> +
> +  if (cgraph_dump_file && flag_profile_reorder_functions && in_lto_p)
> +    fprintf (cgraph_dump_file, "Expanded functions with time profile:%u/%u\n",
> +             profiled_func_count, expanded_func_count);

Make the dumps unconditional, I do not see why they should be in_lto_p here.
> @@ -689,7 +713,6 @@ lto_balanced_map (void)
>  	  best_i = i;
>  	  best_n_nodes = lto_symtab_encoder_size (partition->encoder);
>  	  best_total_size = total_size;
> -	  best_varpool_pos = varpool_pos;
>  	}
>        if (cgraph_dump_file)
>  	fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i "
> @@ -707,7 +730,6 @@ lto_balanced_map (void)
>  		fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n",
>  			 i - best_i, best_i);
>  	      undo_partition (partition, best_n_nodes);
> -	      varpool_pos = best_varpool_pos;
>  	    }
>  	  i = best_i;
>   	  /* When we are finished, avoid creating empty partition.  */

I already asked you to remove these changes - they revert earlier fix.

> diff --git a/gcc/predict.c b/gcc/predict.c
> index a5ad34f..1826a06 100644
> --- a/gcc/predict.c
> +++ b/gcc/predict.c
> @@ -2839,12 +2839,24 @@ handle_missing_profiles (void)
>      {
>        struct cgraph_edge *e;
>        gcov_type call_count = 0;
> +      gcov_type max_tp_first_run = 0;
>        struct function *fn = DECL_STRUCT_FUNCTION (node->decl);
>  
>        if (node->count)
>          continue;
>        for (e = node->callers; e; e = e->next_caller)
> +      {
>          call_count += e->count;
> +
> +	if (e->caller->tp_first_run > max_tp_first_run)
> +	  max_tp_first_run = e->caller->tp_first_run;
> +      }
> +
> +      /* If time profile is missing, let assign the maximum that comes from
> +	 caller functions.  */
> +      if (!node->tp_first_run && max_tp_first_run)
> +	node->tp_first_run = max_tp_first_run + 1;
> +

I believe you also need minimizing node->tp_first_run in ipa_merge_profiles.
>        if (call_count
>            && fn && fn->cfg
>            && (call_count * unlikely_count_fraction >= profile_info->runs))
> diff --git a/gcc/varasm.c b/gcc/varasm.c
> index 5c5025a..f34946c 100644
> --- a/gcc/varasm.c
> +++ b/gcc/varasm.c
> @@ -552,7 +552,14 @@ default_function_section (tree decl, enum node_frequency freq,
>       unlikely executed (this happens especially with function splitting
>       where we can split away unnecessary parts of static constructors.  */
>    if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
> -    return get_named_text_section (decl, ".text.startup", NULL);
> +  {
> +    /* If we do have a profile or(and) LTO phase is executed, we do not need
> +    these ELF section.  */
> +    if (!in_lto_p || !flag_profile_values)
> +      return get_named_text_section (decl, ".text.startup", NULL);
> +    else
> +      return NULL;
> +  }
>  
>    /* Similarly for exit.  */
>    if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
> @@ -564,7 +571,10 @@ default_function_section (tree decl, enum node_frequency freq,
>        case NODE_FREQUENCY_UNLIKELY_EXECUTED:
>  	return get_named_text_section (decl, ".text.unlikely", NULL);
>        case NODE_FREQUENCY_HOT:
> -	return get_named_text_section (decl, ".text.hot", NULL);
> +        /* If we do have a profile or(and) LTO phase is executed, we do not need
> +           these ELF section.  */
> +        if (!in_lto_p || !flag_profile_values)
> +          return get_named_text_section (decl, ".text.hot", NULL);
>        default:
>  	return NULL;
Please duplicate these changes into config/darwin.c that has identical code in it.

OK with those changes.

Honza
diff mbox

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 93e857df..d5a0ac8 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,14 @@ 
+2013-12-15  Martin Liska  <marxin.liska@gmail.com>
+	    Jan Hubicka  <jh@suse.cz>
+
+	* cgraphunit.c (node_cmp): New function.
+	(expand_all_functions): Function ordering added.
+	* common.opt: New profile based function reordering flag introduced.
+	* lto-partition.c: Support for time profile added.
+	* lto.c: Likewise.
+	* predict.c (handle_missing_profiles): Time profile handled in
+	  missing profiles.
+
 2013-12-14   Jan Hubicka  <jh@suse.cz>
 
 	PR middle-end/58477
diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
index 44f3afd..2b66331 100644
--- a/gcc/cgraphunit.c
+++ b/gcc/cgraphunit.c
@@ -1831,6 +1831,23 @@  expand_function (struct cgraph_node *node)
   ipa_remove_all_references (&node->ref_list);
 }
 
+/* Node comparer that is responsible for the order that corresponds
+   to time when a function was launched for the first time.  */
+
+static int
+node_cmp (const void *pa, const void *pb)
+{
+  const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
+  const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
+
+  /* Functions with time profile must be before these without profile.  */
+  if (!a->tp_first_run || !b->tp_first_run)
+    return a->tp_first_run - b->tp_first_run;
+
+  return a->tp_first_run != b->tp_first_run
+	 ? b->tp_first_run - a->tp_first_run
+	 : b->order - a->order;
+}
 
 /* Expand all functions that must be output.
 
@@ -1842,11 +1859,14 @@  expand_function (struct cgraph_node *node)
    to use subsections to make the output functions appear in top-down
    order).  */
 
+
 static void
 expand_all_functions (void)
 {
   struct cgraph_node *node;
   struct cgraph_node **order = XCNEWVEC (struct cgraph_node *, cgraph_n_nodes);
+
+  unsigned int expanded_func_count = 0, profiled_func_count = 0;
   int order_pos, new_order_pos = 0;
   int i;
 
@@ -1859,20 +1879,39 @@  expand_all_functions (void)
     if (order[i]->process)
       order[new_order_pos++] = order[i];
 
+  if (flag_profile_reorder_functions)
+    qsort (order, new_order_pos, sizeof (struct cgraph_node *), node_cmp);
+
   for (i = new_order_pos - 1; i >= 0; i--)
     {
       node = order[i];
+
       if (node->process)
 	{
+     expanded_func_count++;
+     if(node->tp_first_run)
+       profiled_func_count++;
+
+    if (cgraph_dump_file)
+      fprintf (cgraph_dump_file, "Time profile order in expand_all_functions:%s:%d\n", node->asm_name (), node->tp_first_run);
+
 	  node->process = 0;
 	  expand_function (node);
 	}
     }
+
+    if (in_lto_p && dump_file)
+      fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
+               main_input_filename, profiled_func_count, expanded_func_count);
+
+  if (cgraph_dump_file && flag_profile_reorder_functions && in_lto_p)
+    fprintf (cgraph_dump_file, "Expanded functions with time profile:%u/%u\n",
+             profiled_func_count, expanded_func_count);
+
   cgraph_process_new_functions ();
   free_gimplify_stack ();
 
   free (order);
-
 }
 
 /* This is used to sort the node types by the cgraph order number.  */
@@ -2194,6 +2233,7 @@  compile (void)
 #endif
 
   cgraph_state = CGRAPH_STATE_EXPANSION;
+
   if (!flag_toplevel_reorder)
     output_in_order ();
   else
diff --git a/gcc/common.opt b/gcc/common.opt
index 0cd1fdd..ea323fd 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1712,6 +1712,10 @@  fprofile-report
 Common Report Var(profile_report)
 Report on consistency of profile
 
+fprofile-reorder-functions
+Common Report Var(flag_profile_reorder_functions)
+Enable function reordering that improves code placement
+
 frandom-seed
 Common Var(common_deferred_options) Defer
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index b655a64..b30a368 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -394,7 +394,7 @@  Objective-C and Objective-C++ Dialects}.
 -fprefetch-loop-arrays -fprofile-report @gol
 -fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol
 -fprofile-generate=@var{path} @gol
--fprofile-use -fprofile-use=@var{path} -fprofile-values @gol
+-fprofile-use -fprofile-use=@var{path} -fprofile-values -fprofile-reorder-functions @gol
 -freciprocal-math -free -frename-registers -freorder-blocks @gol
 -freorder-blocks-and-partition -freorder-functions @gol
 -frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
@@ -9071,6 +9071,14 @@  from profiling values of expressions for usage in optimizations.
 
 Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
 
+@item -fprofile-reoder-functions
+@opindex fprofile-reorder-functions
+Function reordering based on profile instrumentation collects
+first time of execution of a function and orders these functions
+in ascending order.
+
+Enabled with @option{-fprofile-use}.
+
 @item -fvpt
 @opindex fvpt
 If combined with @option{-fprofile-arcs}, this option instructs the compiler
diff --git a/gcc/ipa-split.c b/gcc/ipa-split.c
index 390adf1..db056c8 100644
--- a/gcc/ipa-split.c
+++ b/gcc/ipa-split.c
@@ -1234,6 +1234,10 @@  split_function (struct split_point *split_point)
 				     !split_part_return_p,
 				     split_point->split_bbs,
 				     split_point->entry_bb, "part");
+
+  /* Let's take a time profile for splitted function.  */
+  node->tp_first_run = cur_node->tp_first_run + 1;
+
   /* For usual cloning it is enough to clear builtin only when signature
      changes.  For partial inlining we however can not expect the part
      of builtin implementation to have same semantic as the whole.  */
diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c
index 5b46af9..8119b11 100644
--- a/gcc/lto/lto-partition.c
+++ b/gcc/lto/lto-partition.c
@@ -286,9 +286,11 @@  add_symbol_to_partition (ltrans_partition part, symtab_node *node)
 
      Be lax about comdats; they may or may not be duplicated and we may
      end up in need to duplicate keyed comdat because it has unkeyed alias.  */
+
   gcc_assert (get_symbol_class (node) == SYMBOL_DUPLICATE
 	      || DECL_COMDAT (node->decl)
 	      || !symbol_partitioned_p (node));
+
   add_symbol_to_partition_1 (part, node);
 }
 
@@ -401,6 +403,25 @@  node_cmp (const void *pa, const void *pb)
 {
   const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
   const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
+
+  /* Profile reorder flag enables function reordering based on first execution
+     of a function. All functions with profile are placed in ascending
+     order at the beginning.  */
+
+  if (flag_profile_reorder_functions)
+  {
+    /* Functions with time profile are sorted in ascending order.  */
+    if (a->tp_first_run && b->tp_first_run)
+      return a->tp_first_run != b->tp_first_run
+	? a->tp_first_run - b->tp_first_run
+        : a->order - b->order;
+
+    /* Functions with time profile are sorted before the functions
+       that do not have the profile.  */
+    if (a->tp_first_run || b->tp_first_run)
+      return b->tp_first_run - a->tp_first_run;
+  }
+
   return b->order - a->order;
 }
 
@@ -455,7 +476,7 @@  void
 lto_balanced_map (void)
 {
   int n_nodes = 0;
-  int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0;
+  int n_varpool_nodes = 0, varpool_pos = 0;
   struct cgraph_node **order = XNEWVEC (struct cgraph_node *, cgraph_max_uid);
   varpool_node **varpool_order = NULL;
   int i;
@@ -487,10 +508,13 @@  lto_balanced_map (void)
      get better about minimizing the function bounday, but until that
      things works smoother if we order in source order.  */
   qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
+
+  if (cgraph_dump_file)
+    for(i = 0; i < n_nodes; i++)
+      fprintf (cgraph_dump_file, "Balanced map symbol order:%s:%u\n", order[i]->name (), order[i]->tp_first_run);
+
   if (!flag_toplevel_reorder)
     {
-      qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
-
       FOR_EACH_VARIABLE (vnode)
 	if (get_symbol_class (vnode) == SYMBOL_PARTITION)
 	  n_varpool_nodes++;
@@ -689,7 +713,6 @@  lto_balanced_map (void)
 	  best_i = i;
 	  best_n_nodes = lto_symtab_encoder_size (partition->encoder);
 	  best_total_size = total_size;
-	  best_varpool_pos = varpool_pos;
 	}
       if (cgraph_dump_file)
 	fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i "
@@ -707,7 +730,6 @@  lto_balanced_map (void)
 		fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n",
 			 i - best_i, best_i);
 	      undo_partition (partition, best_n_nodes);
-	      varpool_pos = best_varpool_pos;
 	    }
 	  i = best_i;
  	  /* When we are finished, avoid creating empty partition.  */
@@ -855,7 +877,7 @@  may_need_named_section_p (lto_symtab_encoder_t encoder, symtab_node *node)
    of the same name in partition ENCODER (or in whole compilation unit if
    ENCODER is NULL) and if so, mangle the statics.  Always mangle all
    conflicting statics, so we reduce changes of silently miscompiling
-   asm statemnets referring to them by symbol name.  */
+   asm statements referring to them by symbol name.  */
 
 static void
 rename_statics (lto_symtab_encoder_t encoder, symtab_node *node)
diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
index f322a00..8e5eeb3 100644
--- a/gcc/lto/lto.c
+++ b/gcc/lto/lto.c
@@ -2503,9 +2503,12 @@  lto_wpa_write_files (void)
   /* Sort partitions by size so small ones are compiled last.
      FIXME: Even when not reordering we may want to output one list for parallel make
      and other for final link command.  */
-  ltrans_partitions.qsort (flag_toplevel_reorder
+
+  if (!flag_profile_reorder_functions || !flag_profile_use)
+    ltrans_partitions.qsort (flag_toplevel_reorder
 			   ? cmp_partitions_size
 			   : cmp_partitions_order);
+
   for (i = 0; i < n_sets; i++)
     {
       size_t len;
diff --git a/gcc/opts.c b/gcc/opts.c
index 4cb2cdf..5be03fa 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -1710,6 +1710,8 @@  common_handle_option (struct gcc_options *opts,
 	opts->x_flag_vect_cost_model = VECT_COST_MODEL_DYNAMIC;
       if (!opts_set->x_flag_tree_loop_distribute_patterns)
 	opts->x_flag_tree_loop_distribute_patterns = value;
+      if (!opts_set->x_flag_profile_reorder_functions)
+	opts->x_flag_profile_reorder_functions = value;
       /* Indirect call profiling should do all useful transformations
  	 speculative devirtualization does.  */
       if (!opts_set->x_flag_devirtualize_speculatively
diff --git a/gcc/predict.c b/gcc/predict.c
index a5ad34f..1826a06 100644
--- a/gcc/predict.c
+++ b/gcc/predict.c
@@ -2839,12 +2839,24 @@  handle_missing_profiles (void)
     {
       struct cgraph_edge *e;
       gcov_type call_count = 0;
+      gcov_type max_tp_first_run = 0;
       struct function *fn = DECL_STRUCT_FUNCTION (node->decl);
 
       if (node->count)
         continue;
       for (e = node->callers; e; e = e->next_caller)
+      {
         call_count += e->count;
+
+	if (e->caller->tp_first_run > max_tp_first_run)
+	  max_tp_first_run = e->caller->tp_first_run;
+      }
+
+      /* If time profile is missing, let assign the maximum that comes from
+	 caller functions.  */
+      if (!node->tp_first_run && max_tp_first_run)
+	node->tp_first_run = max_tp_first_run + 1;
+
       if (call_count
           && fn && fn->cfg
           && (call_count * unlikely_count_fraction >= profile_info->runs))
diff --git a/gcc/varasm.c b/gcc/varasm.c
index 5c5025a..f34946c 100644
--- a/gcc/varasm.c
+++ b/gcc/varasm.c
@@ -552,7 +552,14 @@  default_function_section (tree decl, enum node_frequency freq,
      unlikely executed (this happens especially with function splitting
      where we can split away unnecessary parts of static constructors.  */
   if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
-    return get_named_text_section (decl, ".text.startup", NULL);
+  {
+    /* If we do have a profile or(and) LTO phase is executed, we do not need
+    these ELF section.  */
+    if (!in_lto_p || !flag_profile_values)
+      return get_named_text_section (decl, ".text.startup", NULL);
+    else
+      return NULL;
+  }
 
   /* Similarly for exit.  */
   if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
@@ -564,7 +571,10 @@  default_function_section (tree decl, enum node_frequency freq,
       case NODE_FREQUENCY_UNLIKELY_EXECUTED:
 	return get_named_text_section (decl, ".text.unlikely", NULL);
       case NODE_FREQUENCY_HOT:
-	return get_named_text_section (decl, ".text.hot", NULL);
+        /* If we do have a profile or(and) LTO phase is executed, we do not need
+           these ELF section.  */
+        if (!in_lto_p || !flag_profile_values)
+          return get_named_text_section (decl, ".text.hot", NULL);
       default:
 	return NULL;
     }