diff mbox

[google] New fdo summary-based icache sensitive unrolling (issue6282045)

Message ID 20120603063000.394AA61578@tjsboxrox.mtv.corp.google.com
State New
Headers show

Commit Message

Teresa Johnson June 3, 2012, 6:30 a.m. UTC
This patch adds new program summary information to the gcov
profile files that indicate how many profiled counts compose
the majority of the program's execution time. This is used to
provide an indication of the overall code size of the program.

The new profile summary information is then used to guide
codesize based unroll and peel decisions, to prevent those
optimizations from increasing code size too much when the
program may be sensitive to icache effects. Previously this
was done via code size estimates computed during tree optimization
from dynamic IPA (LIPO) partial call graphs, and thus only
kicked in for LIPO builds in large module groupings. The new
method allows the heuristic to be applied for regular FDO
compiles as well, and is globally applied to all FDO compiled
objects.

The old LIPO-specific portion of the previous approach is
reverted.

Bootstrapped and tested on x86_64-unknown-linux-gnu. Ok for google
branches?

Thanks,
Teresa

2012-06-01   Teresa Johnson  <tejohnson@google.com>

	* libgcc/libgcov.c (sort_by_reverse_gcov_value): New function.
	(gcov_compute_cutoff_values): Ditto.
        (gcov_merge_gcda_file): Merge new summary information.
	(gcov_exit_init): Call gcov_compute_cutoff_values.
	* gcc/doc/invoke.texi: Rename -fripa-peel-size-limit
        and -fripa-unroll-size-limit to -fpeel-codesize-limit and
        -funroll-codesize-limit, respectively. Remove
        codesize-hotness-threshold param and add unrollpeel-hotness-threshold
        param.
	* gcc/gcov-io.c (gcov_write_summary): Write new summary info.
	(gcov_read_summary): Read new summary info.
	* gcc/gcov-io.h (GCOV_TAG_SUMMARY_LENGTH): Update for new summary info.
        (struct gcov_ctr_summary): Add new summary info: sum_cutoff_percent
        and num_to_cutoff.
	* gcc/loop-unroll.c (limit_code_size): Update to use new summary info
        and refine for very hot loops.
	(decide_unrolling_and_peeling, decide_unroll_runtime_iterations): Ditto.
	(decide_peel_simple, decide_unroll_stupid): Ditto.
	* gcc/coverage.c (read_counts_file): Propagate new summary info.
	* gcc/common.opt: Rename -fripa-peel-size-limit
        and -fripa-unroll-size-limit to -fpeel-codesize-limit and
        -funroll-codesize-limit, respectively.
	* gcc/tree-optimize.c (cgraph_codesize_estimate): Remove.
	(compute_codesize_estimate): Remove.
	(execute_cleanup_cfg_post_optimizing): Remove call to
        compute_codesize_estimate.
	* gcc/params.def (PARAM_UNROLLPEEL_CODESIZE_THRESHOLD): Remove.
        (PARAM_UNROLLPEEL_HOTNESS_THRESHOLD): Add.
	* gcc/gcov-dump.c (tag_summary): Dump new summary info.


--
This patch is available for review at http://codereview.appspot.com/6282045
diff mbox

Patch

Index: libgcc/libgcov.c
===================================================================
--- libgcc/libgcov.c	(revision 188119)
+++ libgcc/libgcov.c	(working copy)
@@ -795,6 +795,104 @@  gcov_sort_topn_counter_arrays (const struct gcov_i
      }
 }
 
+/* Used by qsort to sort gcov values in descending order.  */
+
+static int
+sort_by_reverse_gcov_value (const void *pa, const void *pb)
+{
+  const gcov_type a = *(gcov_type const *)pa;
+  const gcov_type b = *(gcov_type const *)pb;
+
+  if (b > a)
+    return 1;
+  else if (b == a)
+    return 0;
+  else
+    return -1;
+}
+
+/* Determines the number of counters required to cover a given percentage
+   of the total sum of execution counts in the summary, which is then also
+   recorded in SUM.  */
+
+static void
+gcov_compute_cutoff_values (struct gcov_summary *sum)
+{
+  struct gcov_info *gi_ptr;
+  const struct gcov_fn_info *gfi_ptr;
+  const struct gcov_ctr_info *ci_ptr;
+  struct gcov_ctr_summary *cs_ptr;
+  unsigned t_ix, f_ix, i, ctr_info_ix, index;
+  gcov_unsigned_t c_num;
+  gcov_type *value_array;
+  gcov_type cum, cum_cutoff;
+  char *cutoff_str;
+  unsigned cutoff_perc;
+
+#define CUM_CUTOFF_PERCENT_TIMES_10 999
+  cutoff_str = getenv ("GCOV_HOTCODE_CUTOFF_TIMES_10");
+  if (cutoff_str && strlen (cutoff_str))
+    cutoff_perc = atoi (cutoff_str);
+  else
+    cutoff_perc = CUM_CUTOFF_PERCENT_TIMES_10;
+
+  for (t_ix = 0; t_ix < GCOV_COUNTERS_SUMMABLE; t_ix++)
+    {
+      /* First check if there are any counts recorded for this counter.  */
+      cs_ptr = &(sum->ctrs[t_ix]);
+      if (!cs_ptr->num)
+        continue;
+
+      /* Determine the cumulative counter value at the specified cutoff
+         percentage and record the percentage for use by gcov consumers.  */
+      cum_cutoff = (cs_ptr->sum_all * cutoff_perc)/1000;
+      cs_ptr->sum_cutoff_percent = cutoff_perc;
+
+      /* Next, walk through all the per-object structures and save each of
+         the count values in value_array.  */
+      index = 0;
+      value_array = (gcov_type *) malloc (sizeof(gcov_type)*cs_ptr->num);
+      for (gi_ptr = __gcov_list; gi_ptr; gi_ptr = gi_ptr->next)
+        {
+          /* Find the appropriate index into the gcov_ctr_info array
+             for the counter we are currently working on based on the
+             existence of the merge function pointer for this object.  */
+          for (i = 0, ctr_info_ix = 0; i < t_ix; i++)
+            {
+              if (gi_ptr->merge[i])
+                ctr_info_ix++;
+            }
+          for (f_ix = 0; f_ix != gi_ptr->n_functions; f_ix++)
+            {
+              gfi_ptr = gi_ptr->functions[f_ix];
+
+              if (!gfi_ptr || gfi_ptr->key != gi_ptr)
+                continue;
+
+              ci_ptr = &gfi_ptr->ctrs[ctr_info_ix];
+              /* Sanity check that there are enough entries in value_arry
+                for this function's counters.  */
+              gcc_assert (index + ci_ptr->num <= cs_ptr->num);
+              /* Copy over this function's counter values.  */
+              memcpy (&value_array[index], ci_ptr->values,
+                      sizeof (gcov_type) * ci_ptr->num);
+              index += ci_ptr->num;
+            }
+        }
+
+      /* Sort all the counter values by descending value and finally
+         accumulate the values from hottest on down until reaching
+         the cutoff value computed earlier.  */
+      qsort (value_array, cs_ptr->num, sizeof (gcov_type),
+             sort_by_reverse_gcov_value);
+      for (cum = 0, c_num = 0; c_num < cs_ptr->num && cum < cum_cutoff; c_num++)
+        cum += value_array[c_num];
+      /* Record the number of counters required to reach the cutoff value.  */
+      cs_ptr->num_to_cutoff = c_num;
+      free (value_array);
+    }
+}
+
 /* Compute object summary recored in gcov_info INFO. The result is
    stored in OBJ_SUM. Note that the caller is responsible for
    zeroing out OBJ_SUM, otherwise the summary is accumulated.  */
@@ -994,7 +1092,11 @@  rewrite:;
         if (gi_ptr->merge[t_ix])
           {
             if (!cs_prg->runs++)
-              cs_prg->num = cs_tprg->num;
+              {
+                cs_prg->num = cs_tprg->num;
+                cs_prg->sum_cutoff_percent = cs_tprg->sum_cutoff_percent;
+                cs_prg->num_to_cutoff = cs_tprg->num_to_cutoff;
+              }
             cs_prg->sum_all += cs_tprg->sum_all;
             if (cs_prg->run_max < cs_tprg->run_max)
               cs_prg->run_max = cs_tprg->run_max;
@@ -1160,6 +1262,7 @@  gcov_exit_init (void)
          is FDO/LIPO.  */
       dump_module_info |= gi_ptr->mod_info->is_primary;
     }
+  gcov_compute_cutoff_values (&this_program);
 
   gcov_alloc_filename ();
 
Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi	(revision 188119)
+++ gcc/doc/invoke.texi	(working copy)
@@ -392,7 +392,7 @@  Objective-C and Objective-C++ Dialects}.
 -fno-sched-interblock -fno-sched-spec -fno-signed-zeros @gol
 -fno-toplevel-reorder -fno-trapping-math -fno-zero-initialized-in-bss @gol
 -fomit-frame-pointer -foptimize-register-move -foptimize-sibling-calls @gol
--fpartial-inlining -fpeel-loops -fpredictive-commoning @gol
+-fpartial-inlining -fpeel-codesize-limit -fpeel-loops -fpredictive-commoning @gol
 -fprefetch-loop-arrays @gol
 -fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol
 -fprofile-generate=@var{path} -fprofile-generate-sampling @gol
@@ -404,8 +404,8 @@  Objective-C and Objective-C++ Dialects}.
 -freorder-blocks-and-partition -freorder-functions @gol
 -frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
 -fripa -fripa-disallow-asm-modules -fripa-disallow-opt-mismatch @gol
--fripa-inc-path-sub=@var{path_mapping} -fripa-no-promote-always-inline-func -fripa-verbose @gol
--fripa-peel-size-limit -fripa-unroll-size-limit -frounding-math @gol
+-fripa-inc-path-sub=@var{path_mapping} -fripa-no-promote-always-inline-func @gol
+-fripa-verbose -frounding-math @gol
 -fsched2-use-superblocks -fsched-pressure @gol
 -fsched-spec-load -fsched-spec-load-dangerous @gol
 -fsched-stalled-insns-dep[=@var{n}] -fsched-stalled-insns[=@var{n}] @gol
@@ -428,7 +428,7 @@  Objective-C and Objective-C++ Dialects}.
 -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-pta -ftree-reassoc @gol
 -ftree-sink -ftree-sra -ftree-switch-conversion -ftree-tail-merge @gol
 -ftree-ter -ftree-vect-loop-version -ftree-vectorize -ftree-vrp @gol
--funit-at-a-time -funroll-all-loops -funroll-loops @gol
+-funit-at-a-time -funroll-all-loops -funroll-loops -funroll-codesize-limit @gol
 -funsafe-loop-optimizations -funsafe-math-optimizations -funswitch-loops @gol
 -fvariable-expansion-in-unroller -fvect-cost-model -fvpt -fweb @gol
 -fwhole-program -fwpa -fuse-ld -fuse-linker-plugin @gol
@@ -8424,20 +8424,6 @@  Do not promote static functions with always inline
 Enable printing of verbose information about dynamic inter-procedural optimizations.
 This is used in conjunction with the @option{-fripa}.
 
-@item -fripa-peel-size-limit
-@opindex fripa-peel-size-limit
-Limit loop peeling of non-const non-FP loops in a LIPO compilation under estimates
-of a large code footprint. Enabled by default under @option{-fripa}. Code size
-estimation and thresholds are controlled by the @option{codesize-hotness-threshold}
-and @option{unrollpeel-codesize-threshold} parameters.
-
-@item -fripa-unroll-size-limit
-@opindex fripa-unroll-size-limit
-Limit loop unrolling of non-const non-FP loops in a LIPO compilation under estimates
-of a large code footprint. Enabled by default under @option{-fripa}. Code size
-estimation and thresholds are controlled by the @option{codesize-hotness-threshold}
-and @option{unrollpeel-codesize-threshold} parameters.
-
 @item -fcallgraph-profiles-sections
 @opindex fcallgraph-profiles-sections
 Emit call graph edge profile counts in .note.callgraph.text sections. This is
@@ -8774,6 +8760,14 @@  the loop is entered.  This usually makes programs
 @option{-funroll-all-loops} implies the same options as
 @option{-funroll-loops}.
 
+@item -funroll-codesize-limit
+@opindex funroll-codesize-limit
+Limit loop unrolling of non-const non-FP loops in a profile feedback compilation
+under estimates of a large code footprint. Enabled by default with
+@option{-fprofile-use}. Code size and execution weight thresholds are controlled
+by the @option{unrollpeel-codesize-threshold} and
+@option{unrollpeel-hotness-threshold} parameters.
+
 @item -fpeel-loops
 @opindex fpeel-loops
 Peels loops for which there is enough information that they do not
@@ -8782,6 +8776,14 @@  roll much (from profile feedback).  It also turns
 
 Enabled with @option{-fprofile-use}.
 
+@item -fpeel-codesize-limit
+@opindex fpeel-codesize-limit
+Limit loop peeling of non-const non-FP loops in a profile feedback compilation
+under estimates of a large code footprint. Enabled by default with
+@option{-fprofile-use}. Code size and execution weight thresholds are controlled
+by the @option{unrollpeel-codesize-threshold} and
+@option{unrollpeel-hotness-threshold} parameters.
+
 @item -fmove-loop-invariants
 @opindex fmove-loop-invariants
 Enables the loop invariant motion pass in the RTL loop optimizer.  Enabled
@@ -9168,13 +9170,15 @@  hot loops. Its default value is 16.
 @item max-completely-peel-loop-nest-depth
 The maximum depth of a loop nest suitable for complete peeling.
 
-@item codesize-hotness-threshold
-The minimum profile count of basic blocks to look at when estimating
-the code size footprint of the call graph in a LIPO compile.
-
 @item unrollpeel-codesize-threshold
-Maximum LIPO code size footprint estimate for loop unrolling and peeling.
+Maximum profile-based code size footprint estimate for loop unrolling and
+peeling.
 
+@item unrollpeel-hotness-threshold
+Maximum ratio of total execution count to loop entry block count under which
+most profile-based code size estimates will be ignored for loop unrolling and
+peeling.
+
 @item max-unswitch-insns
 The maximum number of insns of an unswitched loop.
 
Index: gcc/gcov-io.c
===================================================================
--- gcc/gcov-io.c	(revision 188119)
+++ gcc/gcov-io.c	(working copy)
@@ -520,6 +520,8 @@  gcov_write_summary (gcov_unsigned_t tag, const str
   for (csum = summary->ctrs, ix = GCOV_COUNTERS_SUMMABLE; ix--; csum++)
     {
       gcov_write_unsigned (csum->num);
+      gcov_write_unsigned (csum->sum_cutoff_percent);
+      gcov_write_unsigned (csum->num_to_cutoff);
       gcov_write_unsigned (csum->runs);
       gcov_write_counter (csum->sum_all);
       gcov_write_counter (csum->run_max);
@@ -637,6 +639,8 @@  gcov_read_summary (struct gcov_summary *summary)
   for (csum = summary->ctrs, ix = GCOV_COUNTERS_SUMMABLE; ix--; csum++)
     {
       csum->num = gcov_read_unsigned ();
+      csum->sum_cutoff_percent = gcov_read_unsigned ();
+      csum->num_to_cutoff = gcov_read_unsigned ();
       csum->runs = gcov_read_unsigned ();
       csum->sum_all = gcov_read_counter ();
       csum->run_max = gcov_read_counter ();
Index: gcc/gcov-io.h
===================================================================
--- gcc/gcov-io.h	(revision 188119)
+++ gcc/gcov-io.h	(working copy)
@@ -440,7 +440,7 @@  typedef HOST_WIDEST_INT gcov_type;
 #define GCOV_TAG_OBJECT_SUMMARY  ((gcov_unsigned_t)0xa1000000) /* Obsolete */
 #define GCOV_TAG_PROGRAM_SUMMARY ((gcov_unsigned_t)0xa3000000)
 #define GCOV_TAG_SUMMARY_LENGTH  \
-	(1 + GCOV_COUNTERS_SUMMABLE * (2 + 3 * 2))
+	(1 + GCOV_COUNTERS_SUMMABLE * (4 + 3 * 2))
 #define GCOV_TAG_MODULE_INFO ((gcov_unsigned_t)0xa4000000)
 #define GCOV_TAG_PMU_LOAD_LATENCY_INFO ((gcov_unsigned_t)0xa5000000)
 #define GCOV_TAG_PMU_LOAD_LATENCY_LENGTH(filename)  \
@@ -541,6 +541,9 @@  typedef HOST_WIDEST_INT gcov_type;
 struct gcov_ctr_summary
 {
   gcov_unsigned_t num;		/* number of counters.  */
+  gcov_unsigned_t sum_cutoff_percent;/* sum_all cutoff percentage computed
+                                        (times 10).  */
+  gcov_unsigned_t num_to_cutoff;/* number of counters to reach above cutoff.  */
   gcov_unsigned_t runs;		/* number of program runs */
   gcov_type sum_all;		/* sum of all counters accumulated.  */
   gcov_type run_max;		/* maximum value on a single run.  */
Index: gcc/loop-unroll.c
===================================================================
--- gcc/loop-unroll.c	(revision 188119)
+++ gcc/loop-unroll.c	(working copy)
@@ -35,6 +35,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "recog.h"
 #include "target.h"
 #include "diagnostic.h"
+#include "gcov-io.h"
 
 /* This pass performs loop unrolling and peeling.  We only perform these
    optimizations on innermost loops (with single exception) because
@@ -128,12 +129,12 @@  struct opt_info
 
 static void decide_unrolling_and_peeling (int);
 static void peel_loops_completely (int);
-static void decide_peel_simple (struct loop *, int);
+static void decide_peel_simple (struct loop *, int, int);
 static void decide_peel_once_rolling (struct loop *, int);
 static void decide_peel_completely (struct loop *, int);
-static void decide_unroll_stupid (struct loop *, int);
+static void decide_unroll_stupid (struct loop *, int, int);
 static void decide_unroll_constant_iterations (struct loop *, int);
-static void decide_unroll_runtime_iterations (struct loop *, int);
+static void decide_unroll_runtime_iterations (struct loop *, int, int);
 static void peel_loop_simple (struct loop *);
 static void peel_loop_completely (struct loop *);
 static void unroll_loop_stupid (struct loop *);
@@ -189,36 +190,75 @@  typedef enum {
   LIMIT_BOTH = 0x3
 } limit_type;
 
-extern int cgraph_codesize_estimate;
-
 /* Determine whether LOOP unrolling/peeling should be constrained based
-   on code footprint estimates. */
+   on code footprint estimates. If so, sets *CODESIZE_DIVISOR to the
+   codesize-based amount divided into the max instructions in an unrolled
+   or peeled loop.  */
 static limit_type
-limit_code_size(struct loop *loop)
+limit_code_size(struct loop *loop, int *codesize_divisor)
 {
   unsigned size_threshold;
   limit_type result = NO_LIMIT;
   int result_int = 0;
   struct niter_desc *desc = get_simple_loop_desc (loop);
+  gcov_type sum_to_header_ratio;
+  int hotness_ratio_threshold;
 
-  if (!flag_dyn_ipa)
+  /* The codesize-based unroll factor divisor is initialized to 1 so that
+     unroll/peel factors are not reduced by default.  */
+  *codesize_divisor = 1;
+
+  if (!flag_unroll_codesize_limit && !flag_peel_codesize_limit)
     return NO_LIMIT;
 
-  gcc_assert (cgraph_codesize_estimate >= 0);
+  /* First check if the application has a large codesize footprint.
+     This is estimated from FDO profile summary information for the
+     program, where the num_to_cutoff indicates the number of hottest
+     counters (blocks) that compose most of the execution time of
+     the program. A large value would indicate a large flat execution
+     profile where icache misses may be a concern.  */
+  size_threshold = PARAM_VALUE (PARAM_UNROLLPEEL_CODESIZE_THRESHOLD);
+  if (!profile_info
+      || profile_info->num_to_cutoff <= size_threshold)
+    return NO_LIMIT;
 
+  /* Next, exclude some loops where unrolling/peeling may be more
+     important to overall performance.  */
+
   /* Ignore FP loops, which are more likely to benefit heavily from
      unrolling. */
   if (desc->has_fp)
     return NO_LIMIT;
 
-  size_threshold = PARAM_VALUE (PARAM_UNROLLPEEL_CODESIZE_THRESHOLD);
-  if (cgraph_codesize_estimate <= (int)size_threshold)
-    return NO_LIMIT;
+  gcc_assert(profile_info->sum_all > 0);
+  /* Next, set the value of the codesize-based unroll factor divisor which in
+     most loops will need to be set to a value that will reduce or eliminate
+     unrolling/peeling.  */
+  if (profile_info->num_to_cutoff < size_threshold*2) {
+    /* For appliations that are less than twice the codesize limit, allow
+       limited unrolling for very hot loops.  */
+    sum_to_header_ratio = profile_info->sum_all / loop->header->count;
+    hotness_ratio_threshold = PARAM_VALUE (PARAM_UNROLLPEEL_HOTNESS_THRESHOLD);
+    /* When the profile count sum to loop entry header ratio is smaller than
+       the threshold (i.e. the loop entry is hot enough, the divisor is left
+       at its default of 1 so the unroll/peel factor is not reduced. When it
+       is bigger than the ratio, increase the divisor by the amount this ratio
+       is over the threshold, which will quickly reduce the unroll/peel
+       factor to zero as the loop's hotness reduces.  */
+    if (sum_to_header_ratio > hotness_ratio_threshold) {
+      *codesize_divisor = sum_to_header_ratio / hotness_ratio_threshold;
+      gcc_assert (*codesize_divisor >= 1);
+    }
+  }
+  else
+    /* For appliations that are at least twice the codesize limit, set
+       the divisor to a large value that will force the unroll factor to 0.  */
+    *codesize_divisor = INT_MAX;
 
-  if (flag_ripa_peel_size_limit)
+  if (flag_peel_codesize_limit)
     result_int |= LIMIT_PEEL;
 
-  if (flag_ripa_unroll_size_limit)
+  if (flag_unroll_codesize_limit)
     result_int |= LIMIT_UNROLL;
 
   result = (limit_type)result_int;
@@ -443,6 +483,7 @@  decide_unrolling_and_peeling (int flags)
   loop_iterator li;
   location_t locus;
   limit_type limit;
+  int codesize_divisor = 1;
 
   /* Scan the loops, inner ones first.  */
   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
@@ -484,22 +525,23 @@  decide_unrolling_and_peeling (int flags)
       loop->av_ninsns = average_num_loop_insns (loop);
 
       /* Determine whether to limit code size growth from unrolling and
-         peeling. This is currently enabled only under LIPO (dynamic IPA)
-         where we have a partial call graph. It is not applied to loops
+         peeling, using FDO profile summary information that gives an
+         estimated number of executed blocks. It is not applied to loops
          with constant trip counts, as it is easier to determine the
          profitability of unrolling and peeling such loops. */
-      limit = limit_code_size(loop);
+      limit = limit_code_size(loop, &codesize_divisor);
       if (limit != NO_LIMIT)
 	{
 	  if (dump_file)
             {
 	      fprintf (dump_file, ";; Due to large code size footprint estimate, limit ");
               if (limit == (LIMIT_UNROLL|LIMIT_PEEL))
-	        fprintf (dump_file, "unrolling and peeling\n");
+	        fprintf (dump_file, "unrolling and peeling");
               else if (limit == LIMIT_UNROLL)
-	        fprintf (dump_file, "unrolling\n");
+	        fprintf (dump_file, "unrolling");
               else
-	        fprintf (dump_file, "peeling\n");
+	        fprintf (dump_file, "peeling");
+	      fprintf (dump_file, " max insns by divisor %d\n", codesize_divisor);
             }
 	}
 
@@ -507,15 +549,15 @@  decide_unrolling_and_peeling (int flags)
 	 priority.  */
 
       decide_unroll_constant_iterations (loop, flags);
-      if (loop->lpt_decision.decision == LPT_NONE
-          && !(limit & LIMIT_UNROLL))
-	decide_unroll_runtime_iterations (loop, flags);
-      if (loop->lpt_decision.decision == LPT_NONE
-          && !(limit & LIMIT_UNROLL))
-	decide_unroll_stupid (loop, flags);
-      if (loop->lpt_decision.decision == LPT_NONE
-          && !(limit & LIMIT_PEEL))
-	decide_peel_simple (loop, flags);
+      if (loop->lpt_decision.decision == LPT_NONE)
+	decide_unroll_runtime_iterations (loop, flags, (limit & LIMIT_UNROLL)
+                                                       ? codesize_divisor : 1);
+      if (loop->lpt_decision.decision == LPT_NONE)
+	decide_unroll_stupid (loop, flags, (limit & LIMIT_UNROLL)
+                                           ? codesize_divisor : 1);
+      if (loop->lpt_decision.decision == LPT_NONE)
+	decide_peel_simple (loop, flags, (limit & LIMIT_PEEL)
+                                         ? codesize_divisor : 1);
 
       if (flag_opt_info >= OPT_INFO_MIN
           && loop->lpt_decision.decision != LPT_NONE)
@@ -1049,7 +1091,8 @@  unroll_loop_constant_iterations (struct loop *loop
 /* Decide whether to unroll LOOP iterating runtime computable number of times
    and how much.  */
 static void
-decide_unroll_runtime_iterations (struct loop *loop, int flags)
+decide_unroll_runtime_iterations (struct loop *loop, int flags,
+                                  int codesize_divisor)
 {
   unsigned nunroll, nunroll_by_av, nunroll_branches, i;
   struct niter_desc *desc;
@@ -1067,8 +1110,10 @@  static void
 
   /* nunroll = total number of copies of the original loop body in
      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
-  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
-  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
+  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / codesize_divisor
+            / loop->ninsns;
+  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS)
+                  / codesize_divisor / loop->av_ninsns;
   if (nunroll > nunroll_by_av)
     nunroll = nunroll_by_av;
   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
@@ -1455,7 +1500,7 @@  unroll_loop_runtime_iterations (struct loop *loop)
 
 /* Decide whether to simply peel LOOP and how much.  */
 static void
-decide_peel_simple (struct loop *loop, int flags)
+decide_peel_simple (struct loop *loop, int flags, int codesize_divisor)
 {
   unsigned npeel;
   struct niter_desc *desc;
@@ -1470,7 +1515,8 @@  static void
     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
 
   /* npeel = number of iterations to peel.  */
-  npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
+  npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / codesize_divisor
+          / loop->ninsns;
   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
 
@@ -1608,7 +1654,7 @@  peel_loop_simple (struct loop *loop)
 
 /* Decide whether to unroll LOOP stupidly and how much.  */
 static void
-decide_unroll_stupid (struct loop *loop, int flags)
+decide_unroll_stupid (struct loop *loop, int flags, int codesize_divisor)
 {
   unsigned nunroll, nunroll_by_av, i;
   struct niter_desc *desc;
@@ -1624,9 +1670,10 @@  static void
 
   /* nunroll = total number of copies of the original loop body in
      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
-  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
-  nunroll_by_av
-    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
+  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / codesize_divisor
+            / loop->ninsns;
+  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS)
+                  / codesize_divisor / loop->av_ninsns;
   if (nunroll > nunroll_by_av)
     nunroll = nunroll_by_av;
   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
Index: gcc/coverage.c
===================================================================
--- gcc/coverage.c	(revision 188119)
+++ gcc/coverage.c	(working copy)
@@ -468,6 +468,9 @@  read_counts_file (const char *da_file_name, unsign
 	  gcov_read_summary (&sum);
 	  for (ix = 0; ix != GCOV_COUNTERS_SUMMABLE; ix++)
 	    {
+	      summary.ctrs[ix].sum_cutoff_percent
+                = sum.ctrs[ix].sum_cutoff_percent;
+	      summary.ctrs[ix].num_to_cutoff += sum.ctrs[ix].num_to_cutoff;
 	      summary.ctrs[ix].runs += sum.ctrs[ix].runs;
 	      summary.ctrs[ix].sum_all += sum.ctrs[ix].sum_all;
 	      if (summary.ctrs[ix].run_max < sum.ctrs[ix].run_max)
Index: gcc/common.opt
===================================================================
--- gcc/common.opt	(revision 188119)
+++ gcc/common.opt	(working copy)
@@ -1139,14 +1139,6 @@  Common Report Var(flag_ripa_no_promote_always_inli
 Don't promote always inline static functions assuming they
 will be inlined and no copy is needed.
 
-fripa-peel-size-limit
-Common Report Var(flag_ripa_peel_size_limit) Init(1) Optimization
-Limit non-const non-FP loop peeling under dynamic IPA estimates of large code footprint
-
-fripa-unroll-size-limit
-Common Report Var(flag_ripa_unroll_size_limit) Init(1) Optimization
-Limit non-const non-FP loop unrolling under dynamic IPA estimates of large code footprint
-
 fripa-inc-path-sub=
 Common Joined RejectNegative Var(lipo_inc_path_pattern)
 Substitute substring in include paths with a new string to allow reuse profile data
@@ -1639,6 +1631,10 @@  fpcc-struct-return
 Common Report Var(flag_pcc_struct_return,1) Init(DEFAULT_PCC_STRUCT_RETURN)
 Return small aggregates in memory, not registers
 
+fpeel-codesize-limit
+Common Report Var(flag_peel_codesize_limit) Init(1) Optimization
+Limit non-const non-FP loop peeling under profile estimates of large code footprint
+
 fpeel-loops
 Common Report Var(flag_peel_loops) Optimization
 Perform loop peeling
@@ -2216,6 +2212,10 @@  funroll-all-loops
 Common Report Var(flag_unroll_all_loops) Optimization
 Perform loop unrolling for all loops
 
+funroll-codesize-limit
+Common Report Var(flag_unroll_codesize_limit) Init(1) Optimization
+Limit non-const non-FP loop unrolling under profile estimates of large code footprint
+
 ; Nonzero means that loop optimizer may assume that the induction variables
 ; that control loops do not overflow and that the loops with nontrivial
 ; exit condition are not infinite
Index: gcc/tree-optimize.c
===================================================================
--- gcc/tree-optimize.c	(revision 188119)
+++ gcc/tree-optimize.c	(working copy)
@@ -46,7 +46,6 @@  along with GCC; see the file COPYING3.  If not see
 #include "except.h"
 #include "plugin.h"
 #include "regset.h"	/* FIXME: For reg_obstack.  */
-#include "params.h"
 
 /* Gate: execute, or not, all of the non-trivial optimizations.  */
 
@@ -149,49 +148,6 @@  struct gimple_opt_pass pass_all_early_optimization
  }
 };
 
-int cgraph_codesize_estimate = -1;
-
-/* Estimate the code size from the dynamic IPA call graph. */
-static void
-compute_codesize_estimate(void)
-{
-  struct cgraph_node *node;
-  basic_block bb;
-  gimple_stmt_iterator bsi;
-  struct function *my_function;
-
-  if (!flag_dyn_ipa)
-    return;
-
-  if (cgraph_codesize_estimate >= 0)
-    return;
-  cgraph_codesize_estimate = 0;
-
-  for (node = cgraph_nodes; node; node = node->next)
-    {
-      if (node->count == 0)
-        continue;
-
-      if (!gimple_has_body_p(node->decl))
-        continue;
-
-      my_function = DECL_STRUCT_FUNCTION (node->decl);
-
-      FOR_EACH_BB_FN (bb, my_function)
-        {
-          if (bb->count < PARAM_VALUE (PARAM_CODESIZE_HOTNESS_THRESHOLD))
-            continue;
-          for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
-            {
-              gimple stmt = gsi_stmt (bsi);
-              cgraph_codesize_estimate += estimate_num_insns (stmt, &eni_size_weights);
-            }
-        }
-    }
-  if (dump_file)
-    fprintf (dump_file, "Code size estimate from cgraph: %d\n", cgraph_codesize_estimate);
-}
-
 /* Pass: cleanup the CFG just before expanding trees to RTL.
    This is just a round of label cleanups and case node grouping
    because after the tree optimizers have run such cleanups may
@@ -202,8 +158,6 @@  execute_cleanup_cfg_post_optimizing (void)
 {
   unsigned int todo = 0;
 
-  /* Estimate the code footprint for hot BBs before we enter RTL */
-  compute_codesize_estimate();
   if (cleanup_tree_cfg ())
     todo |= TODO_update_ssa;
   maybe_remove_unreachable_handlers ();
Index: gcc/params.def
===================================================================
--- gcc/params.def	(revision 188119)
+++ gcc/params.def	(working copy)
@@ -337,21 +337,23 @@  DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
 	 "max-completely-peel-loop-nest-depth",
 	 "The maximum depth of a loop nest we completely peel",
 	 8, 0, 0)
-/* The minimum profile count of basic blocks to look at when estimating
- * the code size footprint of the call graph in a dynamic IPA compile. */
-DEFPARAM(PARAM_CODESIZE_HOTNESS_THRESHOLD,
-	"codesize-hotness-threshold",
-	"Minimum profile count of basic blocks counted towards dynamic IPA "
-	"code size footprint estimate",
-	10000, 0, 0)
 /* The maximum code size estimate under which loop unrolling and peeling
- * is allowed in a dynamic IPA compile. This currently applies to loops with
- * non-constant iteration counts and no floating point computations. */
+ * is allowed in a profile feedback compile. This currently applies to loops
+ * with non-constant iteration counts and no floating point computations.  */
 DEFPARAM(PARAM_UNROLLPEEL_CODESIZE_THRESHOLD,
 	"unrollpeel-codesize-threshold",
-	"Maximum dynamic IPA code size footprint estimate for loop unrolling "
+	"Maximum profile-based code size footprint estimate for loop unrolling "
 	"and peeling",
-	10000, 0, 0)
+	15000, 0, 0)
+/* The maximum ratio of total profiled execution counts to loop entry block
+   count that must be exceeded to ignore most code size limits when unrolling
+   and peeling.  */
+DEFPARAM(PARAM_UNROLLPEEL_HOTNESS_THRESHOLD,
+        "unrollpeel-hotness-threshold",
+        "Maximum ratio of total profiled execution count to loop entry "
+        "block count under which most codesize limits for unrolling and "
+        "peeling will be ignored",
+        100, 1, 0)
 
 DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
 	"min-iter-unroll-with-branches",
Index: gcc/gcov-dump.c
===================================================================
--- gcc/gcov-dump.c	(revision 188119)
+++ gcc/gcov-dump.c	(working copy)
@@ -526,8 +526,11 @@  tag_summary (const char *filename ATTRIBUTE_UNUSED
     {
       printf ("\n");
       print_prefix (filename, 0, 0);
-      printf ("\t\tcounts=%u, runs=%u",
-	      summary.ctrs[ix].num, summary.ctrs[ix].runs);
+      printf ("\t\tcounts=%u (cutoff pct*10=%u, num to cutoff=%u), runs=%u",
+	      summary.ctrs[ix].num,
+	      summary.ctrs[ix].sum_cutoff_percent,
+	      summary.ctrs[ix].num_to_cutoff,
+	      summary.ctrs[ix].runs);
 
       printf (", sum_all=" HOST_WIDEST_INT_PRINT_DEC,
 	      (HOST_WIDEST_INT)summary.ctrs[ix].sum_all);