[google/main] New fdo summary-based icache sensitive unrolling (issue6282045)

Message ID 20120607064024.81E5460B4E@tjsboxrox.mtv.corp.google.com
State New

Commit Message

Teresa Johnson June 7, 2012, 6:40 a.m. UTC
Updated patch based on feedback. Original description:

This patch adds new program summary information to the gcov
profile files that indicates how many profiled counters account
for the majority of the program's execution time. This is used
to provide an indication of the overall code size of the program.
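
For reference, here is a rough sketch of how the cutoff is computed
(the actual implementation is gcov_compute_cutoff_values in the
libgcov.c hunk below; types and names here are simplified, and the
default 99.9% cutoff can be overridden via the
GCOV_HOTCODE_CUTOFF_TIMES_10 environment variable):

/* Simplified sketch only -- not the actual libgcov code.  */
#include <stdlib.h>

typedef long long gcov_type;

/* qsort comparator: sort counter values in descending order.  */
static int
cmp_desc (const void *pa, const void *pb)
{
  const gcov_type a = *(const gcov_type *) pa;
  const gcov_type b = *(const gcov_type *) pb;
  return (b > a) - (b < a);
}

/* Return how many of the hottest counters are needed to cover
   CUTOFF_PERC/1000 (e.g. 999 == 99.9%) of the total count sum.  */
static unsigned
hot_counter_cutoff (gcov_type *counts, unsigned n, unsigned cutoff_perc)
{
  gcov_type sum_all = 0, cum = 0, cum_cutoff;
  unsigned i, c;

  for (i = 0; i < n; i++)
    sum_all += counts[i];
  cum_cutoff = sum_all / 1000 * cutoff_perc;

  qsort (counts, n, sizeof (gcov_type), cmp_desc);
  for (c = 0; c < n && cum < cum_cutoff; c++)
    cum += counts[c];
  return c;
}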

The new profile summary information is then used to guide
codesize-based unroll and peel decisions, preventing those
optimizations from increasing code size too much when the
program may be sensitive to icache effects. Previously this
was done via code size estimates computed during tree
optimization from dynamic IPA (LIPO) partial call graphs, and
thus it only kicked in for LIPO builds with large module
groupings. The new method allows the heuristic to be applied
to regular FDO compiles as well, and it is applied globally to
all FDO-compiled objects.
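
Roughly, the per-loop limiting works as sketched below (the actual
logic is code_size_limit_factor in the loop-unroll.c hunk; the
parameter names here are illustrative, and the real code additionally
skips FP loops, which tend to benefit most from unrolling):

#include <limits.h>

typedef long long gcov_type;

/* Sketch of the codesize-based divisor that is applied to the
   max-unrolled/peeled-insns params when deciding how much to
   unroll or peel a loop.  */
static int
limit_factor_sketch (unsigned num_hot_counters, gcov_type sum_all,
                     gcov_type loop_header_count,
                     unsigned codesize_threshold,
                     gcov_type hotness_threshold)
{
  gcov_type ratio;

  /* Small hot-code footprint: do not limit unrolling/peeling.  */
  if (num_hot_counters <= codesize_threshold
      || sum_all == 0 || loop_header_count == 0)
    return 1;

  /* Very large footprint: effectively disable unrolling/peeling.  */
  if (num_hot_counters >= codesize_threshold * 2)
    return INT_MAX;

  /* In between: only very hot loops escape limiting; the divisor
     grows as the loop entry gets colder relative to the program.  */
  ratio = sum_all / loop_header_count;
  if (ratio > hotness_threshold)
    return (int) (ratio / hotness_threshold);
  return 1;
}

The returned divisor is applied to PARAM_MAX_UNROLLED_INSNS,
PARAM_MAX_AVERAGE_UNROLLED_INSNS and PARAM_MAX_PEELED_INSNS in the
decide_* routines, so a divisor of 1 leaves the current behavior
unchanged and INT_MAX effectively disables unrolling/peeling.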

The old LIPO-specific portion of the previous approach is
reverted.

Bootstrapped and tested on x86_64-unknown-linux-gnu. Ok for google
branches?

Thanks,
Teresa

2012-06-06  Teresa Johnson  <tejohnson@google.com>

	* libgcc/libgcov.c (sort_by_reverse_gcov_value): New function.
	(gcov_compute_cutoff_values): Ditto.
	(gcov_merge_gcda_file): Merge new summary information.
	(gcov_exit_init): Call gcov_compute_cutoff_values.
	* gcc/doc/invoke.texi: Rename -fripa-peel-size-limit
	and -fripa-unroll-size-limit to -fpeel-codesize-limit and
	-funroll-codesize-limit, respectively. Remove
	codesize-hotness-threshold param and add
	unrollpeel-hotness-threshold param.
	* gcc/gcov-io.c (gcov_write_summary): Write new summary info.
	(gcov_read_summary): Read new summary info.
	* gcc/gcov-io.h (GCOV_TAG_SUMMARY_LENGTH): Update for new summary info.
	(struct gcov_ctr_summary): Add new summary info: num_hot_counters.
	* gcc/loop-unroll.c (limit_code_size): Renamed to
	code_size_limit_factor.
	(code_size_limit_factor): Renamed from limit_code_size. Update to
	use new summary info and refine for very hot loops.
	(decide_unrolling_and_peeling): Remove call to limit_code_size.
	(decide_unroll_runtime_iterations): Call code_size_limit_factor
	to control the unroll factor.
	(decide_peel_simple, decide_unroll_stupid): Ditto.
	* gcc/coverage.c (read_counts_file): Propagate new summary info.
	* gcc/common.opt: Rename -fripa-peel-size-limit
	and -fripa-unroll-size-limit to -fpeel-codesize-limit and
	-funroll-codesize-limit, respectively.
	* gcc/tree-optimize.c (cgraph_codesize_estimate): Remove.
	(compute_codesize_estimate): Remove.
	(execute_cleanup_cfg_post_optimizing): Remove call to
	compute_codesize_estimate.
	* gcc/params.def (PARAM_CODESIZE_HOTNESS_THRESHOLD): Remove.
	(PARAM_UNROLLPEEL_CODESIZE_THRESHOLD): Update description and
	default value.
	(PARAM_UNROLLPEEL_HOTNESS_THRESHOLD): Add.
	* gcc/gcov-dump.c (tag_summary): Dump new summary info.


--
This patch is available for review at http://codereview.appspot.com/6282045

Patch

Index: libgcc/libgcov.c
===================================================================
--- libgcc/libgcov.c	(revision 188282)
+++ libgcc/libgcov.c	(working copy)
@@ -810,6 +810,118 @@  gcov_sort_topn_counter_arrays (const struct gcov_i
      }
 }
 
+/* Used by qsort to sort gcov values in descending order.  */
+
+static int
+sort_by_reverse_gcov_value (const void *pa, const void *pb)
+{
+  const gcov_type a = *(gcov_type const *)pa;
+  const gcov_type b = *(gcov_type const *)pb;
+
+  if (b > a)
+    return 1;
+  else if (b == a)
+    return 0;
+  else
+    return -1;
+}
+
+/* Determine the number of counters required to cover a given percentage
+   of the total sum of execution counts in the summary, and record that
+   number in SUM (as num_hot_counters).  */
+
+static void
+gcov_compute_cutoff_values (struct gcov_summary *sum)
+{
+  struct gcov_info *gi_ptr;
+  const struct gcov_fn_info *gfi_ptr;
+  const struct gcov_ctr_info *ci_ptr;
+  struct gcov_ctr_summary *cs_ptr;
+  unsigned t_ix, f_ix, i, ctr_info_ix, index;
+  gcov_unsigned_t c_num;
+  gcov_type *value_array;
+  gcov_type cum, cum_cutoff;
+  char *cutoff_str;
+  unsigned cutoff_perc;
+
+#define CUM_CUTOFF_PERCENT_TIMES_10 999
+  cutoff_str = getenv ("GCOV_HOTCODE_CUTOFF_TIMES_10");
+  if (cutoff_str && strlen (cutoff_str))
+    cutoff_perc = atoi (cutoff_str);
+  else
+    cutoff_perc = CUM_CUTOFF_PERCENT_TIMES_10;
+
+  /* This currently only applies to arc counters.  */
+  t_ix = GCOV_COUNTER_ARCS;
+
+  /* First check if there are any counts recorded for this counter.  */
+  cs_ptr = &(sum->ctrs[t_ix]);
+  if (!cs_ptr->num)
+    return;
+
+  /* Determine the cumulative counter value at the specified cutoff
+     percentage; the number of counters needed to reach it is recorded
+     below for use by gcov consumers.  Check for overflow when sum_all
+     is multiplied by cutoff_perc, and if so, do the divide first.  */
+  if (cs_ptr->sum_all * cutoff_perc < cs_ptr->sum_all)
+    /* Overflow, do the divide first.  */
+    cum_cutoff = cs_ptr->sum_all / 1000 * cutoff_perc;
+  else
+    /* Otherwise multiply first to get the correct value for small
+       values of sum_all.  */
+    cum_cutoff = (cs_ptr->sum_all * cutoff_perc) / 1000;
+
+  /* Next, walk through all the per-object structures and save each of
+     the count values in value_array.  */
+  index = 0;
+  value_array = (gcov_type *) malloc (sizeof (gcov_type) * cs_ptr->num);
+  for (gi_ptr = __gcov_list; gi_ptr; gi_ptr = gi_ptr->next)
+    {
+      if (!gi_ptr->merge[t_ix])
+        continue;
+
+      /* Find the appropriate index into the gcov_ctr_info array
+         for the counter we are currently working on based on the
+         existence of the merge function pointer for this object.  */
+      for (i = 0, ctr_info_ix = 0; i < t_ix; i++)
+        {
+          if (gi_ptr->merge[i])
+            ctr_info_ix++;
+        }
+      for (f_ix = 0; f_ix != gi_ptr->n_functions; f_ix++)
+        {
+          gfi_ptr = gi_ptr->functions[f_ix];
+
+          if (!gfi_ptr || gfi_ptr->key != gi_ptr)
+            continue;
+
+          ci_ptr = &gfi_ptr->ctrs[ctr_info_ix];
+          /* Sanity check that there are enough entries in value_array
+            for this function's counters. Gracefully handle the case when
+            there are not, in case something in the profile info is
+            corrupted.  */
+          c_num = ci_ptr->num;
+          if (index + c_num > cs_ptr->num)
+            c_num = cs_ptr->num - index;
+          /* Copy over this function's counter values.  */
+          memcpy (&value_array[index], ci_ptr->values,
+                  sizeof (gcov_type) * c_num);
+          index += c_num;
+        }
+    }
+
+  /* Sort all the counter values by descending value and finally
+     accumulate the values from hottest on down until reaching
+     the cutoff value computed earlier.  */
+  qsort (value_array, cs_ptr->num, sizeof (gcov_type),
+         sort_by_reverse_gcov_value);
+  for (cum = 0, c_num = 0; c_num < cs_ptr->num && cum < cum_cutoff; c_num++)
+    cum += value_array[c_num];
+  /* Record the number of counters required to reach the cutoff value.  */
+  cs_ptr->num_hot_counters = c_num;
+  free (value_array);
+}
+
 /* Compute object summary recored in gcov_info INFO. The result is
    stored in OBJ_SUM. Note that the caller is responsible for
    zeroing out OBJ_SUM, otherwise the summary is accumulated.  */
@@ -1009,7 +1121,11 @@  rewrite:;
         if (gi_ptr->merge[t_ix])
           {
             if (!cs_prg->runs++)
-              cs_prg->num = cs_tprg->num;
+              {
+                cs_prg->num = cs_tprg->num;
+                if (cs_tprg->num_hot_counters > cs_prg->num_hot_counters)
+                  cs_prg->num_hot_counters = cs_tprg->num_hot_counters;
+              }
             cs_prg->sum_all += cs_tprg->sum_all;
             if (cs_prg->run_max < cs_tprg->run_max)
               cs_prg->run_max = cs_tprg->run_max;
@@ -1175,6 +1291,7 @@  gcov_exit_init (void)
          is FDO/LIPO.  */
       dump_module_info |= gi_ptr->mod_info->is_primary;
     }
+  gcov_compute_cutoff_values (&this_program);
 
   gcov_alloc_filename ();
 
Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi	(revision 188282)
+++ gcc/doc/invoke.texi	(working copy)
@@ -392,7 +392,7 @@  Objective-C and Objective-C++ Dialects}.
 -fno-sched-interblock -fno-sched-spec -fno-signed-zeros @gol
 -fno-toplevel-reorder -fno-trapping-math -fno-zero-initialized-in-bss @gol
 -fomit-frame-pointer -foptimize-register-move -foptimize-sibling-calls @gol
--fpartial-inlining -fpeel-loops -fpredictive-commoning @gol
+-fpartial-inlining -fpeel-codesize-limit -fpeel-loops -fpredictive-commoning @gol
 -fprefetch-loop-arrays @gol
 -fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol
 -fprofile-generate=@var{path} -fprofile-generate-sampling @gol
@@ -404,8 +404,8 @@  Objective-C and Objective-C++ Dialects}.
 -freorder-blocks-and-partition -freorder-functions @gol
 -frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
 -fripa -fripa-disallow-asm-modules -fripa-disallow-opt-mismatch @gol
--fripa-inc-path-sub=@var{path_mapping} -fripa-no-promote-always-inline-func -fripa-verbose @gol
--fripa-peel-size-limit -fripa-unroll-size-limit -frounding-math @gol
+-fripa-inc-path-sub=@var{path_mapping} -fripa-no-promote-always-inline-func @gol
+-fripa-verbose -frounding-math @gol
 -fsched2-use-superblocks -fsched-pressure @gol
 -fsched-spec-load -fsched-spec-load-dangerous @gol
 -fsched-stalled-insns-dep[=@var{n}] -fsched-stalled-insns[=@var{n}] @gol
@@ -428,7 +428,7 @@  Objective-C and Objective-C++ Dialects}.
 -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-pta -ftree-reassoc @gol
 -ftree-sink -ftree-sra -ftree-switch-conversion -ftree-tail-merge @gol
 -ftree-ter -ftree-vect-loop-version -ftree-vectorize -ftree-vrp @gol
--funit-at-a-time -funroll-all-loops -funroll-loops @gol
+-funit-at-a-time -funroll-all-loops -funroll-loops -funroll-codesize-limit @gol
 -funsafe-loop-optimizations -funsafe-math-optimizations -funswitch-loops @gol
 -fvariable-expansion-in-unroller -fvect-cost-model -fvpt -fweb @gol
 -fwhole-program -fwpa -fuse-ld -fuse-linker-plugin @gol
@@ -8424,20 +8424,6 @@  Do not promote static functions with always inline
 Enable printing of verbose information about dynamic inter-procedural optimizations.
 This is used in conjunction with the @option{-fripa}.
 
-@item -fripa-peel-size-limit
-@opindex fripa-peel-size-limit
-Limit loop peeling of non-const non-FP loops in a LIPO compilation under estimates
-of a large code footprint. Enabled by default under @option{-fripa}. Code size
-estimation and thresholds are controlled by the @option{codesize-hotness-threshold}
-and @option{unrollpeel-codesize-threshold} parameters.
-
-@item -fripa-unroll-size-limit
-@opindex fripa-unroll-size-limit
-Limit loop unrolling of non-const non-FP loops in a LIPO compilation under estimates
-of a large code footprint. Enabled by default under @option{-fripa}. Code size
-estimation and thresholds are controlled by the @option{codesize-hotness-threshold}
-and @option{unrollpeel-codesize-threshold} parameters.
-
 @item -fcallgraph-profiles-sections
 @opindex fcallgraph-profiles-sections
 Emit call graph edge profile counts in .note.callgraph.text sections. This is
@@ -8774,6 +8760,14 @@  the loop is entered.  This usually makes programs
 @option{-funroll-all-loops} implies the same options as
 @option{-funroll-loops}.
 
+@item -funroll-codesize-limit
+@opindex funroll-codesize-limit
+Limit loop unrolling of non-const non-FP loops in a profile feedback compilation
+under estimates of a large code footprint. Enabled by default with
+@option{-fprofile-use}. Code size and execution weight thresholds are controlled
+by the @option{unrollpeel-codesize-threshold} and
+@option{unrollpeel-hotness-threshold} parameters.
+
 @item -fpeel-loops
 @opindex fpeel-loops
 Peels loops for which there is enough information that they do not
@@ -8782,6 +8776,14 @@  roll much (from profile feedback).  It also turns
 
 Enabled with @option{-fprofile-use}.
 
+@item -fpeel-codesize-limit
+@opindex fpeel-codesize-limit
+Limit loop peeling of non-const non-FP loops in a profile feedback compilation
+under estimates of a large code footprint. Enabled by default with
+@option{-fprofile-use}. Code size and execution weight thresholds are controlled
+by the @option{unrollpeel-codesize-threshold} and
+@option{unrollpeel-hotness-threshold} parameters.
+
 @item -fmove-loop-invariants
 @opindex fmove-loop-invariants
 Enables the loop invariant motion pass in the RTL loop optimizer.  Enabled
@@ -9168,13 +9170,15 @@  hot loops. Its default value is 16.
 @item max-completely-peel-loop-nest-depth
 The maximum depth of a loop nest suitable for complete peeling.
 
-@item codesize-hotness-threshold
-The minimum profile count of basic blocks to look at when estimating
-the code size footprint of the call graph in a LIPO compile.
-
 @item unrollpeel-codesize-threshold
-Maximum LIPO code size footprint estimate for loop unrolling and peeling.
+Maximum profile-based code size footprint estimate for loop unrolling and
+peeling.
 
+@item unrollpeel-hotness-threshold
+Maximum ratio of total execution count to loop entry block count under which
+most profile-based code size limits will be ignored for loop unrolling and
+peeling.
+
 @item max-unswitch-insns
 The maximum number of insns of an unswitched loop.
 
Index: gcc/gcov-io.c
===================================================================
--- gcc/gcov-io.c	(revision 188282)
+++ gcc/gcov-io.c	(working copy)
@@ -520,6 +520,7 @@  gcov_write_summary (gcov_unsigned_t tag, const str
   for (csum = summary->ctrs, ix = GCOV_COUNTERS_SUMMABLE; ix--; csum++)
     {
       gcov_write_unsigned (csum->num);
+      gcov_write_unsigned (csum->num_hot_counters);
       gcov_write_unsigned (csum->runs);
       gcov_write_counter (csum->sum_all);
       gcov_write_counter (csum->run_max);
@@ -637,6 +638,7 @@  gcov_read_summary (struct gcov_summary *summary)
   for (csum = summary->ctrs, ix = GCOV_COUNTERS_SUMMABLE; ix--; csum++)
     {
       csum->num = gcov_read_unsigned ();
+      csum->num_hot_counters = gcov_read_unsigned ();
       csum->runs = gcov_read_unsigned ();
       csum->sum_all = gcov_read_counter ();
       csum->run_max = gcov_read_counter ();
Index: gcc/gcov-io.h
===================================================================
--- gcc/gcov-io.h	(revision 188282)
+++ gcc/gcov-io.h	(working copy)
@@ -440,7 +440,7 @@  typedef HOST_WIDEST_INT gcov_type;
 #define GCOV_TAG_OBJECT_SUMMARY  ((gcov_unsigned_t)0xa1000000) /* Obsolete */
 #define GCOV_TAG_PROGRAM_SUMMARY ((gcov_unsigned_t)0xa3000000)
 #define GCOV_TAG_SUMMARY_LENGTH  \
-	(1 + GCOV_COUNTERS_SUMMABLE * (2 + 3 * 2))
+	(1 + GCOV_COUNTERS_SUMMABLE * (3 + 3 * 2))
 #define GCOV_TAG_MODULE_INFO ((gcov_unsigned_t)0xa4000000)
 #define GCOV_TAG_PMU_LOAD_LATENCY_INFO ((gcov_unsigned_t)0xa5000000)
 #define GCOV_TAG_PMU_LOAD_LATENCY_LENGTH(filename)  \
@@ -541,6 +541,8 @@  typedef HOST_WIDEST_INT gcov_type;
 struct gcov_ctr_summary
 {
   gcov_unsigned_t num;		/* number of counters.  */
+  gcov_unsigned_t num_hot_counters;/* number of counters to reach a given
+                                      percent of sum_all.  */
   gcov_unsigned_t runs;		/* number of program runs */
   gcov_type sum_all;		/* sum of all counters accumulated.  */
   gcov_type run_max;		/* maximum value on a single run.  */
Index: gcc/loop-unroll.c
===================================================================
--- gcc/loop-unroll.c	(revision 188282)
+++ gcc/loop-unroll.c	(working copy)
@@ -35,6 +35,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "recog.h"
 #include "target.h"
 #include "diagnostic.h"
+#include "gcov-io.h"
 
 /* This pass performs loop unrolling and peeling.  We only perform these
    optimizations on innermost loops (with single exception) because
@@ -181,48 +182,73 @@  report_unroll_peel(struct loop *loop, location_t l
             "" : iter_str);
 }
 
-/* This returns a bit vector */
-typedef enum {
-  NO_LIMIT = 0,
-  LIMIT_UNROLL = 0x1,
-  LIMIT_PEEL = 0x2,
-  LIMIT_BOTH = 0x3
-} limit_type;
+/* Determine whether and how much LOOP unrolling/peeling should be constrained
+   based on code footprint estimates. Returns the codesize-based factor to be
+   divided into the max instructions in an unrolled or peeled loop:
+   1) For size <= threshold, do not limit (by returning 1).
+   2) For threshold < size < 2*threshold, reduce maximum allowed peeled or
+      unrolled instructions according to loop hotness.
+   3) For size >= 2*threshold, disable unrolling/peeling (by returning
+      INT_MAX).  */
 
-extern int cgraph_codesize_estimate;
-
-/* Determine whether LOOP unrolling/peeling should be constrained based
-   on code footprint estimates. */
-static limit_type
-limit_code_size(struct loop *loop)
+static int
+code_size_limit_factor(struct loop *loop)
 {
   unsigned size_threshold;
-  limit_type result = NO_LIMIT;
-  int result_int = 0;
   struct niter_desc *desc = get_simple_loop_desc (loop);
+  gcov_type sum_to_header_ratio;
+  int hotness_ratio_threshold;
+  int limit_factor;
 
-  if (!flag_dyn_ipa)
-    return NO_LIMIT;
+  /* First check if the application has a large codesize footprint.
+     This is estimated from FDO profile summary information for the
+     program, where the num_hot_counters indicates the number of hottest
+     counters (blocks) that compose most of the execution time of
+     the program. A large value would indicate a large flat execution
+     profile where icache misses may be a concern.  */
+  size_threshold = PARAM_VALUE (PARAM_UNROLLPEEL_CODESIZE_THRESHOLD);
+  if (!profile_info
+      || profile_info->num_hot_counters <= size_threshold
+      || !profile_info->sum_all)
+    return 1;
 
-  gcc_assert (cgraph_codesize_estimate >= 0);
+  /* Next, exclude some loops where unrolling/peeling may be more
+     important to overall performance.  */
 
   /* Ignore FP loops, which are more likely to benefit heavily from
      unrolling. */
   if (desc->has_fp)
-    return NO_LIMIT;
+    return 1;
 
-  size_threshold = PARAM_VALUE (PARAM_UNROLLPEEL_CODESIZE_THRESHOLD);
-  if (cgraph_codesize_estimate <= (int)size_threshold)
-    return NO_LIMIT;
+  /* Next, compute the value of the codesize-based unroll factor divisor,
+     which for most loops will be set to a value that reduces or
+     eliminates unrolling/peeling.  */
+  if (profile_info->num_hot_counters < size_threshold * 2)
+    {
+      /* For applications that are less than twice the codesize limit, allow
+         limited unrolling for very hot loops.  */
+      sum_to_header_ratio = profile_info->sum_all / loop->header->count;
+      hotness_ratio_threshold = PARAM_VALUE (PARAM_UNROLLPEEL_HOTNESS_THRESHOLD);
+      /* When the ratio of the profile count sum to the loop entry header
+         count is smaller than the threshold (i.e. the loop entry is hot
+         enough), the divisor is set to 1 so the unroll/peel factor is not
+         reduced. When the ratio is bigger than the threshold, increase the
+         divisor by the amount this ratio exceeds the threshold, which will
+         quickly reduce the unroll/peel factor to zero as the loop cools.  */
+      if (sum_to_header_ratio > hotness_ratio_threshold)
+        {
+          limit_factor = sum_to_header_ratio / hotness_ratio_threshold;
+          gcc_assert (limit_factor >= 1);
+        }
+      else
+        limit_factor = 1;
+    }
+  else
+    /* For applications that are at least twice the codesize limit, set
+       the divisor to a large value that will force the unroll factor to 0.  */
+    limit_factor = INT_MAX;
 
-  if (flag_ripa_peel_size_limit)
-    result_int |= LIMIT_PEEL;
-
-  if (flag_ripa_unroll_size_limit)
-    result_int |= LIMIT_UNROLL;
-
-  result = (limit_type)result_int;
-  return result;
+  return limit_factor;
 }
 
 /* Compute the maximum number of times LOOP can be unrolled without exceeding
@@ -442,7 +468,6 @@  decide_unrolling_and_peeling (int flags)
   struct loop *loop;
   loop_iterator li;
   location_t locus;
-  limit_type limit;
 
   /* Scan the loops, inner ones first.  */
   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
@@ -483,38 +508,15 @@  decide_unrolling_and_peeling (int flags)
       loop->ninsns = num_loop_insns (loop);
       loop->av_ninsns = average_num_loop_insns (loop);
 
-      /* Determine whether to limit code size growth from unrolling and
-         peeling. This is currently enabled only under LIPO (dynamic IPA)
-         where we have a partial call graph. It is not applied to loops
-         with constant trip counts, as it is easier to determine the
-         profitability of unrolling and peeling such loops. */
-      limit = limit_code_size(loop);
-      if (limit != NO_LIMIT)
-	{
-	  if (dump_file)
-            {
-	      fprintf (dump_file, ";; Due to large code size footprint estimate, limit ");
-              if (limit == (LIMIT_UNROLL|LIMIT_PEEL))
-	        fprintf (dump_file, "unrolling and peeling\n");
-              else if (limit == LIMIT_UNROLL)
-	        fprintf (dump_file, "unrolling\n");
-              else
-	        fprintf (dump_file, "peeling\n");
-            }
-	}
-
       /* Try transformations one by one in decreasing order of
 	 priority.  */
 
       decide_unroll_constant_iterations (loop, flags);
-      if (loop->lpt_decision.decision == LPT_NONE
-          && !(limit & LIMIT_UNROLL))
+      if (loop->lpt_decision.decision == LPT_NONE)
 	decide_unroll_runtime_iterations (loop, flags);
-      if (loop->lpt_decision.decision == LPT_NONE
-          && !(limit & LIMIT_UNROLL))
+      if (loop->lpt_decision.decision == LPT_NONE)
 	decide_unroll_stupid (loop, flags);
-      if (loop->lpt_decision.decision == LPT_NONE
-          && !(limit & LIMIT_PEEL))
+      if (loop->lpt_decision.decision == LPT_NONE)
 	decide_peel_simple (loop, flags);
 
       if (flag_opt_info >= OPT_INFO_MIN
@@ -1053,6 +1055,7 @@  decide_unroll_runtime_iterations (struct loop *loo
 {
   unsigned nunroll, nunroll_by_av, nunroll_branches, i;
   struct niter_desc *desc;
+  int limit_factor = 1;
 
   if (!(flags & UAP_UNROLL))
     {
@@ -1065,10 +1068,26 @@  decide_unroll_runtime_iterations (struct loop *loo
 	     "\n;; Considering unrolling loop with runtime "
 	     "computable number of iterations\n");
 
+  if (flag_unroll_codesize_limit)
+    {
+      /* Determine whether to limit code size growth from unrolling,
+         using FDO profile summary information that gives an
+         estimated number of executed blocks.  */
+      limit_factor = code_size_limit_factor (loop);
+      if (dump_file && limit_factor > 1)
+	{
+          fprintf (dump_file,
+                   ";; Due to large code size footprint estimate, limit "
+                   "max unrolled insns by divisor %d\n", limit_factor);
+	}
+    }
+
   /* nunroll = total number of copies of the original loop body in
      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
-  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
-  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
+  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / limit_factor
+            / loop->ninsns;
+  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS)
+                  / limit_factor / loop->av_ninsns;
   if (nunroll > nunroll_by_av)
     nunroll = nunroll_by_av;
   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
@@ -1459,6 +1478,7 @@  decide_peel_simple (struct loop *loop, int flags)
 {
   unsigned npeel;
   struct niter_desc *desc;
+  int limit_factor = 1;
 
   if (!(flags & UAP_PEEL))
     {
@@ -1469,8 +1489,23 @@  decide_peel_simple (struct loop *loop, int flags)
   if (dump_file)
     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
 
+  if (flag_peel_codesize_limit)
+    {
+      /* Determine whether to limit code size growth from peeling,
+         using FDO profile summary information that gives an
+         estimated number of executed blocks.  */
+      limit_factor = code_size_limit_factor (loop);
+      if (dump_file && limit_factor > 1)
+	{
+          fprintf (dump_file,
+                   ";; Due to large code size footprint estimate, limit "
+                   "max peeled insns by divisor %d\n", limit_factor);
+	}
+    }
+
   /* npeel = number of iterations to peel.  */
-  npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
+  npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / limit_factor
+          / loop->ninsns;
   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
 
@@ -1612,6 +1647,7 @@  decide_unroll_stupid (struct loop *loop, int flags
 {
   unsigned nunroll, nunroll_by_av, i;
   struct niter_desc *desc;
+  int limit_factor = 1;
 
   if (!(flags & UAP_UNROLL_ALL))
     {
@@ -1622,11 +1658,26 @@  decide_unroll_stupid (struct loop *loop, int flags
   if (dump_file)
     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
 
+  if (flag_unroll_codesize_limit)
+    {
+      /* Determine whether to limit code size growth from unrolling,
+         using FDO profile summary information that gives an
+         estimated number of executed blocks.  */
+      limit_factor = code_size_limit_factor (loop);
+      if (dump_file && limit_factor > 1)
+	{
+          fprintf (dump_file,
+                   ";; Due to large code size footprint estimate, limit "
+                   "max unrolled insns by divisor %d\n", limit_factor);
+	}
+    }
+
   /* nunroll = total number of copies of the original loop body in
      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
-  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
-  nunroll_by_av
-    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
+  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / limit_factor
+            / loop->ninsns;
+  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS)
+                  / limit_factor / loop->av_ninsns;
   if (nunroll > nunroll_by_av)
     nunroll = nunroll_by_av;
   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
Index: gcc/coverage.c
===================================================================
--- gcc/coverage.c	(revision 188282)
+++ gcc/coverage.c	(working copy)
@@ -468,6 +468,8 @@  read_counts_file (const char *da_file_name, unsign
 	  gcov_read_summary (&sum);
 	  for (ix = 0; ix != GCOV_COUNTERS_SUMMABLE; ix++)
 	    {
+	      summary.ctrs[ix].num_hot_counters
+                  += sum.ctrs[ix].num_hot_counters;
 	      summary.ctrs[ix].runs += sum.ctrs[ix].runs;
 	      summary.ctrs[ix].sum_all += sum.ctrs[ix].sum_all;
 	      if (summary.ctrs[ix].run_max < sum.ctrs[ix].run_max)
Index: gcc/common.opt
===================================================================
--- gcc/common.opt	(revision 188282)
+++ gcc/common.opt	(working copy)
@@ -1139,14 +1139,6 @@  Common Report Var(flag_ripa_no_promote_always_inli
 Don't promote always inline static functions assuming they
 will be inlined and no copy is needed.
 
-fripa-peel-size-limit
-Common Report Var(flag_ripa_peel_size_limit) Init(1) Optimization
-Limit non-const non-FP loop peeling under dynamic IPA estimates of large code footprint
-
-fripa-unroll-size-limit
-Common Report Var(flag_ripa_unroll_size_limit) Init(1) Optimization
-Limit non-const non-FP loop unrolling under dynamic IPA estimates of large code footprint
-
 fripa-inc-path-sub=
 Common Joined RejectNegative Var(lipo_inc_path_pattern)
 Substitute substring in include paths with a new string to allow reuse profile data
@@ -1639,6 +1631,10 @@  fpcc-struct-return
 Common Report Var(flag_pcc_struct_return,1) Init(DEFAULT_PCC_STRUCT_RETURN)
 Return small aggregates in memory, not registers
 
+fpeel-codesize-limit
+Common Report Var(flag_peel_codesize_limit) Init(1) Optimization
+Limit non-const non-FP loop peeling under profile estimates of large code footprint
+
 fpeel-loops
 Common Report Var(flag_peel_loops) Optimization
 Perform loop peeling
@@ -2216,6 +2212,10 @@  funroll-all-loops
 Common Report Var(flag_unroll_all_loops) Optimization
 Perform loop unrolling for all loops
 
+funroll-codesize-limit
+Common Report Var(flag_unroll_codesize_limit) Init(1) Optimization
+Limit non-const non-FP loop unrolling under profile estimates of large code footprint
+
 ; Nonzero means that loop optimizer may assume that the induction variables
 ; that control loops do not overflow and that the loops with nontrivial
 ; exit condition are not infinite
Index: gcc/tree-optimize.c
===================================================================
--- gcc/tree-optimize.c	(revision 188282)
+++ gcc/tree-optimize.c	(working copy)
@@ -46,7 +46,6 @@  along with GCC; see the file COPYING3.  If not see
 #include "except.h"
 #include "plugin.h"
 #include "regset.h"	/* FIXME: For reg_obstack.  */
-#include "params.h"
 
 /* Gate: execute, or not, all of the non-trivial optimizations.  */
 
@@ -149,49 +148,6 @@  struct gimple_opt_pass pass_all_early_optimization
  }
 };
 
-int cgraph_codesize_estimate = -1;
-
-/* Estimate the code size from the dynamic IPA call graph. */
-static void
-compute_codesize_estimate(void)
-{
-  struct cgraph_node *node;
-  basic_block bb;
-  gimple_stmt_iterator bsi;
-  struct function *my_function;
-
-  if (!flag_dyn_ipa)
-    return;
-
-  if (cgraph_codesize_estimate >= 0)
-    return;
-  cgraph_codesize_estimate = 0;
-
-  for (node = cgraph_nodes; node; node = node->next)
-    {
-      if (node->count == 0)
-        continue;
-
-      if (!gimple_has_body_p(node->decl))
-        continue;
-
-      my_function = DECL_STRUCT_FUNCTION (node->decl);
-
-      FOR_EACH_BB_FN (bb, my_function)
-        {
-          if (bb->count < PARAM_VALUE (PARAM_CODESIZE_HOTNESS_THRESHOLD))
-            continue;
-          for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
-            {
-              gimple stmt = gsi_stmt (bsi);
-              cgraph_codesize_estimate += estimate_num_insns (stmt, &eni_size_weights);
-            }
-        }
-    }
-  if (dump_file)
-    fprintf (dump_file, "Code size estimate from cgraph: %d\n", cgraph_codesize_estimate);
-}
-
 /* Pass: cleanup the CFG just before expanding trees to RTL.
    This is just a round of label cleanups and case node grouping
    because after the tree optimizers have run such cleanups may
@@ -202,8 +158,6 @@  execute_cleanup_cfg_post_optimizing (void)
 {
   unsigned int todo = 0;
 
-  /* Estimate the code footprint for hot BBs before we enter RTL */
-  compute_codesize_estimate();
   if (cleanup_tree_cfg ())
     todo |= TODO_update_ssa;
   maybe_remove_unreachable_handlers ();
Index: gcc/params.def
===================================================================
--- gcc/params.def	(revision 188282)
+++ gcc/params.def	(working copy)
@@ -337,21 +337,23 @@  DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
 	 "max-completely-peel-loop-nest-depth",
 	 "The maximum depth of a loop nest we completely peel",
 	 8, 0, 0)
-/* The minimum profile count of basic blocks to look at when estimating
- * the code size footprint of the call graph in a dynamic IPA compile. */
-DEFPARAM(PARAM_CODESIZE_HOTNESS_THRESHOLD,
-	"codesize-hotness-threshold",
-	"Minimum profile count of basic blocks counted towards dynamic IPA "
-	"code size footprint estimate",
-	10000, 0, 0)
 /* The maximum code size estimate under which loop unrolling and peeling
- * is allowed in a dynamic IPA compile. This currently applies to loops with
- * non-constant iteration counts and no floating point computations. */
+ * is allowed in a profile feedback compile. This currently applies to loops
+ * with non-constant iteration counts and no floating point computations.  */
 DEFPARAM(PARAM_UNROLLPEEL_CODESIZE_THRESHOLD,
 	"unrollpeel-codesize-threshold",
-	"Maximum dynamic IPA code size footprint estimate for loop unrolling "
+	"Maximum profile-based code size footprint estimate for loop unrolling "
 	"and peeling",
-	10000, 0, 0)
+	15000, 0, 0)
+/* The maximum ratio of total profiled execution count to loop entry block
+   count under which most code size limits will be ignored when unrolling
+   and peeling.  */
+DEFPARAM(PARAM_UNROLLPEEL_HOTNESS_THRESHOLD,
+        "unrollpeel-hotness-threshold",
+        "Maximum ratio of total profiled execution count to loop entry "
+        "block count under which most codesize limits for unrolling and "
+        "peeling will be ignored",
+        100, 1, 0)
 
 DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
 	"min-iter-unroll-with-branches",
Index: gcc/gcov-dump.c
===================================================================
--- gcc/gcov-dump.c	(revision 188282)
+++ gcc/gcov-dump.c	(working copy)
@@ -526,8 +526,10 @@  tag_summary (const char *filename ATTRIBUTE_UNUSED
     {
       printf ("\n");
       print_prefix (filename, 0, 0);
-      printf ("\t\tcounts=%u, runs=%u",
-	      summary.ctrs[ix].num, summary.ctrs[ix].runs);
+      printf ("\t\tcounts=%u (num hot counts=%u), runs=%u",
+	      summary.ctrs[ix].num,
+	      summary.ctrs[ix].num_hot_counters,
+	      summary.ctrs[ix].runs);
 
       printf (", sum_all=" HOST_WIDEST_INT_PRINT_DEC,
 	      (HOST_WIDEST_INT)summary.ctrs[ix].sum_all);