| Message ID | 20120424212648.E6B696136C@tjsboxrox.mtv.corp.google.com |
|---|---|
| State | New |
| Headers | show |
On Tue, Apr 24, 2012 at 2:26 PM, Teresa Johnson <tejohnson@google.com> wrote: > This patch adds heuristics to limit unrolling in loops with branches that may increase > branch mispredictions. It affects loops that are not frequently iterated, and that are > nested within a hot region of code that already contains many branch instructions. > > Performance tested with both internal benchmarks and with SPEC 2000/2006 on a variety > of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD Opteron systems. > This improves performance of an internal search indexing benchmark by close to 2% on > all the tested Intel platforms. It also consistently improves 445.gobmk (with FDO feedback > where unrolling kicks in) by close to 1% on AMD Opteron. Other performance effects are > neutral. > > Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk? > > Thanks, > Teresa > > 2012-04-24 Teresa Johnson <tejohnson@google.com> > > * loop-unroll.c (loop_has_call): New function. > (loop_has_FP_comp): Ditto. > (compute_weighted_branches): Ditto. > (max_unroll_with_branches): Ditto. > (decide_unroll_constant_iterations): Add heuristic to avoid > increasing branch mispredicts when unrolling. > (decide_unroll_runtime_iterations): Ditto. > * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param. > (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto. > > Index: loop-unroll.c > =================================================================== > --- loop-unroll.c (revision 186783) > +++ loop-unroll.c (working copy) > @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc > basic_block); > static rtx get_expansion (struct var_to_expand *); > > +/* Determine whether LOOP contains call. 
*/ > +static bool > +loop_has_call(struct loop *loop) > +{ > + basic_block *body, bb; > + unsigned i; > + rtx insn; > + > + body = get_loop_body (loop); > + for (i = 0; i < loop->num_nodes; i++) > + { > + bb = body[i]; > + > + FOR_BB_INSNS (bb, insn) > + { > + if (CALL_P (insn)) > + { > + free (body); > + return true; > + } > + } > + } > + free (body); > + return false; > +} > + > +/* Determine whether LOOP contains floating-point computation. */ > +static bool > +loop_has_FP_comp(struct loop *loop) > +{ > + rtx set, dest; > + basic_block *body, bb; > + unsigned i; > + rtx insn; > + > + body = get_loop_body (loop); > + for (i = 0; i < loop->num_nodes; i++) > + { > + bb = body[i]; > + > + FOR_BB_INSNS (bb, insn) > + { > + set = single_set (insn); > + if (!set) > + continue; > + > + dest = SET_DEST (set); > + if (FLOAT_MODE_P (GET_MODE (dest))) > + { > + free (body); > + return true; > + } > + } > + } > + free (body); > + return false; > +} > + > +/* Compute the number of branches in LOOP, weighted by execution counts. */ > +static float > +compute_weighted_branches(struct loop *loop) > +{ > + int header_count = loop->header->count; > + unsigned i; > + float n; > + basic_block * body; > + > + /* If no profile feedback data exists, don't limit unrolling */ > + if (header_count == 0) > + return 0.0; > + > + gcc_assert (loop->latch != EXIT_BLOCK_PTR); > + > + body = get_loop_body (loop); > + n = 0.0; > + for (i = 0; i < loop->num_nodes; i++) > + { > + if (EDGE_COUNT (body[i]->succs) >= 2) > + { > + /* If this block is executed less frequently than the header (loop > + entry), then it is weighted based on the ratio of times it is > + executed compared to the header. */ > + if (body[i]->count < header_count) > + n += ((float)body[i]->count)/header_count; Please don't introduce more floating point usage into the compiler since it could change between different hosts (sse vs x87 for an example). 
Maybe use a fixed point multiply of 1000 (note use a macro for this special value though) like what is used in the rest of the predict code. Thanks, Andrew Pinski > + > + /* When it is executed more frequently than the header (i.e. it is > + in a nested inner loop), simply weight the branch at 1.0. */ > + else > + n += 1.0; > + } > + } > + free (body); > + > + return n; > +} > + > +/* Compute the maximum number of times LOOP can be unrolled without exceeding > + a branch budget, which can increase branch mispredictions. The number of > + branches is computed by weighting each branch with its expected execution > + probability through the loop based on profile data. If no profile feedback > + data exists, simply return the current NUNROLL factor. */ > +static unsigned > +max_unroll_with_branches(struct loop *loop, unsigned nunroll) > +{ > + struct loop *outer; > + struct niter_desc *outer_desc; > + int outer_niters = 1; > + float weighted_outer_branches = 0.0; > + float weighted_num_branches = compute_weighted_branches (loop); > + > + /* If there was no profile feedback data, weighted_num_branches will be 0.0 > + and we won't limit unrolling. If the weighted_num_branches is at most 1.0, > + also don't limit unrolling as the back-edge branch will not be duplicated. */ > + if (weighted_num_branches <= 1.0) > + return nunroll; > + > + /* Walk up the loop tree until we find a hot outer loop in which the current > + loop is nested. At that point we will compute the number of times the > + current loop can be unrolled based on the number of branches in the hot > + outer loop. */ > + outer = loop_outer(loop); > + /* The loop structure contains a fake outermost loop, so this should always > + be non-NULL for our current loop. */ > + gcc_assert (outer); > + /* Detect if this is the fake outermost loop (at which point we are done) > + by checking its outer loop. 
*/ > + while (loop_outer(outer)) > + { > + outer_desc = get_simple_loop_desc (outer); > + > + if (outer_desc->const_iter) > + outer_niters *= outer_desc->niter; > + else if (outer->header->count) > + outer_niters *= expected_loop_iterations (outer); > + > + weighted_outer_branches = compute_weighted_branches (outer); > + > + /* Should have been checked by caller. */ > + gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1); > + > + /* If the outer loop has enough iterations to be considered hot, then > + we can stop our upwards loop tree traversal and examine the current > + outer loop. */ > + if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) > + { > + /* Assume that any call will cause the branch budget to be exceeded, > + and that we can't unroll the current loop without increasing > + mispredicts. */ > + if (loop_has_call(outer)) > + return 0; > + > + /* Otherwise, compute the maximum number of times current loop can be > + unrolled without exceeding our branch budget. First we subtract > + off the outer loop's weighted branch count from the budget. Note > + that this includes the branches in the current loop. This yields > + the number of branches left in the budget for the unrolled copies. > + We divide this by the number of branches in the current loop that > + must be duplicated when we unroll, which is the total weighted > + number of branches minus the back-edge branch. This yields the > + number of new loop body copies that can be created by unrolling > + without exceeding the budget, to which we add 1 to get the unroll > + factor. 
*/ > + return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) - > + weighted_outer_branches)/(weighted_num_branches - 1) + 1; > + } > + outer = loop_outer(outer); > + } > + > + /* The current loop is not enclosed by a hot enough outer loop in this > + procedure, since the hot outer loop is inter-procedural, assume that > + it already contains a significant number of branches, so don't unroll. */ > + return 0; > +} > + > /* Unroll and/or peel (depending on FLAGS) LOOPS. */ > void > unroll_and_peel_loops (int flags) > @@ -522,6 +696,7 @@ static void > decide_unroll_constant_iterations (struct loop *loop, int flags) > { > unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i; > + unsigned nunroll_branches; > struct niter_desc *desc; > > if (!(flags & UAP_UNROLL)) > @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo > return; > } > > + /* Be careful when unrolling loops with branches inside -- it can increase > + the number of mispredicts. Ignore loops with FP computation as these > + tend to benefit much more consistently from unrolling. */ > + if (num_loop_branches (loop) > 1 > + && loop_has_FP_comp(loop) > + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 > + && desc->niter < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) > + { > + nunroll_branches = max_unroll_with_branches(loop, nunroll); > + if (nunroll > nunroll_branches) > + nunroll = nunroll_branches; > + if (nunroll <= 1) > + { > + if (dump_file) > + fprintf (dump_file, ";; Not unrolling, contains branches\n"); > + return; > + } > + } > + > /* Check whether the loop rolls enough to consider. 
*/ > if (desc->niter < 2 * nunroll) > { > @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop > static void > decide_unroll_runtime_iterations (struct loop *loop, int flags) > { > - unsigned nunroll, nunroll_by_av, i; > + unsigned nunroll, nunroll_by_av, nunroll_branches, i; > struct niter_desc *desc; > > if (!(flags & UAP_UNROLL)) > @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo > return; > } > > + /* Be careful when unrolling loops with branches inside -- it can increase > + the number of mispredicts. Ignore loops with FP computation as these > + tend to benefit much more consistently from unrolling. */ > + if (num_loop_branches (loop) > 1 > + && loop_has_FP_comp(loop) > + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 > + && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) > + { > + nunroll_branches = max_unroll_with_branches(loop, nunroll); > + if (nunroll > nunroll_branches) > + nunroll = nunroll_branches; > + if (nunroll <= 1) > + { > + if (dump_file) > + fprintf (dump_file, ";; Not unrolling, contains branches\n"); > + return; > + } > + } > + > /* If we have profile feedback, check whether the loop rolls. 
*/ > if ((loop->header->count > && expected_loop_iterations (loop) < 2 * nunroll) > Index: params.def > =================================================================== > --- params.def (revision 186783) > +++ params.def (working copy) > @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS, > "The maximum depth of a loop nest we completely peel", > 8, 0, 0) > > +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES, > + "min-iter-unroll-with-branches", > + "Minimum iteration count to ignore branch effects when unrolling", > + 50, 0, 0) > + > +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET, > + "unroll-outer-loop-branch-budget", > + "Maximum number of branches allowed in hot outer loop region after unroll", > + 25, 0, 0) > + > /* The maximum number of insns of an unswitched loop. */ > DEFPARAM(PARAM_MAX_UNSWITCH_INSNS, > "max-unswitch-insns", > > -- > This patch is available for review at http://codereview.appspot.com/6099055
Resending my response in plain text so it will go through to gcc-patches... On Tue, Apr 24, 2012 at 2:36 PM, Teresa Johnson <tejohnson@google.com> wrote: > > > > On Tue, Apr 24, 2012 at 2:30 PM, Andrew Pinski <pinskia@gmail.com> wrote: >> >> On Tue, Apr 24, 2012 at 2:26 PM, Teresa Johnson <tejohnson@google.com> wrote: >> > This patch adds heuristics to limit unrolling in loops with branches that may increase >> > branch mispredictions. It affects loops that are not frequently iterated, and that are >> > nested within a hot region of code that already contains many branch instructions. >> > >> > Performance tested with both internal benchmarks and with SPEC 2000/2006 on a variety >> > of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD Opteron systems. >> > This improves performance of an internal search indexing benchmark by close to 2% on >> > all the tested Intel platforms. It also consistently improves 445.gobmk (with FDO feedback >> > where unrolling kicks in) by close to 1% on AMD Opteron. Other performance effects are >> > neutral. >> > >> > Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk? >> > >> > Thanks, >> > Teresa >> > >> > 2012-04-24 Teresa Johnson <tejohnson@google.com> >> > >> > * loop-unroll.c (loop_has_call): New function. >> > (loop_has_FP_comp): Ditto. >> > (compute_weighted_branches): Ditto. >> > (max_unroll_with_branches): Ditto. >> > (decide_unroll_constant_iterations): Add heuristic to avoid >> > increasing branch mispredicts when unrolling. >> > (decide_unroll_runtime_iterations): Ditto. >> > * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param. >> > (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto. 
>> > >> > Index: loop-unroll.c >> > =================================================================== >> > --- loop-unroll.c (revision 186783) >> > +++ loop-unroll.c (working copy) >> > @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc >> > basic_block); >> > static rtx get_expansion (struct var_to_expand *); >> > >> > +/* Determine whether LOOP contains call. */ >> > +static bool >> > +loop_has_call(struct loop *loop) >> > +{ >> > + basic_block *body, bb; >> > + unsigned i; >> > + rtx insn; >> > + >> > + body = get_loop_body (loop); >> > + for (i = 0; i < loop->num_nodes; i++) >> > + { >> > + bb = body[i]; >> > + >> > + FOR_BB_INSNS (bb, insn) >> > + { >> > + if (CALL_P (insn)) >> > + { >> > + free (body); >> > + return true; >> > + } >> > + } >> > + } >> > + free (body); >> > + return false; >> > +} >> > + >> > +/* Determine whether LOOP contains floating-point computation. */ >> > +static bool >> > +loop_has_FP_comp(struct loop *loop) >> > +{ >> > + rtx set, dest; >> > + basic_block *body, bb; >> > + unsigned i; >> > + rtx insn; >> > + >> > + body = get_loop_body (loop); >> > + for (i = 0; i < loop->num_nodes; i++) >> > + { >> > + bb = body[i]; >> > + >> > + FOR_BB_INSNS (bb, insn) >> > + { >> > + set = single_set (insn); >> > + if (!set) >> > + continue; >> > + >> > + dest = SET_DEST (set); >> > + if (FLOAT_MODE_P (GET_MODE (dest))) >> > + { >> > + free (body); >> > + return true; >> > + } >> > + } >> > + } >> > + free (body); >> > + return false; >> > +} >> > + >> > +/* Compute the number of branches in LOOP, weighted by execution counts. 
*/ >> > +static float >> > +compute_weighted_branches(struct loop *loop) >> > +{ >> > + int header_count = loop->header->count; >> > + unsigned i; >> > + float n; >> > + basic_block * body; >> > + >> > + /* If no profile feedback data exists, don't limit unrolling */ >> > + if (header_count == 0) >> > + return 0.0; >> > + >> > + gcc_assert (loop->latch != EXIT_BLOCK_PTR); >> > + >> > + body = get_loop_body (loop); >> > + n = 0.0; >> > + for (i = 0; i < loop->num_nodes; i++) >> > + { >> > + if (EDGE_COUNT (body[i]->succs) >= 2) >> > + { >> > + /* If this block is executed less frequently than the header (loop >> > + entry), then it is weighted based on the ratio of times it is >> > + executed compared to the header. */ >> > + if (body[i]->count < header_count) >> > + n += ((float)body[i]->count)/header_count; >> >> Please don't introduce more floating point usage into the compiler >> since it could change between different hosts (sse vs x87 for an >> example). >> Maybe use a fixed point multiply of 1000 (note use a macro for this >> special value though) like what is used in the rest of the predict >> code. > > > Ok, got it. I will address this in the next version of the patch. > > Thanks, > Teresa > >> >> >> >> Thanks, >> Andrew Pinski >> >> >> > + >> > + /* When it is executed more frequently than the header (i.e. it is >> > + in a nested inner loop), simply weight the branch at 1.0. */ >> > + else >> > + n += 1.0; >> > + } >> > + } >> > + free (body); >> > + >> > + return n; >> > +} >> > + >> > +/* Compute the maximum number of times LOOP can be unrolled without exceeding >> > + a branch budget, which can increase branch mispredictions. The number of >> > + branches is computed by weighting each branch with its expected execution >> > + probability through the loop based on profile data. If no profile feedback >> > + data exists, simply return the current NUNROLL factor. 
*/ >> > +static unsigned >> > +max_unroll_with_branches(struct loop *loop, unsigned nunroll) >> > +{ >> > + struct loop *outer; >> > + struct niter_desc *outer_desc; >> > + int outer_niters = 1; >> > + float weighted_outer_branches = 0.0; >> > + float weighted_num_branches = compute_weighted_branches (loop); >> > + >> > + /* If there was no profile feedback data, weighted_num_branches will be 0.0 >> > + and we won't limit unrolling. If the weighted_num_branches is at most 1.0, >> > + also don't limit unrolling as the back-edge branch will not be duplicated. */ >> > + if (weighted_num_branches <= 1.0) >> > + return nunroll; >> > + >> > + /* Walk up the loop tree until we find a hot outer loop in which the current >> > + loop is nested. At that point we will compute the number of times the >> > + current loop can be unrolled based on the number of branches in the hot >> > + outer loop. */ >> > + outer = loop_outer(loop); >> > + /* The loop structure contains a fake outermost loop, so this should always >> > + be non-NULL for our current loop. */ >> > + gcc_assert (outer); >> > + /* Detect if this is the fake outermost loop (at which point we are done) >> > + by checking its outer loop. */ >> > + while (loop_outer(outer)) >> > + { >> > + outer_desc = get_simple_loop_desc (outer); >> > + >> > + if (outer_desc->const_iter) >> > + outer_niters *= outer_desc->niter; >> > + else if (outer->header->count) >> > + outer_niters *= expected_loop_iterations (outer); >> > + >> > + weighted_outer_branches = compute_weighted_branches (outer); >> > + >> > + /* Should have been checked by caller. */ >> > + gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1); >> > + >> > + /* If the outer loop has enough iterations to be considered hot, then >> > + we can stop our upwards loop tree traversal and examine the current >> > + outer loop. 
*/ >> > + if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) >> > + { >> > + /* Assume that any call will cause the branch budget to be exceeded, >> > + and that we can't unroll the current loop without increasing >> > + mispredicts. */ >> > + if (loop_has_call(outer)) >> > + return 0; >> > + >> > + /* Otherwise, compute the maximum number of times current loop can be >> > + unrolled without exceeding our branch budget. First we subtract >> > + off the outer loop's weighted branch count from the budget. Note >> > + that this includes the branches in the current loop. This yields >> > + the number of branches left in the budget for the unrolled copies. >> > + We divide this by the number of branches in the current loop that >> > + must be duplicated when we unroll, which is the total weighted >> > + number of branches minus the back-edge branch. This yields the >> > + number of new loop body copies that can be created by unrolling >> > + without exceeding the budget, to which we add 1 to get the unroll >> > + factor. */ >> > + return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) - >> > + weighted_outer_branches)/(weighted_num_branches - 1) + 1; >> > + } >> > + outer = loop_outer(outer); >> > + } >> > + >> > + /* The current loop is not enclosed by a hot enough outer loop in this >> > + procedure, since the hot outer loop is inter-procedural, assume that >> > + it already contains a significant number of branches, so don't unroll. */ >> > + return 0; >> > +} >> > + >> > /* Unroll and/or peel (depending on FLAGS) LOOPS. 
*/ >> > void >> > unroll_and_peel_loops (int flags) >> > @@ -522,6 +696,7 @@ static void >> > decide_unroll_constant_iterations (struct loop *loop, int flags) >> > { >> > unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i; >> > + unsigned nunroll_branches; >> > struct niter_desc *desc; >> > >> > if (!(flags & UAP_UNROLL)) >> > @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo >> > return; >> > } >> > >> > + /* Be careful when unrolling loops with branches inside -- it can increase >> > + the number of mispredicts. Ignore loops with FP computation as these >> > + tend to benefit much more consistently from unrolling. */ >> > + if (num_loop_branches (loop) > 1 >> > + && loop_has_FP_comp(loop) >> > + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 >> > + && desc->niter < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) >> > + { >> > + nunroll_branches = max_unroll_with_branches(loop, nunroll); >> > + if (nunroll > nunroll_branches) >> > + nunroll = nunroll_branches; >> > + if (nunroll <= 1) >> > + { >> > + if (dump_file) >> > + fprintf (dump_file, ";; Not unrolling, contains branches\n"); >> > + return; >> > + } >> > + } >> > + >> > /* Check whether the loop rolls enough to consider. */ >> > if (desc->niter < 2 * nunroll) >> > { >> > @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop >> > static void >> > decide_unroll_runtime_iterations (struct loop *loop, int flags) >> > { >> > - unsigned nunroll, nunroll_by_av, i; >> > + unsigned nunroll, nunroll_by_av, nunroll_branches, i; >> > struct niter_desc *desc; >> > >> > if (!(flags & UAP_UNROLL)) >> > @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo >> > return; >> > } >> > >> > + /* Be careful when unrolling loops with branches inside -- it can increase >> > + the number of mispredicts. Ignore loops with FP computation as these >> > + tend to benefit much more consistently from unrolling. 
*/ >> > + if (num_loop_branches (loop) > 1 >> > + && loop_has_FP_comp(loop) >> > + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 >> > + && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) >> > + { >> > + nunroll_branches = max_unroll_with_branches(loop, nunroll); >> > + if (nunroll > nunroll_branches) >> > + nunroll = nunroll_branches; >> > + if (nunroll <= 1) >> > + { >> > + if (dump_file) >> > + fprintf (dump_file, ";; Not unrolling, contains branches\n"); >> > + return; >> > + } >> > + } >> > + >> > /* If we have profile feedback, check whether the loop rolls. */ >> > if ((loop->header->count >> > && expected_loop_iterations (loop) < 2 * nunroll) >> > Index: params.def >> > =================================================================== >> > --- params.def (revision 186783) >> > +++ params.def (working copy) >> > @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS, >> > "The maximum depth of a loop nest we completely peel", >> > 8, 0, 0) >> > >> > +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES, >> > + "min-iter-unroll-with-branches", >> > + "Minimum iteration count to ignore branch effects when unrolling", >> > + 50, 0, 0) >> > + >> > +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET, >> > + "unroll-outer-loop-branch-budget", >> > + "Maximum number of branches allowed in hot outer loop region after unroll", >> > + 25, 0, 0) >> > + >> > /* The maximum number of insns of an unswitched loop. */ >> > DEFPARAM(PARAM_MAX_UNSWITCH_INSNS, >> > "max-unswitch-insns", >> > >> > -- >> > This patch is available for review at http://codereview.appspot.com/6099055 > > > > > -- > Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413 > -- Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
On Tue, Apr 24, 2012 at 11:26 PM, Teresa Johnson <tejohnson@google.com> wrote: > * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param. > (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto. You should add documentation for these new PARAMs to doc/invoke.texi. I don't really like these new PARAMs: All other loop PARAMs are based on the number of insns in a loop, or the maximum number of times a transformation is applied. Your new PARAM_MIN_ITER_UNROLL_WITH_BRANCHES is completely different, because it is a number of iterations. This makes the PARAM value feel even more arbitrary than all the other PARAMs to some extend already do... (The only other PARAM like that is PARAM_ALIGN_LOOP_ITERATIONS, and its default value also looks quite arbitrary...) > Index: loop-unroll.c > =================================================================== > --- loop-unroll.c (revision 186783) > +++ loop-unroll.c (working copy) > @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc > basic_block); > static rtx get_expansion (struct var_to_expand *); > > +/* Determine whether LOOP contains call. */ > +static bool > +loop_has_call(struct loop *loop) > +{ > + basic_block *body, bb; > + unsigned i; > + rtx insn; > + > + body = get_loop_body (loop); > + for (i = 0; i < loop->num_nodes; i++) > + { > + bb = body[i]; > + > + FOR_BB_INSNS (bb, insn) > + { > + if (CALL_P (insn)) > + { > + free (body); > + return true; > + } > + } > + } > + free (body); > + return false; > +} > + > +/* Determine whether LOOP contains floating-point computation. 
*/ > +static bool > +loop_has_FP_comp(struct loop *loop) > +{ > + rtx set, dest; > + basic_block *body, bb; > + unsigned i; > + rtx insn; > + > + body = get_loop_body (loop); > + for (i = 0; i < loop->num_nodes; i++) > + { > + bb = body[i]; > + > + FOR_BB_INSNS (bb, insn) > + { > + set = single_set (insn); > + if (!set) > + continue; > + > + dest = SET_DEST (set); > + if (FLOAT_MODE_P (GET_MODE (dest))) > + { > + free (body); > + return true; So you only detect single-set FP operations where some insns stores in a float mode. It wouldn't be very difficult to just walk over all sets and look for float modes. This is also necessary e.g. for x87 sincos, as well as various insns on other machines. Your comments say you don't want to apply the new heuristic to loops containing FP operations because these loops usually benefit more from unrolling. Therefore, you should IMHO look at non-single_set() insns also here, to avoid applying the heuristics to loops containing non-single_set() FP insns. > + } > + } > + } > + free (body); > + return false; > +} Nit: You are calling loop_has_call and loop_has_FP_comp() twice on each loop (first for constant iterations and next for runtime iterations), maybe you can fuse the functions and cache the results (e.g. with two bitmaps, or put it in the loop description and retrieve it with get_simple_loop_desc). Actually num_loop_branches() could/should also be cached. I realize that the loop body walks are probably not very expensive (and compile time probably isn't a concern if you're using profile driven optimizations) but they do all add up... > +/* Compute the number of branches in LOOP, weighted by execution counts. */ > +static float > +compute_weighted_branches(struct loop *loop) The floating point thing was already mentioned by Andrew. You can use integer math instead (for examples, look for BB_FREQ_MAX e.g. in average_num_loop_insns()). 
> + while (loop_outer(outer)) > + { > + outer_desc = get_simple_loop_desc (outer); > + > + if (outer_desc->const_iter) > + outer_niters *= outer_desc->niter; > + else if (outer->header->count) > + outer_niters *= expected_loop_iterations (outer); > + > + weighted_outer_branches = compute_weighted_branches (outer); Can you delay this computation of "weighted_outer_branches" call to ... > + /* Should have been checked by caller. */ > + gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1); Should never even happen. You have set the minimum acceptable value to 0. If you managed to test this code with PARAM_MIN_ITER_UNROLL_WITH_BRANCHES==-1, I'd like to know how (if you can do it from the command line, there is a bug in the handling of acceptable PARAM values :-) > + /* If the outer loop has enough iterations to be considered hot, then > + we can stop our upwards loop tree traversal and examine the current > + outer loop. */ > + if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) > + { > + /* Assume that any call will cause the branch budget to be exceeded, > + and that we can't unroll the current loop without increasing > + mispredicts. */ > + if (loop_has_call(outer)) > + return 0; > + > + /* Otherwise, compute the maximum number of times current loop can be > + unrolled without exceeding our branch budget. First we subtract > + off the outer loop's weighted branch count from the budget. Note > + that this includes the branches in the current loop. This yields > + the number of branches left in the budget for the unrolled copies. > + We divide this by the number of branches in the current loop that > + must be duplicated when we unroll, which is the total weighted > + number of branches minus the back-edge branch. This yields the > + number of new loop body copies that can be created by unrolling > + without exceeding the budget, to which we add 1 to get the unroll > + factor. */ ... 
somewhere here, where weighted_outer_branches is used? > + return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) - > + weighted_outer_branches)/(weighted_num_branches - 1) + 1; You should guard against weighted branches==1.0. Ciao! Steven
tejohnson@google.com (Teresa Johnson) writes: > This patch adds heuristics to limit unrolling in loops with branches that may increase > branch mispredictions. It affects loops that are not frequently iterated, and that are > nested within a hot region of code that already contains many branch instructions. > > Performance tested with both internal benchmarks and with SPEC 2000/2006 on a variety > of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD Opteron systems. > This improves performance of an internal search indexing benchmark by close to 2% on > all the tested Intel platforms. It also consistently improves 445.gobmk (with FDO feedback > where unrolling kicks in) by close to 1% on AMD Opteron. Other performance effects are > neutral. > > Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk? One problem with any unrolling heuristics is currently that gcc has both the tree level and the rtl level unroller. The tree one is even on at -O3. So if you tweak anything for one you have to affect both, otherwise the other may still do the wrong thing(tm). For some other tweaks I looked into a shared cost model some time ago. May be still needed. -Andi
On Tue, Apr 24, 2012 at 6:13 PM, Andi Kleen <andi@firstfloor.org> wrote: > tejohnson@google.com (Teresa Johnson) writes: > >> This patch adds heuristics to limit unrolling in loops with branches that may increase >> branch mispredictions. It affects loops that are not frequently iterated, and that are >> nested within a hot region of code that already contains many branch instructions. >> >> Performance tested with both internal benchmarks and with SPEC 2000/2006 on a variety >> of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD Opteron systems. >> This improves performance of an internal search indexing benchmark by close to 2% on >> all the tested Intel platforms. It also consistently improves 445.gobmk (with FDO feedback >> where unrolling kicks in) by close to 1% on AMD Opteron. Other performance effects are >> neutral. >> >> Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk? > > One problem with any unrolling heuristics is currently that gcc has both > the tree level and the rtl level unroller. The tree one is even on at > -O3. So if you tweak anything for one you have to affect both, otherwise the > other may still do the wrong thing(tm). Tree level unrollers (cunrolli and cunroll) do complete unroll. At O2, both of them are turned on, but gcc does not allow any code growth -- which makes them pretty useless at O2 (very few loops qualify). The default max complete peel iteration is also too low compared with both icc and llvm. This needs to be tuned. David > > For some other tweaks I looked into a shared cost model some time ago. > May be still needed. > > -Andi > > -- > ak@linux.intel.com -- Speaking for myself only
> Tree level unrollers (cunrolli and cunroll) do complete unroll. At O2, > both of them are turned on, but gcc does not allow any code growth -- > which makes them pretty useless at O2 (very few loops qualify). The > default max complete peel iteration is also too low compared with both > icc and llvm. This needs to be tuned. I found that at -O3 (where tree unroll is on by default) there is quite a bit of useless unrolling. I got somewhat irritated that my printf debug loops were commonly unrolled. -Andi
On Tue, Apr 24, 2012 at 11:26 PM, Teresa Johnson <tejohnson@google.com> wrote: > This patch adds heuristics to limit unrolling in loops with branches that may increase > branch mispredictions. It affects loops that are not frequently iterated, and that are > nested within a hot region of code that already contains many branch instructions. > > Performance tested with both internal benchmarks and with SPEC 2000/2006 on a variety > of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD Opteron systems. > This improves performance of an internal search indexing benchmark by close to 2% on > all the tested Intel platforms. It also consistently improves 445.gobmk (with FDO feedback > where unrolling kicks in) by close to 1% on AMD Opteron. Other performance effects are > neutral. > > Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk? > > Thanks, > Teresa > > 2012-04-24 Teresa Johnson <tejohnson@google.com> > > * loop-unroll.c (loop_has_call): New function. > (loop_has_FP_comp): Ditto. > (compute_weighted_branches): Ditto. > (max_unroll_with_branches): Ditto. > (decide_unroll_constant_iterations): Add heuristic to avoid > increasing branch mispredicts when unrolling. > (decide_unroll_runtime_iterations): Ditto. > * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param. > (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto. > > Index: loop-unroll.c > =================================================================== > --- loop-unroll.c (revision 186783) > +++ loop-unroll.c (working copy) > @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc > basic_block); > static rtx get_expansion (struct var_to_expand *); > > +/* Determine whether LOOP contains call. */ > +static bool > +loop_has_call(struct loop *loop) > +{ > + basic_block *body, bb; > + unsigned i; > + rtx insn; > + > + body = get_loop_body (loop); You repeatedly do this and walk over all blocks. 
Please think about compile-time issues when writing code. This all looks sort-of target specific to me and I don't see why this very specialized patch is a good idea when unrolling does a very poor job deciding what and how much to unroll generally. Richard. > + for (i = 0; i < loop->num_nodes; i++) > + { > + bb = body[i]; > + > + FOR_BB_INSNS (bb, insn) > + { > + if (CALL_P (insn)) > + { > + free (body); > + return true; > + } > + } > + } > + free (body); > + return false; > +} > + > +/* Determine whether LOOP contains floating-point computation. */ > +static bool > +loop_has_FP_comp(struct loop *loop) > +{ > + rtx set, dest; > + basic_block *body, bb; > + unsigned i; > + rtx insn; > + > + body = get_loop_body (loop); > + for (i = 0; i < loop->num_nodes; i++) > + { > + bb = body[i]; > + > + FOR_BB_INSNS (bb, insn) > + { > + set = single_set (insn); > + if (!set) > + continue; > + > + dest = SET_DEST (set); > + if (FLOAT_MODE_P (GET_MODE (dest))) > + { > + free (body); > + return true; > + } > + } > + } > + free (body); > + return false; > +} > + > +/* Compute the number of branches in LOOP, weighted by execution counts. */ > +static float > +compute_weighted_branches(struct loop *loop) > +{ > + int header_count = loop->header->count; > + unsigned i; > + float n; > + basic_block * body; > + > + /* If no profile feedback data exists, don't limit unrolling */ > + if (header_count == 0) > + return 0.0; > + > + gcc_assert (loop->latch != EXIT_BLOCK_PTR); > + > + body = get_loop_body (loop); > + n = 0.0; > + for (i = 0; i < loop->num_nodes; i++) > + { > + if (EDGE_COUNT (body[i]->succs) >= 2) > + { > + /* If this block is executed less frequently than the header (loop > + entry), then it is weighted based on the ratio of times it is > + executed compared to the header. */ > + if (body[i]->count < header_count) > + n += ((float)body[i]->count)/header_count; > + > + /* When it is executed more frequently than the header (i.e. 
it is > + in a nested inner loop), simply weight the branch at 1.0. */ > + else > + n += 1.0; > + } > + } > + free (body); > + > + return n; > +} > + > +/* Compute the maximum number of times LOOP can be unrolled without exceeding > + a branch budget, which can increase branch mispredictions. The number of > + branches is computed by weighting each branch with its expected execution > + probability through the loop based on profile data. If no profile feedback > + data exists, simply return the current NUNROLL factor. */ > +static unsigned > +max_unroll_with_branches(struct loop *loop, unsigned nunroll) > +{ > + struct loop *outer; > + struct niter_desc *outer_desc; > + int outer_niters = 1; > + float weighted_outer_branches = 0.0; > + float weighted_num_branches = compute_weighted_branches (loop); > + > + /* If there was no profile feedback data, weighted_num_branches will be 0.0 > + and we won't limit unrolling. If the weighted_num_branches is at most 1.0, > + also don't limit unrolling as the back-edge branch will not be duplicated. */ > + if (weighted_num_branches <= 1.0) > + return nunroll; > + > + /* Walk up the loop tree until we find a hot outer loop in which the current > + loop is nested. At that point we will compute the number of times the > + current loop can be unrolled based on the number of branches in the hot > + outer loop. */ > + outer = loop_outer(loop); > + /* The loop structure contains a fake outermost loop, so this should always > + be non-NULL for our current loop. */ > + gcc_assert (outer); > + /* Detect if this is the fake outermost loop (at which point we are done) > + by checking its outer loop. 
*/ > + while (loop_outer(outer)) > + { > + outer_desc = get_simple_loop_desc (outer); > + > + if (outer_desc->const_iter) > + outer_niters *= outer_desc->niter; > + else if (outer->header->count) > + outer_niters *= expected_loop_iterations (outer); > + > + weighted_outer_branches = compute_weighted_branches (outer); > + > + /* Should have been checked by caller. */ > + gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1); > + > + /* If the outer loop has enough iterations to be considered hot, then > + we can stop our upwards loop tree traversal and examine the current > + outer loop. */ > + if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) > + { > + /* Assume that any call will cause the branch budget to be exceeded, > + and that we can't unroll the current loop without increasing > + mispredicts. */ > + if (loop_has_call(outer)) > + return 0; > + > + /* Otherwise, compute the maximum number of times current loop can be > + unrolled without exceeding our branch budget. First we subtract > + off the outer loop's weighted branch count from the budget. Note > + that this includes the branches in the current loop. This yields > + the number of branches left in the budget for the unrolled copies. > + We divide this by the number of branches in the current loop that > + must be duplicated when we unroll, which is the total weighted > + number of branches minus the back-edge branch. This yields the > + number of new loop body copies that can be created by unrolling > + without exceeding the budget, to which we add 1 to get the unroll > + factor. 
*/ > + return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) - > + weighted_outer_branches)/(weighted_num_branches - 1) + 1; > + } > + outer = loop_outer(outer); > + } > + > + /* The current loop is not enclosed by a hot enough outer loop in this > + procedure, since the hot outer loop is inter-procedural, assume that > + it already contains a significant number of branches, so don't unroll. */ > + return 0; > +} > + > /* Unroll and/or peel (depending on FLAGS) LOOPS. */ > void > unroll_and_peel_loops (int flags) > @@ -522,6 +696,7 @@ static void > decide_unroll_constant_iterations (struct loop *loop, int flags) > { > unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i; > + unsigned nunroll_branches; > struct niter_desc *desc; > > if (!(flags & UAP_UNROLL)) > @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo > return; > } > > + /* Be careful when unrolling loops with branches inside -- it can increase > + the number of mispredicts. Ignore loops with FP computation as these > + tend to benefit much more consistently from unrolling. */ > + if (num_loop_branches (loop) > 1 > + && loop_has_FP_comp(loop) > + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 > + && desc->niter < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) > + { > + nunroll_branches = max_unroll_with_branches(loop, nunroll); > + if (nunroll > nunroll_branches) > + nunroll = nunroll_branches; > + if (nunroll <= 1) > + { > + if (dump_file) > + fprintf (dump_file, ";; Not unrolling, contains branches\n"); > + return; > + } > + } > + > /* Check whether the loop rolls enough to consider. 
*/ > if (desc->niter < 2 * nunroll) > { > @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop > static void > decide_unroll_runtime_iterations (struct loop *loop, int flags) > { > - unsigned nunroll, nunroll_by_av, i; > + unsigned nunroll, nunroll_by_av, nunroll_branches, i; > struct niter_desc *desc; > > if (!(flags & UAP_UNROLL)) > @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo > return; > } > > + /* Be careful when unrolling loops with branches inside -- it can increase > + the number of mispredicts. Ignore loops with FP computation as these > + tend to benefit much more consistently from unrolling. */ > + if (num_loop_branches (loop) > 1 > + && loop_has_FP_comp(loop) > + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 > + && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) > + { > + nunroll_branches = max_unroll_with_branches(loop, nunroll); > + if (nunroll > nunroll_branches) > + nunroll = nunroll_branches; > + if (nunroll <= 1) > + { > + if (dump_file) > + fprintf (dump_file, ";; Not unrolling, contains branches\n"); > + return; > + } > + } > + > /* If we have profile feedback, check whether the loop rolls. 
*/ > if ((loop->header->count > && expected_loop_iterations (loop) < 2 * nunroll) > Index: params.def > =================================================================== > --- params.def (revision 186783) > +++ params.def (working copy) > @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS, > "The maximum depth of a loop nest we completely peel", > 8, 0, 0) > > +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES, > + "min-iter-unroll-with-branches", > + "Minimum iteration count to ignore branch effects when unrolling", > + 50, 0, 0) > + > +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET, > + "unroll-outer-loop-branch-budget", > + "Maximum number of branches allowed in hot outer loop region after unroll", > + 25, 0, 0) > + > /* The maximum number of insns of an unswitched loop. */ > DEFPARAM(PARAM_MAX_UNSWITCH_INSNS, > "max-unswitch-insns", > > -- > This patch is available for review at http://codereview.appspot.com/6099055
On Tue, Apr 24, 2012 at 4:38 PM, Steven Bosscher <stevenb.gcc@gmail.com> wrote: > On Tue, Apr 24, 2012 at 11:26 PM, Teresa Johnson <tejohnson@google.com> wrote: > >> * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param. >> (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto. > > You should add documentation for these new PARAMs to doc/invoke.texi. Ok, will do. > > I don't really like these new PARAMs: All other loop PARAMs are based > on the number of insns in a loop, or the maximum number of times a > transformation is applied. Your new > PARAM_MIN_ITER_UNROLL_WITH_BRANCHES is completely different, because > it is a number of iterations. This makes the PARAM value feel even > more arbitrary than all the other PARAMs to some extent already do... That's true, they are different in what they are checking than some of the other loop unrolling params. But I need some threshold for determining when a loop is hot enough that its unrolled branches will be executed frequently enough to train the branch predictor and also where the impact on the branch prediction in the outer region of code is less likely to matter overall. The defaults were chosen so that the new unrolling limit should only kick in for loops that are not iterating much anyway, and where the outer hot region has quite a few branches. > > (The only other PARAM like that is PARAM_ALIGN_LOOP_ITERATIONS, and > its default value also looks quite arbitrary...) > > >> Index: loop-unroll.c >> =================================================================== >> --- loop-unroll.c (revision 186783) >> +++ loop-unroll.c (working copy) >> @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc >> basic_block); >> static rtx get_expansion (struct var_to_expand *); >> >> +/* Determine whether LOOP contains call. 
*/ >> +static bool >> +loop_has_call(struct loop *loop) >> +{ >> + basic_block *body, bb; >> + unsigned i; >> + rtx insn; >> + >> + body = get_loop_body (loop); >> + for (i = 0; i < loop->num_nodes; i++) >> + { >> + bb = body[i]; >> + >> + FOR_BB_INSNS (bb, insn) >> + { >> + if (CALL_P (insn)) >> + { >> + free (body); >> + return true; >> + } >> + } >> + } >> + free (body); >> + return false; >> +} >> + >> +/* Determine whether LOOP contains floating-point computation. */ >> +static bool >> +loop_has_FP_comp(struct loop *loop) >> +{ >> + rtx set, dest; >> + basic_block *body, bb; >> + unsigned i; >> + rtx insn; >> + >> + body = get_loop_body (loop); >> + for (i = 0; i < loop->num_nodes; i++) >> + { >> + bb = body[i]; >> + >> + FOR_BB_INSNS (bb, insn) >> + { >> + set = single_set (insn); >> + if (!set) >> + continue; >> + >> + dest = SET_DEST (set); >> + if (FLOAT_MODE_P (GET_MODE (dest))) >> + { >> + free (body); >> + return true; > > So you only detect single-set FP operations where some insns stores in > a float mode. It wouldn't be very difficult to just walk over all sets > and look for float modes. This is also necessary e.g. for x87 sincos, > as well as various insns on other machines. Your comments say you > don't want to apply the new heuristic to loops containing FP > operations because these loops usually benefit more from unrolling. > Therefore, you should IMHO look at non-single_set() insns also here, > to avoid applying the heuristics to loops containing non-single_set() > FP insns. Ok, thanks for the suggestion, I will expand this for the next version of the patch. 
> > >> + } >> + } >> + } >> + free (body); >> + return false; >> +} > > Nit: You are calling loop_has_call and loop_has_FP_comp() twice on > each loop (first for constant iterations and next for runtime > iterations), I don't think that is true for loop_has_FP_comp, since it is called in decide_unroll_constant_iterations and decide_unroll_runtime_iterations just after we have checked if the loop has a constant number of iterations, and returned early depending on the result of this check and which routine we are in. So each inner loop will only reach the call to loop_has_FP_comp in one of these routines. In the case of loop_has_call, which is only called for a hot outer loop, it is true we could invoke that more than once. That would happen if a hot outer loop contains more than one nested inner loop with a small iteration count and branches that we attempt to unroll (it is called at most once per inner loop that we attempt to unroll). I thought about attempting to cache this info for the outer loop in the structure returned by get_simple_loop_desc() as you also suggest below. I was concerned that currently this returns an niter_desc structure which holds info about the # iterations, and this information doesn't fit into that category. However, I could go ahead and add it to that structure and perhaps rename the structure to something more generic like "loop_desc". What do you think? The other issue is that we don't need this new information on all loops where we currently may compute and return an niter_desc instance, and I didn't want to unnecessarily add a walk over the loop bodies to compute the new information if we didn't need it. I'm not sure of the tradeoff between always computing this information when we do a get_simple_loop_desc vs calling it a couple times each for the loops where we do need it (only when we have innermost loops with shorter iteration counts and internal branches that we have decided to unroll). 
One way around this would be to set the initial value of this information in the loop_desc to a default value that means "unknown" and then compute and cache the information lazily as needed. > maybe you can fuse the functions I don't know that it makes sense to fuse them, as they are called on different loops - the call to loop_has_FP_comp will only be done for innermost loops that we are attempting to unroll, whereas loop_has_call is called only for outer loops that contain nested inner loops. But if I am going to cache the results in the loop desc then it may not be too expensive to do all the checks in one walk over each loop, regardless of whether it is an outer or inner loop. > and cache the results > (e.g. with two bitmaps, or put it in the loop description and retrieve > it with get_simple_loop_desc). Actually num_loop_branches() > could/should also be cached. I realize that the loop body walks are > probably not very expensive (and compile time probably isn't a concern > if you're using profile driven optimizations) but they do all add > up... True, num_loop_branches could be called a couple times for each innermost loop we attempt to unroll, because it is also called by decide_peel_simple and decide_unroll_stupid. If I cache the other info in a loop_desc as described above, we could cache this as well as the result of compute_weighted_branches. > > >> +/* Compute the number of branches in LOOP, weighted by execution counts. */ >> +static float >> +compute_weighted_branches(struct loop *loop) > > The floating point thing was already mentioned by Andrew. You can use > integer math instead (for examples, look for BB_FREQ_MAX e.g. in > average_num_loop_insns()). 
> > >> + while (loop_outer(outer)) >> + { >> + outer_desc = get_simple_loop_desc (outer); >> + >> + if (outer_desc->const_iter) >> + outer_niters *= outer_desc->niter; >> + else if (outer->header->count) >> + outer_niters *= expected_loop_iterations (outer); >> + >> + weighted_outer_branches = compute_weighted_branches (outer); > > Can you delay this computation of "weighted_outer_branches" call to ... Yes - thanks for the suggestion. > >> + /* Should have been checked by caller. */ >> + gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1); > > Should never even happen. You have set the minimum acceptable value to > 0. If you managed to test this code with > PARAM_MIN_ITER_UNROLL_WITH_BRANCHES==-1, I'd like to know how (if you > can do it from the command line, there is a bug in the handling of > acceptable PARAM values :-) You are right. This should have been removed - at one point I was thinking of having a value of -1 indicate that the heuristic should not kick in, but you are right that the minimum value set in params.def does not support this, and in fact it is not needed at all as setting this param to 0 will do the same thing. Will fix! > > >> + /* If the outer loop has enough iterations to be considered hot, then >> + we can stop our upwards loop tree traversal and examine the current >> + outer loop. */ >> + if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) >> + { >> + /* Assume that any call will cause the branch budget to be exceeded, >> + and that we can't unroll the current loop without increasing >> + mispredicts. */ >> + if (loop_has_call(outer)) >> + return 0; >> + >> + /* Otherwise, compute the maximum number of times current loop can be >> + unrolled without exceeding our branch budget. First we subtract >> + off the outer loop's weighted branch count from the budget. Note >> + that this includes the branches in the current loop. This yields >> + the number of branches left in the budget for the unrolled copies. 
>> + We divide this by the number of branches in the current loop that >> + must be duplicated when we unroll, which is the total weighted >> + number of branches minus the back-edge branch. This yields the >> + number of new loop body copies that can be created by unrolling >> + without exceeding the budget, to which we add 1 to get the unroll >> + factor. */ > > ... somewhere here, where weighted_outer_branches is used? Yep, thanks. > >> + return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) - >> + weighted_outer_branches)/(weighted_num_branches - 1) + 1; > > You should guard against weighted branches==1.0. Right - the guard already exists at the top of the routine. I can add a comment here to that effect. Thanks! Teresa > > Ciao! > Steven
On Tue, Apr 24, 2012 at 6:13 PM, Andi Kleen <andi@firstfloor.org> wrote: > tejohnson@google.com (Teresa Johnson) writes: > >> This patch adds heuristics to limit unrolling in loops with branches that may increase >> branch mispredictions. It affects loops that are not frequently iterated, and that are >> nested within a hot region of code that already contains many branch instructions. >> >> Performance tested with both internal benchmarks and with SPEC 2000/2006 on a variety >> of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD Opteron systems. >> This improves performance of an internal search indexing benchmark by close to 2% on >> all the tested Intel platforms. It also consistently improves 445.gobmk (with FDO feedback >> where unrolling kicks in) by close to 1% on AMD Opteron. Other performance effects are >> neutral. >> >> Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk? > > One problem with any unrolling heuristics is currently that gcc has both > the tree level and the rtl level unroller. The tree one is even on at > -O3. So if you tweak anything for one you have to affect both, otherwise the > other may still do the wrong thing(tm). It's true that the tree level unroller could benefit from taking branch mispredict effects into account as well. But since that is only performing full unrolling of constant trip count loops I suspect that there will be additional things that need to be considered, such as whether the full unrolling enables better optimization in the surrounding code/loop. Hence I wanted to tackle that later. > > For some other tweaks I looked into a shared cost model some time ago. > May be still needed. Yes, I think it would be good to unify some of the profitability checks between the two unrolling passes, or at least between the tree and rtl level full unrollers/peelers. Teresa > > -Andi > > -- > ak@linux.intel.com -- Speaking for myself only
On Wed, Apr 25, 2012 at 2:03 AM, Richard Guenther <richard.guenther@gmail.com> wrote: > On Tue, Apr 24, 2012 at 11:26 PM, Teresa Johnson <tejohnson@google.com> wrote: >> This patch adds heuristics to limit unrolling in loops with branches that may increase >> branch mispredictions. It affects loops that are not frequently iterated, and that are >> nested within a hot region of code that already contains many branch instructions. >> >> Performance tested with both internal benchmarks and with SPEC 2000/2006 on a variety >> of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD Opteron systems. >> This improves performance of an internal search indexing benchmark by close to 2% on >> all the tested Intel platforms. It also consistently improves 445.gobmk (with FDO feedback >> where unrolling kicks in) by close to 1% on AMD Opteron. Other performance effects are >> neutral. >> >> Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk? >> >> Thanks, >> Teresa >> >> 2012-04-24 Teresa Johnson <tejohnson@google.com> >> >> * loop-unroll.c (loop_has_call): New function. >> (loop_has_FP_comp): Ditto. >> (compute_weighted_branches): Ditto. >> (max_unroll_with_branches): Ditto. >> (decide_unroll_constant_iterations): Add heuristic to avoid >> increasing branch mispredicts when unrolling. >> (decide_unroll_runtime_iterations): Ditto. >> * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param. >> (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto. >> >> Index: loop-unroll.c >> =================================================================== >> --- loop-unroll.c (revision 186783) >> +++ loop-unroll.c (working copy) >> @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc >> basic_block); >> static rtx get_expansion (struct var_to_expand *); >> >> +/* Determine whether LOOP contains call. 
*/ >> +static bool >> +loop_has_call(struct loop *loop) >> +{ >> + basic_block *body, bb; >> + unsigned i; >> + rtx insn; >> + >> + body = get_loop_body (loop); > > You repeatedly do this and walk over all blocks. Please think about > compile-time > issues when writing code. See my response to Steven where I address this issue and mention some approaches to reducing the loop body walks. Please let me know if you have any feedback on that. > > This all looks sort-of target specific to me and I don't see why this > very specialized > patch is a good idea when unrolling does a very poor job deciding what and how > much to unroll generally. I am hoping this will improve upon the job the unroller does in deciding when/how to unroll. I didn't think that it was too target specific as branch mispredictions could affect many targets. Note that there are already some much more basic checks for the branch misprediction effects in both decide_peel_simple and decide_unroll_stupid, for example: /* Do not simply peel loops with branches inside -- it increases number of mispredicts. */ if (num_loop_branches (loop) > 1) { if (dump_file) fprintf (dump_file, ";; Not peeling, contains branches\n"); return; } It is possible that both of these checks could be made less aggressive using the approach in this patch, which affects many more loops and hence I am trying to add some more intelligent checking of whether branch mispredicts might be triggered. Thanks, Teresa > > Richard. > >> + for (i = 0; i < loop->num_nodes; i++) >> + { >> + bb = body[i]; >> + >> + FOR_BB_INSNS (bb, insn) >> + { >> + if (CALL_P (insn)) >> + { >> + free (body); >> + return true; >> + } >> + } >> + } >> + free (body); >> + return false; >> +} >> + >> +/* Determine whether LOOP contains floating-point computation. 
*/ >> +static bool >> +loop_has_FP_comp(struct loop *loop) >> +{ >> + rtx set, dest; >> + basic_block *body, bb; >> + unsigned i; >> + rtx insn; >> + >> + body = get_loop_body (loop); >> + for (i = 0; i < loop->num_nodes; i++) >> + { >> + bb = body[i]; >> + >> + FOR_BB_INSNS (bb, insn) >> + { >> + set = single_set (insn); >> + if (!set) >> + continue; >> + >> + dest = SET_DEST (set); >> + if (FLOAT_MODE_P (GET_MODE (dest))) >> + { >> + free (body); >> + return true; >> + } >> + } >> + } >> + free (body); >> + return false; >> +} >> + >> +/* Compute the number of branches in LOOP, weighted by execution counts. */ >> +static float >> +compute_weighted_branches(struct loop *loop) >> +{ >> + int header_count = loop->header->count; >> + unsigned i; >> + float n; >> + basic_block * body; >> + >> + /* If no profile feedback data exists, don't limit unrolling */ >> + if (header_count == 0) >> + return 0.0; >> + >> + gcc_assert (loop->latch != EXIT_BLOCK_PTR); >> + >> + body = get_loop_body (loop); >> + n = 0.0; >> + for (i = 0; i < loop->num_nodes; i++) >> + { >> + if (EDGE_COUNT (body[i]->succs) >= 2) >> + { >> + /* If this block is executed less frequently than the header (loop >> + entry), then it is weighted based on the ratio of times it is >> + executed compared to the header. */ >> + if (body[i]->count < header_count) >> + n += ((float)body[i]->count)/header_count; >> + >> + /* When it is executed more frequently than the header (i.e. it is >> + in a nested inner loop), simply weight the branch at 1.0. */ >> + else >> + n += 1.0; >> + } >> + } >> + free (body); >> + >> + return n; >> +} >> + >> +/* Compute the maximum number of times LOOP can be unrolled without exceeding >> + a branch budget, which can increase branch mispredictions. The number of >> + branches is computed by weighting each branch with its expected execution >> + probability through the loop based on profile data. If no profile feedback >> + data exists, simply return the current NUNROLL factor. 
*/ >> +static unsigned >> +max_unroll_with_branches(struct loop *loop, unsigned nunroll) >> +{ >> + struct loop *outer; >> + struct niter_desc *outer_desc; >> + int outer_niters = 1; >> + float weighted_outer_branches = 0.0; >> + float weighted_num_branches = compute_weighted_branches (loop); >> + >> + /* If there was no profile feedback data, weighted_num_branches will be 0.0 >> + and we won't limit unrolling. If the weighted_num_branches is at most 1.0, >> + also don't limit unrolling as the back-edge branch will not be duplicated. */ >> + if (weighted_num_branches <= 1.0) >> + return nunroll; >> + >> + /* Walk up the loop tree until we find a hot outer loop in which the current >> + loop is nested. At that point we will compute the number of times the >> + current loop can be unrolled based on the number of branches in the hot >> + outer loop. */ >> + outer = loop_outer(loop); >> + /* The loop structure contains a fake outermost loop, so this should always >> + be non-NULL for our current loop. */ >> + gcc_assert (outer); >> + /* Detect if this is the fake outermost loop (at which point we are done) >> + by checking its outer loop. */ >> + while (loop_outer(outer)) >> + { >> + outer_desc = get_simple_loop_desc (outer); >> + >> + if (outer_desc->const_iter) >> + outer_niters *= outer_desc->niter; >> + else if (outer->header->count) >> + outer_niters *= expected_loop_iterations (outer); >> + >> + weighted_outer_branches = compute_weighted_branches (outer); >> + >> + /* Should have been checked by caller. */ >> + gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1); >> + >> + /* If the outer loop has enough iterations to be considered hot, then >> + we can stop our upwards loop tree traversal and examine the current >> + outer loop. 
*/ >> + if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) >> + { >> + /* Assume that any call will cause the branch budget to be exceeded, >> + and that we can't unroll the current loop without increasing >> + mispredicts. */ >> + if (loop_has_call(outer)) >> + return 0; >> + >> + /* Otherwise, compute the maximum number of times current loop can be >> + unrolled without exceeding our branch budget. First we subtract >> + off the outer loop's weighted branch count from the budget. Note >> + that this includes the branches in the current loop. This yields >> + the number of branches left in the budget for the unrolled copies. >> + We divide this by the number of branches in the current loop that >> + must be duplicated when we unroll, which is the total weighted >> + number of branches minus the back-edge branch. This yields the >> + number of new loop body copies that can be created by unrolling >> + without exceeding the budget, to which we add 1 to get the unroll >> + factor. */ >> + return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) - >> + weighted_outer_branches)/(weighted_num_branches - 1) + 1; >> + } >> + outer = loop_outer(outer); >> + } >> + >> + /* The current loop is not enclosed by a hot enough outer loop in this >> + procedure, since the hot outer loop is inter-procedural, assume that >> + it already contains a significant number of branches, so don't unroll. */ >> + return 0; >> +} >> + >> /* Unroll and/or peel (depending on FLAGS) LOOPS. 
*/ >> void >> unroll_and_peel_loops (int flags) >> @@ -522,6 +696,7 @@ static void >> decide_unroll_constant_iterations (struct loop *loop, int flags) >> { >> unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i; >> + unsigned nunroll_branches; >> struct niter_desc *desc; >> >> if (!(flags & UAP_UNROLL)) >> @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo >> return; >> } >> >> + /* Be careful when unrolling loops with branches inside -- it can increase >> + the number of mispredicts. Ignore loops with FP computation as these >> + tend to benefit much more consistently from unrolling. */ >> + if (num_loop_branches (loop) > 1 >> + && loop_has_FP_comp(loop) >> + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 >> + && desc->niter < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) >> + { >> + nunroll_branches = max_unroll_with_branches(loop, nunroll); >> + if (nunroll > nunroll_branches) >> + nunroll = nunroll_branches; >> + if (nunroll <= 1) >> + { >> + if (dump_file) >> + fprintf (dump_file, ";; Not unrolling, contains branches\n"); >> + return; >> + } >> + } >> + >> /* Check whether the loop rolls enough to consider. */ >> if (desc->niter < 2 * nunroll) >> { >> @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop >> static void >> decide_unroll_runtime_iterations (struct loop *loop, int flags) >> { >> - unsigned nunroll, nunroll_by_av, i; >> + unsigned nunroll, nunroll_by_av, nunroll_branches, i; >> struct niter_desc *desc; >> >> if (!(flags & UAP_UNROLL)) >> @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo >> return; >> } >> >> + /* Be careful when unrolling loops with branches inside -- it can increase >> + the number of mispredicts. Ignore loops with FP computation as these >> + tend to benefit much more consistently from unrolling. 
*/ >> + if (num_loop_branches (loop) > 1 >> + && loop_has_FP_comp(loop) >> + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 >> + && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) >> + { >> + nunroll_branches = max_unroll_with_branches(loop, nunroll); >> + if (nunroll > nunroll_branches) >> + nunroll = nunroll_branches; >> + if (nunroll <= 1) >> + { >> + if (dump_file) >> + fprintf (dump_file, ";; Not unrolling, contains branches\n"); >> + return; >> + } >> + } >> + >> /* If we have profile feedback, check whether the loop rolls. */ >> if ((loop->header->count >> && expected_loop_iterations (loop) < 2 * nunroll) >> Index: params.def >> =================================================================== >> --- params.def (revision 186783) >> +++ params.def (working copy) >> @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS, >> "The maximum depth of a loop nest we completely peel", >> 8, 0, 0) >> >> +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES, >> + "min-iter-unroll-with-branches", >> + "Minimum iteration count to ignore branch effects when unrolling", >> + 50, 0, 0) >> + >> +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET, >> + "unroll-outer-loop-branch-budget", >> + "Maximum number of branches allowed in hot outer loop region after unroll", >> + 25, 0, 0) >> + >> /* The maximum number of insns of an unswitched loop. */ >> DEFPARAM(PARAM_MAX_UNSWITCH_INSNS, >> "max-unswitch-insns", >> >> -- >> This patch is available for review at http://codereview.appspot.com/6099055
I think the general mechanism applies to most of the targets. What is needed is target specific parameter (branch budget) tuning which can be done separately -- there exist a way to do that already. David On Wed, Apr 25, 2012 at 2:03 AM, Richard Guenther <richard.guenther@gmail.com> wrote: > On Tue, Apr 24, 2012 at 11:26 PM, Teresa Johnson <tejohnson@google.com> wrote: >> This patch adds heuristics to limit unrolling in loops with branches that may increase >> branch mispredictions. It affects loops that are not frequently iterated, and that are >> nested within a hot region of code that already contains many branch instructions. >> >> Performance tested with both internal benchmarks and with SPEC 2000/2006 on a variety >> of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD Opteron systems. >> This improves performance of an internal search indexing benchmark by close to 2% on >> all the tested Intel platforms. It also consistently improves 445.gobmk (with FDO feedback >> where unrolling kicks in) by close to 1% on AMD Opteron. Other performance effects are >> neutral. >> >> Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk? >> >> Thanks, >> Teresa >> >> 2012-04-24 Teresa Johnson <tejohnson@google.com> >> >> * loop-unroll.c (loop_has_call): New function. >> (loop_has_FP_comp): Ditto. >> (compute_weighted_branches): Ditto. >> (max_unroll_with_branches): Ditto. >> (decide_unroll_constant_iterations): Add heuristic to avoid >> increasing branch mispredicts when unrolling. >> (decide_unroll_runtime_iterations): Ditto. >> * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param. >> (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto. 
>> >> Index: loop-unroll.c >> =================================================================== >> --- loop-unroll.c (revision 186783) >> +++ loop-unroll.c (working copy) >> @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc >> basic_block); >> static rtx get_expansion (struct var_to_expand *); >> >> +/* Determine whether LOOP contains call. */ >> +static bool >> +loop_has_call(struct loop *loop) >> +{ >> + basic_block *body, bb; >> + unsigned i; >> + rtx insn; >> + >> + body = get_loop_body (loop); > > You repeatedly do this and walk over all blocks. Please think about > compile-time > issues when writing code. > > This all looks sort-of target specific to me and I don't see why this > very specialized > patch is a good idea when unrolling does a very poor job deciding what and how > much to unroll generally. > > Richard. > >> + for (i = 0; i < loop->num_nodes; i++) >> + { >> + bb = body[i]; >> + >> + FOR_BB_INSNS (bb, insn) >> + { >> + if (CALL_P (insn)) >> + { >> + free (body); >> + return true; >> + } >> + } >> + } >> + free (body); >> + return false; >> +} >> + >> +/* Determine whether LOOP contains floating-point computation. */ >> +static bool >> +loop_has_FP_comp(struct loop *loop) >> +{ >> + rtx set, dest; >> + basic_block *body, bb; >> + unsigned i; >> + rtx insn; >> + >> + body = get_loop_body (loop); >> + for (i = 0; i < loop->num_nodes; i++) >> + { >> + bb = body[i]; >> + >> + FOR_BB_INSNS (bb, insn) >> + { >> + set = single_set (insn); >> + if (!set) >> + continue; >> + >> + dest = SET_DEST (set); >> + if (FLOAT_MODE_P (GET_MODE (dest))) >> + { >> + free (body); >> + return true; >> + } >> + } >> + } >> + free (body); >> + return false; >> +} >> + >> +/* Compute the number of branches in LOOP, weighted by execution counts. 
*/ >> +static float >> +compute_weighted_branches(struct loop *loop) >> +{ >> + int header_count = loop->header->count; >> + unsigned i; >> + float n; >> + basic_block * body; >> + >> + /* If no profile feedback data exists, don't limit unrolling */ >> + if (header_count == 0) >> + return 0.0; >> + >> + gcc_assert (loop->latch != EXIT_BLOCK_PTR); >> + >> + body = get_loop_body (loop); >> + n = 0.0; >> + for (i = 0; i < loop->num_nodes; i++) >> + { >> + if (EDGE_COUNT (body[i]->succs) >= 2) >> + { >> + /* If this block is executed less frequently than the header (loop >> + entry), then it is weighted based on the ratio of times it is >> + executed compared to the header. */ >> + if (body[i]->count < header_count) >> + n += ((float)body[i]->count)/header_count; >> + >> + /* When it is executed more frequently than the header (i.e. it is >> + in a nested inner loop), simply weight the branch at 1.0. */ >> + else >> + n += 1.0; >> + } >> + } >> + free (body); >> + >> + return n; >> +} >> + >> +/* Compute the maximum number of times LOOP can be unrolled without exceeding >> + a branch budget, which can increase branch mispredictions. The number of >> + branches is computed by weighting each branch with its expected execution >> + probability through the loop based on profile data. If no profile feedback >> + data exists, simply return the current NUNROLL factor. */ >> +static unsigned >> +max_unroll_with_branches(struct loop *loop, unsigned nunroll) >> +{ >> + struct loop *outer; >> + struct niter_desc *outer_desc; >> + int outer_niters = 1; >> + float weighted_outer_branches = 0.0; >> + float weighted_num_branches = compute_weighted_branches (loop); >> + >> + /* If there was no profile feedback data, weighted_num_branches will be 0.0 >> + and we won't limit unrolling. If the weighted_num_branches is at most 1.0, >> + also don't limit unrolling as the back-edge branch will not be duplicated. 
*/ >> + if (weighted_num_branches <= 1.0) >> + return nunroll; >> + >> + /* Walk up the loop tree until we find a hot outer loop in which the current >> + loop is nested. At that point we will compute the number of times the >> + current loop can be unrolled based on the number of branches in the hot >> + outer loop. */ >> + outer = loop_outer(loop); >> + /* The loop structure contains a fake outermost loop, so this should always >> + be non-NULL for our current loop. */ >> + gcc_assert (outer); >> + /* Detect if this is the fake outermost loop (at which point we are done) >> + by checking its outer loop. */ >> + while (loop_outer(outer)) >> + { >> + outer_desc = get_simple_loop_desc (outer); >> + >> + if (outer_desc->const_iter) >> + outer_niters *= outer_desc->niter; >> + else if (outer->header->count) >> + outer_niters *= expected_loop_iterations (outer); >> + >> + weighted_outer_branches = compute_weighted_branches (outer); >> + >> + /* Should have been checked by caller. */ >> + gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1); >> + >> + /* If the outer loop has enough iterations to be considered hot, then >> + we can stop our upwards loop tree traversal and examine the current >> + outer loop. */ >> + if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) >> + { >> + /* Assume that any call will cause the branch budget to be exceeded, >> + and that we can't unroll the current loop without increasing >> + mispredicts. */ >> + if (loop_has_call(outer)) >> + return 0; >> + >> + /* Otherwise, compute the maximum number of times current loop can be >> + unrolled without exceeding our branch budget. First we subtract >> + off the outer loop's weighted branch count from the budget. Note >> + that this includes the branches in the current loop. This yields >> + the number of branches left in the budget for the unrolled copies. 
>> + We divide this by the number of branches in the current loop that >> + must be duplicated when we unroll, which is the total weighted >> + number of branches minus the back-edge branch. This yields the >> + number of new loop body copies that can be created by unrolling >> + without exceeding the budget, to which we add 1 to get the unroll >> + factor. */ >> + return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) - >> + weighted_outer_branches)/(weighted_num_branches - 1) + 1; >> + } >> + outer = loop_outer(outer); >> + } >> + >> + /* The current loop is not enclosed by a hot enough outer loop in this >> + procedure, since the hot outer loop is inter-procedural, assume that >> + it already contains a significant number of branches, so don't unroll. */ >> + return 0; >> +} >> + >> /* Unroll and/or peel (depending on FLAGS) LOOPS. */ >> void >> unroll_and_peel_loops (int flags) >> @@ -522,6 +696,7 @@ static void >> decide_unroll_constant_iterations (struct loop *loop, int flags) >> { >> unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i; >> + unsigned nunroll_branches; >> struct niter_desc *desc; >> >> if (!(flags & UAP_UNROLL)) >> @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo >> return; >> } >> >> + /* Be careful when unrolling loops with branches inside -- it can increase >> + the number of mispredicts. Ignore loops with FP computation as these >> + tend to benefit much more consistently from unrolling. 
*/ >> + if (num_loop_branches (loop) > 1 >> + && loop_has_FP_comp(loop) >> + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 >> + && desc->niter < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) >> + { >> + nunroll_branches = max_unroll_with_branches(loop, nunroll); >> + if (nunroll > nunroll_branches) >> + nunroll = nunroll_branches; >> + if (nunroll <= 1) >> + { >> + if (dump_file) >> + fprintf (dump_file, ";; Not unrolling, contains branches\n"); >> + return; >> + } >> + } >> + >> /* Check whether the loop rolls enough to consider. */ >> if (desc->niter < 2 * nunroll) >> { >> @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop >> static void >> decide_unroll_runtime_iterations (struct loop *loop, int flags) >> { >> - unsigned nunroll, nunroll_by_av, i; >> + unsigned nunroll, nunroll_by_av, nunroll_branches, i; >> struct niter_desc *desc; >> >> if (!(flags & UAP_UNROLL)) >> @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo >> return; >> } >> >> + /* Be careful when unrolling loops with branches inside -- it can increase >> + the number of mispredicts. Ignore loops with FP computation as these >> + tend to benefit much more consistently from unrolling. */ >> + if (num_loop_branches (loop) > 1 >> + && loop_has_FP_comp(loop) >> + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 >> + && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) >> + { >> + nunroll_branches = max_unroll_with_branches(loop, nunroll); >> + if (nunroll > nunroll_branches) >> + nunroll = nunroll_branches; >> + if (nunroll <= 1) >> + { >> + if (dump_file) >> + fprintf (dump_file, ";; Not unrolling, contains branches\n"); >> + return; >> + } >> + } >> + >> /* If we have profile feedback, check whether the loop rolls. 
*/ >> if ((loop->header->count >> && expected_loop_iterations (loop) < 2 * nunroll) >> Index: params.def >> =================================================================== >> --- params.def (revision 186783) >> +++ params.def (working copy) >> @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS, >> "The maximum depth of a loop nest we completely peel", >> 8, 0, 0) >> >> +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES, >> + "min-iter-unroll-with-branches", >> + "Minimum iteration count to ignore branch effects when unrolling", >> + 50, 0, 0) >> + >> +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET, >> + "unroll-outer-loop-branch-budget", >> + "Maximum number of branches allowed in hot outer loop region after unroll", >> + 25, 0, 0) >> + >> /* The maximum number of insns of an unswitched loop. */ >> DEFPARAM(PARAM_MAX_UNSWITCH_INSNS, >> "max-unswitch-insns", >> >> -- >> This patch is available for review at http://codereview.appspot.com/6099055
Are you sure that tree-level unrollers are turned on at O2? My impression was that they work only at O3 or with f[unroll,peel]-loops flags. On Tue, Apr 24, 2012 at 6:13 PM, Andi Kleen <andi@firstfloor.org> wrote: > tejohnson@google.com (Teresa Johnson) writes: > >> This patch adds heuristics to limit unrolling in loops with branches >> that may increase branch mispredictions. It affects loops that are >> not frequently iterated, and that are nested within a hot region of code that already contains many branch instructions. >> >> Performance tested with both internal benchmarks and with SPEC >> 2000/2006 on a variety of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD Opteron systems. >> This improves performance of an internal search indexing benchmark by >> close to 2% on all the tested Intel platforms. It also consistently >> improves 445.gobmk (with FDO feedback where unrolling kicks in) by >> close to 1% on AMD Opteron. Other performance effects are neutral. >> >> Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk? > > One problem with any unrolling heuristics is currently that gcc has > both the tree level and the rtl level unroller. The tree one is even > on at -O3. So if you tweak anything for one you have to affect both, > otherwise the other may still do the wrong thing(tm). Tree level unrollers (cunrolli and cunroll) do complete unroll. At O2, both of them are turned on, but gcc does not allow any code growth -- which makes them pretty useless at O2 (very few loops qualify). The default max complete peel iteration is also too low compared with both icc and llvm. This needs to be tuned. David > > For some other tweaks I looked into a shared cost model some time ago. > May be still needed. > > -Andi > > -- > ak@linux.intel.com -- Speaking for myself only
On Fri, Apr 27, 2012 at 12:07 AM, Igor Zamyatin <izamyatin@gmail.com> wrote: > Are you sure that tree-level unrollers are turned on at O2? My > impression was that they work only at O3 or with f[unroll,peel]-loops > flags. yes they are on but only have effect on tiny loops with very small trip count. With O3 or with -funroll,peel-loops, the size is allowed to grow. David > > On Tue, Apr 24, 2012 at 6:13 PM, Andi Kleen <andi@firstfloor.org> wrote: >> tejohnson@google.com (Teresa Johnson) writes: >> >>> This patch adds heuristics to limit unrolling in loops with branches >>> that may increase branch mispredictions. It affects loops that are >>> not frequently iterated, and that are nested within a hot region of code that already contains many branch instructions. >>> >>> Performance tested with both internal benchmarks and with SPEC >>> 2000/2006 on a variety of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD Opteron systems. >>> This improves performance of an internal search indexing benchmark by >>> close to 2% on all the tested Intel platforms. It also consistently >>> improves 445.gobmk (with FDO feedback where unrolling kicks in) by >>> close to 1% on AMD Opteron. Other performance effects are neutral. >>> >>> Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk? >> >> One problem with any unrolling heuristics is currently that gcc has >> both the tree level and the rtl level unroller. The tree one is even >> on at -O3. So if you tweak anything for one you have to affect both, >> otherwise the other may still do the wrong thing(tm). > > Tree level unrollers (cunrolli and cunroll) do complete unroll. At O2, > both of them are turned on, but gcc does not allow any code growth -- > which makes them pretty useless at O2 (very few loops qualify). The > default max complete peel iteration is also too low compared with both > icc and llvm. This needs to be tuned. 
> > David > >> >> For some other tweaks I looked into a shared cost model some time ago. >> May be still needed. >> >> -Andi >> >> -- >> ak@linux.intel.com -- Speaking for myself only
Index: loop-unroll.c =================================================================== --- loop-unroll.c (revision 186783) +++ loop-unroll.c (working copy) @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc basic_block); static rtx get_expansion (struct var_to_expand *); +/* Determine whether LOOP contains call. */ +static bool +loop_has_call(struct loop *loop) +{ + basic_block *body, bb; + unsigned i; + rtx insn; + + body = get_loop_body (loop); + for (i = 0; i < loop->num_nodes; i++) + { + bb = body[i]; + + FOR_BB_INSNS (bb, insn) + { + if (CALL_P (insn)) + { + free (body); + return true; + } + } + } + free (body); + return false; +} + +/* Determine whether LOOP contains floating-point computation. */ +static bool +loop_has_FP_comp(struct loop *loop) +{ + rtx set, dest; + basic_block *body, bb; + unsigned i; + rtx insn; + + body = get_loop_body (loop); + for (i = 0; i < loop->num_nodes; i++) + { + bb = body[i]; + + FOR_BB_INSNS (bb, insn) + { + set = single_set (insn); + if (!set) + continue; + + dest = SET_DEST (set); + if (FLOAT_MODE_P (GET_MODE (dest))) + { + free (body); + return true; + } + } + } + free (body); + return false; +} + +/* Compute the number of branches in LOOP, weighted by execution counts. */ +static float +compute_weighted_branches(struct loop *loop) +{ + int header_count = loop->header->count; + unsigned i; + float n; + basic_block * body; + + /* If no profile feedback data exists, don't limit unrolling */ + if (header_count == 0) + return 0.0; + + gcc_assert (loop->latch != EXIT_BLOCK_PTR); + + body = get_loop_body (loop); + n = 0.0; + for (i = 0; i < loop->num_nodes; i++) + { + if (EDGE_COUNT (body[i]->succs) >= 2) + { + /* If this block is executed less frequently than the header (loop + entry), then it is weighted based on the ratio of times it is + executed compared to the header. 
*/ + if (body[i]->count < header_count) + n += ((float)body[i]->count)/header_count; + + /* When it is executed more frequently than the header (i.e. it is + in a nested inner loop), simply weight the branch at 1.0. */ + else + n += 1.0; + } + } + free (body); + + return n; +} + +/* Compute the maximum number of times LOOP can be unrolled without exceeding + a branch budget, which can increase branch mispredictions. The number of + branches is computed by weighting each branch with its expected execution + probability through the loop based on profile data. If no profile feedback + data exists, simply return the current NUNROLL factor. */ +static unsigned +max_unroll_with_branches(struct loop *loop, unsigned nunroll) +{ + struct loop *outer; + struct niter_desc *outer_desc; + int outer_niters = 1; + float weighted_outer_branches = 0.0; + float weighted_num_branches = compute_weighted_branches (loop); + + /* If there was no profile feedback data, weighted_num_branches will be 0.0 + and we won't limit unrolling. If the weighted_num_branches is at most 1.0, + also don't limit unrolling as the back-edge branch will not be duplicated. */ + if (weighted_num_branches <= 1.0) + return nunroll; + + /* Walk up the loop tree until we find a hot outer loop in which the current + loop is nested. At that point we will compute the number of times the + current loop can be unrolled based on the number of branches in the hot + outer loop. */ + outer = loop_outer(loop); + /* The loop structure contains a fake outermost loop, so this should always + be non-NULL for our current loop. */ + gcc_assert (outer); + /* Detect if this is the fake outermost loop (at which point we are done) + by checking its outer loop. 
*/ + while (loop_outer(outer)) + { + outer_desc = get_simple_loop_desc (outer); + + if (outer_desc->const_iter) + outer_niters *= outer_desc->niter; + else if (outer->header->count) + outer_niters *= expected_loop_iterations (outer); + + weighted_outer_branches = compute_weighted_branches (outer); + + /* Should have been checked by caller. */ + gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1); + + /* If the outer loop has enough iterations to be considered hot, then + we can stop our upwards loop tree traversal and examine the current + outer loop. */ + if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) + { + /* Assume that any call will cause the branch budget to be exceeded, + and that we can't unroll the current loop without increasing + mispredicts. */ + if (loop_has_call(outer)) + return 0; + + /* Otherwise, compute the maximum number of times current loop can be + unrolled without exceeding our branch budget. First we subtract + off the outer loop's weighted branch count from the budget. Note + that this includes the branches in the current loop. This yields + the number of branches left in the budget for the unrolled copies. + We divide this by the number of branches in the current loop that + must be duplicated when we unroll, which is the total weighted + number of branches minus the back-edge branch. This yields the + number of new loop body copies that can be created by unrolling + without exceeding the budget, to which we add 1 to get the unroll + factor. */ + return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) - + weighted_outer_branches)/(weighted_num_branches - 1) + 1; + } + outer = loop_outer(outer); + } + + /* The current loop is not enclosed by a hot enough outer loop in this + procedure, since the hot outer loop is inter-procedural, assume that + it already contains a significant number of branches, so don't unroll. */ + return 0; +} + /* Unroll and/or peel (depending on FLAGS) LOOPS. 
*/ void unroll_and_peel_loops (int flags) @@ -522,6 +696,7 @@ static void decide_unroll_constant_iterations (struct loop *loop, int flags) { unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i; + unsigned nunroll_branches; struct niter_desc *desc; if (!(flags & UAP_UNROLL)) @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo return; } + /* Be careful when unrolling loops with branches inside -- it can increase + the number of mispredicts. Ignore loops with FP computation as these + tend to benefit much more consistently from unrolling. */ + if (num_loop_branches (loop) > 1 + && loop_has_FP_comp(loop) + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 + && desc->niter < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) + { + nunroll_branches = max_unroll_with_branches(loop, nunroll); + if (nunroll > nunroll_branches) + nunroll = nunroll_branches; + if (nunroll <= 1) + { + if (dump_file) + fprintf (dump_file, ";; Not unrolling, contains branches\n"); + return; + } + } + /* Check whether the loop rolls enough to consider. */ if (desc->niter < 2 * nunroll) { @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop static void decide_unroll_runtime_iterations (struct loop *loop, int flags) { - unsigned nunroll, nunroll_by_av, i; + unsigned nunroll, nunroll_by_av, nunroll_branches, i; struct niter_desc *desc; if (!(flags & UAP_UNROLL)) @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo return; } + /* Be careful when unrolling loops with branches inside -- it can increase + the number of mispredicts. Ignore loops with FP computation as these + tend to benefit much more consistently from unrolling. 
*/ + if (num_loop_branches (loop) > 1 + && loop_has_FP_comp(loop) + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 + && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) + { + nunroll_branches = max_unroll_with_branches(loop, nunroll); + if (nunroll > nunroll_branches) + nunroll = nunroll_branches; + if (nunroll <= 1) + { + if (dump_file) + fprintf (dump_file, ";; Not unrolling, contains branches\n"); + return; + } + } + /* If we have profile feedback, check whether the loop rolls. */ if ((loop->header->count && expected_loop_iterations (loop) < 2 * nunroll) Index: params.def =================================================================== --- params.def (revision 186783) +++ params.def (working copy) @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS, "The maximum depth of a loop nest we completely peel", 8, 0, 0) +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES, + "min-iter-unroll-with-branches", + "Minimum iteration count to ignore branch effects when unrolling", + 50, 0, 0) + +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET, + "unroll-outer-loop-branch-budget", + "Maximum number of branches allowed in hot outer loop region after unroll", + 25, 0, 0) + /* The maximum number of insns of an unswitched loop. */ DEFPARAM(PARAM_MAX_UNSWITCH_INSNS, "max-unswitch-insns",