diff mbox

Enable loop peeling at -O3

Message ID 20160528150444.GB5812@kam.mff.cuni.cz
State New
Headers show

Commit Message

Jan Hubicka May 28, 2016, 3:04 p.m. UTC
Hello,
thanks for feedback. I updated the patch and also noticed that -fpeel-all-loops gives up when
upper bound is known but it is large and when the max-peel-insns is too small to permit
peeling max-peel-times.  This patch also updates  pr61743-2.c which are now peeled before
we manage to propagate the proper loop bound.

Bootstrapped/regtested x86_64-linux. OK?

Honza

	* common.opt (flag_peel_all_loops): New option.
	* doc/invoke.texi: (-fpeel-loops): Update documentation.
	(-fpeel-all-loops): Document.
	* opts.c (default_options): Add OPT_fpeel_loops to -O3+.
	* toplev.c (process_options): flag_peel_all_loops implies
	flag_peel_loops.
	* tree-ssa-lop-ivcanon.c (try_peel_loop): Update comment; handle
	-fpeel-all-loops, use likely estimates.

	* gcc.dg/tree-ssa/peel1.c: New testcase.
	* gcc.dg/tree-ssa/peel2.c: New testcase.
	* gcc.dg/tree-ssa/pr61743-1.c: Pass -fno-peel-loops.
	* gcc.dg/tree-ssa/pr61743-2.c: Pass -fno-peel-loops.

Comments

Richard Biener May 30, 2016, 9:24 a.m. UTC | #1
On Sat, 28 May 2016, Jan Hubicka wrote:

> Hello,
> thanks for feedback. I updated the patch and also noticed that -fpeel-all-loops gives up when
> upper bound is known but it is large and when the max-peel-insns is too small to permit
> peeling max-peel-times.  This patch also updates  pr61743-2.c which are now peeled before
> we manage to propagate the proper loop bound.
> 
> Bootstrapped/regtested x86_64-linux. OK?

Humm, so why add -fpeel-all-loops?  I don't think -funroll-all-loops
is useful.

Did you check code-size/compile-time/performance effects of enabling
-fpeel-loops at -O3 for, say, SPEC CPU 2006?

Thanks,
Richard.

> Honza
> 
> 	* common.opt (flag_peel_all_loops): New option.
> 	* doc/invoke.texi: (-fpeel-loops): Update documentation.
> 	(-fpeel-all-loops): Document.
> 	* opts.c (default_options): Add OPT_fpeel_loops to -O3+.
> 	* toplev.c (process_options): flag_peel_all_loops implies
> 	flag_peel_loops.
> 	* tree-ssa-lop-ivcanon.c (try_peel_loop): Update comment; handle
> 	-fpeel-all-loops, use likely estimates.
> 
> 	* gcc.dg/tree-ssa/peel1.c: New testcase.
> 	* gcc.dg/tree-ssa/peel2.c: New testcase.
> 	* gcc.dg/tree-ssa/pr61743-1.c: Pass -fno-peel-loops.
> 	* gcc.dg/tree-ssa/pr61743-2.c: Pass -fno-peel-loops.
> Index: common.opt
> ===================================================================
> --- common.opt	(revision 236815)
> +++ common.opt	(working copy)
> @@ -1840,6 +1840,10 @@ fpeel-loops
>  Common Report Var(flag_peel_loops) Optimization
>  Perform loop peeling.
>  
> +fpeel-all-loops
> +Common Report Var(flag_peel_all_loops) Optimization
> +Perform loop peeling of all loops.
> +
>  fpeephole
>  Common Report Var(flag_no_peephole,0) Optimization
>  Enable machine specific peephole optimizations.
> Index: doc/invoke.texi
> ===================================================================
> --- doc/invoke.texi	(revision 236815)
> +++ doc/invoke.texi	(working copy)
> @@ -375,7 +375,7 @@ Objective-C and Objective-C++ Dialects}.
>  -fno-sched-interblock -fno-sched-spec -fno-signed-zeros @gol
>  -fno-toplevel-reorder -fno-trapping-math -fno-zero-initialized-in-bss @gol
>  -fomit-frame-pointer -foptimize-sibling-calls @gol
> --fpartial-inlining -fpeel-loops -fpredictive-commoning @gol
> +-fpartial-inlining -fpeel-loops -fpeel-all-loops -fpredictive-commoning @gol
>  -fprefetch-loop-arrays @gol
>  -fprofile-correction @gol
>  -fprofile-use -fprofile-use=@var{path} -fprofile-values @gol
> @@ -6338,7 +6338,8 @@ by @option{-O2} and also turns on the @o
>  @option{-fgcse-after-reload}, @option{-ftree-loop-vectorize},
>  @option{-ftree-loop-distribute-patterns}, @option{-fsplit-paths}
>  @option{-ftree-slp-vectorize}, @option{-fvect-cost-model},
> -@option{-ftree-partial-pre} and @option{-fipa-cp-clone} options.
> +@option{-ftree-partial-pre}, @option{-fpeel-loops}
> +and @option{-fipa-cp-clone} options.
>  
>  @item -O0
>  @opindex O0
> @@ -8593,7 +8594,7 @@ data about values of expressions in the
>  With @option{-fbranch-probabilities}, it reads back the data gathered
>  from profiling values of expressions for usage in optimizations.
>  
> -Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
> +Enabled with @option{-fprofile-generate} and/or @option{-fprofile-use}.
>  
>  @item -fprofile-reorder-functions
>  @opindex fprofile-reorder-functions
> @@ -8661,10 +8662,17 @@ the loop is entered.  This usually makes
>  @item -fpeel-loops
>  @opindex fpeel-loops
>  Peels loops for which there is enough information that they do not
> -roll much (from profile feedback).  It also turns on complete loop peeling
> -(i.e.@: complete removal of loops with small constant number of iterations).
> -
> -Enabled with @option{-fprofile-use}.
> +roll much (from profile feedback or static analysis).  It also turns on
> +complete loop peeling (i.e.@: complete removal of loops with small constant
> +number of iterations).
> +
> +Enabled with @option{-O3} and @option{-fprofile-use}.
> +
> +@item -fpeel-all-loops
> +@opindex fpeel-all-loops
> +Peel all loops, even if their number of iterations is uncertain when
> +the loop is entered.  For loops with large number of iterations this leads
> +to wasted code size.
>  
>  @item -fmove-loop-invariants
>  @opindex fmove-loop-invariants
> Index: opts.c
> ===================================================================
> --- opts.c	(revision 236815)
> +++ opts.c	(working copy)
> @@ -535,6 +535,7 @@ static const struct default_options defa
>      { OPT_LEVELS_3_PLUS, OPT_fvect_cost_model_, NULL, VECT_COST_MODEL_DYNAMIC },
>      { OPT_LEVELS_3_PLUS, OPT_fipa_cp_clone, NULL, 1 },
>      { OPT_LEVELS_3_PLUS, OPT_ftree_partial_pre, NULL, 1 },
> +    { OPT_LEVELS_3_PLUS, OPT_fpeel_loops, NULL, 1 },
>  
>      /* -Ofast adds optimizations to -O3.  */
>      { OPT_LEVELS_FAST, OPT_ffast_math, NULL, 1 },
> Index: testsuite/gcc.dg/tree-ssa/peel1.c
> ===================================================================
> --- testsuite/gcc.dg/tree-ssa/peel1.c	(revision 0)
> +++ testsuite/gcc.dg/tree-ssa/peel1.c	(working copy)
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-cunroll-details" } */
> +struct foo {int b; int a[3];} foo;
> +void add(struct foo *a,int l)
> +{
> +  int i;
> +  for (i=0;i<l;i++)
> +    a->a[i]++;
> +}
> +/* { dg-final { scan-tree-dump "Loop 1 likely iterates at most 3 times." "cunroll"} } */
> +/* { dg-final { scan-tree-dump "Peeled loop 1, 4 times." "cunroll"} } */
> Index: testsuite/gcc.dg/tree-ssa/peel2.c
> ===================================================================
> --- testsuite/gcc.dg/tree-ssa/peel2.c	(revision 0)
> +++ testsuite/gcc.dg/tree-ssa/peel2.c	(working copy)
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fpeel-all-loops -fdump-tree-cunroll-details --param max-peel-times=16 --param max-peeled-insns=100" } */
> +void add(int *a,int l)
> +{
> +  int i;
> +  for (i=0;i<l;i++)
> +    a[i]++;
> +}
> +/* { dg-final { scan-tree-dump "Peeled loop 1, 16 times." "cunroll"} } */
> Index: testsuite/gcc.dg/tree-ssa/pr61743-1.c
> ===================================================================
> --- testsuite/gcc.dg/tree-ssa/pr61743-1.c	(revision 236815)
> +++ testsuite/gcc.dg/tree-ssa/pr61743-1.c	(working copy)
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details" } */
> +/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details -fno-peel-loops" } */
>  
>  #define N 8
>  #define M 14
> Index: testsuite/gcc.dg/tree-ssa/pr61743-2.c
> ===================================================================
> --- testsuite/gcc.dg/tree-ssa/pr61743-2.c	(revision 236815)
> +++ testsuite/gcc.dg/tree-ssa/pr61743-2.c	(working copy)
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details" } */
> +/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details -fno-peel-loops" } */
>  
>  #define N 8
>  #define M 14
> Index: toplev.c
> ===================================================================
> --- toplev.c	(revision 236815)
> +++ toplev.c	(working copy)
> @@ -1294,6 +1294,9 @@ process_options (void)
>    if (flag_unroll_all_loops)
>      flag_unroll_loops = 1;
>  
> +  if (flag_peel_all_loops)
> +    flag_peel_loops = 1;
> +
>    /* web and rename-registers help when run after loop unrolling.  */
>    if (flag_web == AUTODETECT_VALUE)
>      flag_web = flag_unroll_loops || flag_peel_loops;
> Index: tree-ssa-loop-ivcanon.c
> ===================================================================
> --- tree-ssa-loop-ivcanon.c	(revision 236816)
> +++ tree-ssa-loop-ivcanon.c	(working copy)
> @@ -951,7 +951,9 @@ try_peel_loop (struct loop *loop,
>    if (!flag_peel_loops || PARAM_VALUE (PARAM_MAX_PEEL_TIMES) <= 0)
>      return false;
>  
> -  /* Peel only innermost loops.  */
> +  /* Peel only innermost loops.
> +     While the code is perfectly capable of peeling non-innermost loops,
> +     the heuristics would probably need some improvements. */
>    if (loop->inner)
>      {
>        if (dump_file)
> @@ -970,11 +972,22 @@ try_peel_loop (struct loop *loop,
>    npeel = estimated_loop_iterations_int (loop);
>    if (npeel < 0)
>      {
> +      npeel = likely_max_loop_iterations_int (loop);
> +      if (flag_peel_all_loops
> +	  && npeel >= PARAM_VALUE (PARAM_MAX_PEEL_TIMES) - 1)
> +	npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES) - 1;
> +    }
> +  if (npeel < 0 && flag_peel_all_loops)
> +    npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES) - 1;
> +  if (npeel < 0)
> +    {
>        if (dump_file)
>          fprintf (dump_file, "Not peeling: number of iterations is not "
>  	         "estimated\n");
>        return false;
>      }
> +  gcc_assert (maxiter < 0 || maxiter >= npeel
> +	      || npeel <= max_loop_iterations_int (loop));
>    if (maxiter >= 0 && maxiter <= npeel)
>      {
>        if (dump_file)
> @@ -998,8 +1011,25 @@ try_peel_loop (struct loop *loop,
>    /* Check peeled loops size.  */
>    tree_estimate_loop_size (loop, exit, NULL, &size,
>  			   PARAM_VALUE (PARAM_MAX_PEELED_INSNS));
> -  if ((peeled_size = estimated_peeled_sequence_size (&size, (int) npeel))
> -      > PARAM_VALUE (PARAM_MAX_PEELED_INSNS))
> +  peeled_size = estimated_peeled_sequence_size (&size, (int) npeel);
> +
> +  /* When asked to peel all loops, try to reduce number of peeled copies to
> +     fit in the size bound.  */
> +  while (flag_peel_all_loops
> +	 && peeled_size > PARAM_VALUE (PARAM_MAX_PEELED_INSNS)
> +	 && npeel > 1)
> +    {
> +      /* Number of peeled copies is capped by PARAM_MAX_PEEL_TIMES and thus this
> +	 loop will converge quickly.
> +	 Just be sure we won't get a compile time hog when user asks for
> +	 insanely many copies by --param parameter.  */
> +      if (npeel > 256)
> +	npeel /= 2;
> +      else
> +	npeel--;
> +      peeled_size = estimated_peeled_sequence_size (&size, (int) npeel);
> +    }
> +  if (peeled_size > PARAM_VALUE (PARAM_MAX_PEELED_INSNS))
>      {
>        if (dump_file)
>          fprintf (dump_file, "Not peeling: peeled sequence size is too large "
> @@ -1112,8 +1142,8 @@ canonicalize_loop_induction_variables (s
>    if (dump_file && (dump_flags & TDF_DETAILS)
>        && likely_max_loop_iterations_int (loop) >= 0)
>      {
> -      fprintf (dump_file, "Loop likely %d iterates at most %i times.\n", loop->num,
> -	       (int)likely_max_loop_iterations_int (loop));
> +      fprintf (dump_file, "Loop %d likely iterates at most %i times.\n",
> +	       loop->num, (int)likely_max_loop_iterations_int (loop));
>      }
>  
>    /* Remove exits that are known to be never taken based on loop bound.
> 
>
Jan Hubicka May 30, 2016, 11:07 a.m. UTC | #2
> On Sat, 28 May 2016, Jan Hubicka wrote:
> 
> > Hello,
> > thanks for feedback. I updated the patch and also noticed that -fpeel-all-loops gives up when
> > upper bound is known but it is large and when the max-peel-insns is too small to permit
> > peeling max-peel-times.  This patch also updates  pr61743-2.c which are now peeled before
> > we manage to propagate the proper loop bound.
> > 
> > Bootstrapped/regtested x86_64-linux. OK?
> 
> Humm, so why add -fpeel-all-loops?  I don't think -funroll-all-loops
> is useful.
It is mostly there to trigger the transform to see if it is useful. Not something
you want to enable by default.

-fpeel-all-loops helps when you know your code have internal loops that
iterate few times.  I.e. one can get good speedup for the sudoku solver benchmark
because it has loops that iterate either once or 10 times.

http://www.ucw.cz/~hubicka/papers/amd64/node4.html also claims that -funroll-all-loops
improves specint by 2.5%, while -funroll-loops by 2.23%, so it seemed somewhat useful
back then.
> 
> Did you check code-size/compile-time/performance effects of enabling
> -fpeel-loops at -O3 for, say, SPEC CPU 2006?

Martin Liska run it on the SPEC2006 and v6 (not with latest fixes to
heuristics).  Without FDO the loop peeling triggers only for loops where we
have likely_upper_bound != upper_bound. We do not predict that too often (we
may in future as there is room for improvement in niter).  The code size effect
was +0.9% for SPECint and +2.2% on SPECfp.  The off-noise improvements were vrp
94.5->89.7 and John the ripper 106.9->100 (less than 0.1% in geomavg).  I have
cut down the code size effects since that (there was a bug that made us to peel
without control when maxiter overflowed), but I did not re-run full specs since
then.

My motivation was mainly to reduce number of optimizations that are not good
enough to be enabled by default and also observation that it helps to some
benchmarks.

We can re-run benchmarks with current patch after fixing the profile update
issues I will send shortly.  No fine-tuning of the parameters was done and I
guess they are still set the way they was set for RTL peeling in 2003.

Honza
Richard Biener May 30, 2016, 11:28 a.m. UTC | #3
On Mon, 30 May 2016, Jan Hubicka wrote:

> > On Sat, 28 May 2016, Jan Hubicka wrote:
> > 
> > > Hello,
> > > thanks for feedback. I updated the patch and also noticed that -fpeel-all-loops gives up when
> > > upper bound is known but it is large and when the max-peel-insns is too small to permit
> > > peeling max-peel-times.  This patch also updates  pr61743-2.c which are now peeled before
> > > we manage to propagate the proper loop bound.
> > > 
> > > Bootstrapped/regtested x86_64-linux. OK?
> > 
> > Humm, so why add -fpeel-all-loops?  I don't think -funroll-all-loops
> > is useful.
> It is mostly there to trigger the transform to see if it is useful. Not something
> you want to enable by default.
> 
> -fpeel-all-loops helps when you know your code have internal loops that
> iterate few times.  I.e. one can get good speedup for the sudoku solver benchmark
> because it has loops that iterate either once or 10 times.
> 
> http://www.ucw.cz/~hubicka/papers/amd64/node4.html also claims that -funroll-all-loops
> improves specint by 2.5%, while -funroll-loops by 2.23%, so it seemed somewhat useful
> back then.

Still I'm hesitant to introduce new user-visible options.

> > Did you check code-size/compile-time/performance effects of enabling
> > -fpeel-loops at -O3 for, say, SPEC CPU 2006?
> 
> Martin Liska run it on the SPEC2006 and v6 (not with latest fixes to
> heuristics).  Without FDO the loop peeling triggers only for loops where we
> have likely_upper_bound != upper_bound. We do not predict that too often (we
> may in future as there is room for improvement in niter).  The code size effect
> was +0.9% for SPECint and +2.2% on SPECfp.  The off-noise improvements were vrp
> 94.5->89.7 and John the ripper 106.9->100 (less than 0.1% in geomavg).  I have
> cut down the code size effects since that (there was a bug that made us to peel
> without control when maxiter overflowed), but I did not re-run full specs since
> then.
> 
> My motivation was mainly to reduce number of optimizations that are not good
> enough to be enabled by default and also observation that it helps to some
> benchmarks.
> 
> We can re-run benchmarks with current patch after fixing the profile update
> issues I will send shortly.  No fine-tuning of the parameters was done and I
> guess they are still set the way they was set for RTL peeling in 2003.

Sounds good.

The patch is ok if you omit the new flag for now.

Thanks,
Richard.
Jan Hubicka May 30, 2016, 11:39 a.m. UTC | #4
> 
> Sounds good.
> 
> The patch is ok if you omit the new flag for now.

Ok, I will omit that flag for now.  Thanks!

Honza
> 
> Thanks,
> Richard.
diff mbox

Patch

Index: common.opt
===================================================================
--- common.opt	(revision 236815)
+++ common.opt	(working copy)
@@ -1840,6 +1840,10 @@  fpeel-loops
 Common Report Var(flag_peel_loops) Optimization
 Perform loop peeling.
 
+fpeel-all-loops
+Common Report Var(flag_peel_all_loops) Optimization
+Perform loop peeling of all loops.
+
 fpeephole
 Common Report Var(flag_no_peephole,0) Optimization
 Enable machine specific peephole optimizations.
Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi	(revision 236815)
+++ doc/invoke.texi	(working copy)
@@ -375,7 +375,7 @@  Objective-C and Objective-C++ Dialects}.
 -fno-sched-interblock -fno-sched-spec -fno-signed-zeros @gol
 -fno-toplevel-reorder -fno-trapping-math -fno-zero-initialized-in-bss @gol
 -fomit-frame-pointer -foptimize-sibling-calls @gol
--fpartial-inlining -fpeel-loops -fpredictive-commoning @gol
+-fpartial-inlining -fpeel-loops -fpeel-all-loops -fpredictive-commoning @gol
 -fprefetch-loop-arrays @gol
 -fprofile-correction @gol
 -fprofile-use -fprofile-use=@var{path} -fprofile-values @gol
@@ -6338,7 +6338,8 @@  by @option{-O2} and also turns on the @o
 @option{-fgcse-after-reload}, @option{-ftree-loop-vectorize},
 @option{-ftree-loop-distribute-patterns}, @option{-fsplit-paths}
 @option{-ftree-slp-vectorize}, @option{-fvect-cost-model},
-@option{-ftree-partial-pre} and @option{-fipa-cp-clone} options.
+@option{-ftree-partial-pre}, @option{-fpeel-loops}
+and @option{-fipa-cp-clone} options.
 
 @item -O0
 @opindex O0
@@ -8593,7 +8594,7 @@  data about values of expressions in the
 With @option{-fbranch-probabilities}, it reads back the data gathered
 from profiling values of expressions for usage in optimizations.
 
-Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
+Enabled with @option{-fprofile-generate} and/or @option{-fprofile-use}.
 
 @item -fprofile-reorder-functions
 @opindex fprofile-reorder-functions
@@ -8661,10 +8662,17 @@  the loop is entered.  This usually makes
 @item -fpeel-loops
 @opindex fpeel-loops
 Peels loops for which there is enough information that they do not
-roll much (from profile feedback).  It also turns on complete loop peeling
-(i.e.@: complete removal of loops with small constant number of iterations).
-
-Enabled with @option{-fprofile-use}.
+roll much (from profile feedback or static analysis).  It also turns on
+complete loop peeling (i.e.@: complete removal of loops with small constant
+number of iterations).
+
+Enabled with @option{-O3} and @option{-fprofile-use}.
+
+@item -fpeel-all-loops
+@opindex fpeel-all-loops
+Peel all loops, even if their number of iterations is uncertain when
+the loop is entered.  For loops with large number of iterations this leads
+to wasted code size.
 
 @item -fmove-loop-invariants
 @opindex fmove-loop-invariants
Index: opts.c
===================================================================
--- opts.c	(revision 236815)
+++ opts.c	(working copy)
@@ -535,6 +535,7 @@  static const struct default_options defa
     { OPT_LEVELS_3_PLUS, OPT_fvect_cost_model_, NULL, VECT_COST_MODEL_DYNAMIC },
     { OPT_LEVELS_3_PLUS, OPT_fipa_cp_clone, NULL, 1 },
     { OPT_LEVELS_3_PLUS, OPT_ftree_partial_pre, NULL, 1 },
+    { OPT_LEVELS_3_PLUS, OPT_fpeel_loops, NULL, 1 },
 
     /* -Ofast adds optimizations to -O3.  */
     { OPT_LEVELS_FAST, OPT_ffast_math, NULL, 1 },
Index: testsuite/gcc.dg/tree-ssa/peel1.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/peel1.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/peel1.c	(working copy)
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-cunroll-details" } */
+struct foo {int b; int a[3];} foo;
+void add(struct foo *a,int l)
+{
+  int i;
+  for (i=0;i<l;i++)
+    a->a[i]++;
+}
+/* { dg-final { scan-tree-dump "Loop 1 likely iterates at most 3 times." "cunroll"} } */
+/* { dg-final { scan-tree-dump "Peeled loop 1, 4 times." "cunroll"} } */
Index: testsuite/gcc.dg/tree-ssa/peel2.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/peel2.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/peel2.c	(working copy)
@@ -0,0 +1,9 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fpeel-all-loops -fdump-tree-cunroll-details --param max-peel-times=16 --param max-peeled-insns=100" } */
+void add(int *a,int l)
+{
+  int i;
+  for (i=0;i<l;i++)
+    a[i]++;
+}
+/* { dg-final { scan-tree-dump "Peeled loop 1, 16 times." "cunroll"} } */
Index: testsuite/gcc.dg/tree-ssa/pr61743-1.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr61743-1.c	(revision 236815)
+++ testsuite/gcc.dg/tree-ssa/pr61743-1.c	(working copy)
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details" } */
+/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details -fno-peel-loops" } */
 
 #define N 8
 #define M 14
Index: testsuite/gcc.dg/tree-ssa/pr61743-2.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr61743-2.c	(revision 236815)
+++ testsuite/gcc.dg/tree-ssa/pr61743-2.c	(working copy)
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details" } */
+/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details -fno-peel-loops" } */
 
 #define N 8
 #define M 14
Index: toplev.c
===================================================================
--- toplev.c	(revision 236815)
+++ toplev.c	(working copy)
@@ -1294,6 +1294,9 @@  process_options (void)
   if (flag_unroll_all_loops)
     flag_unroll_loops = 1;
 
+  if (flag_peel_all_loops)
+    flag_peel_loops = 1;
+
   /* web and rename-registers help when run after loop unrolling.  */
   if (flag_web == AUTODETECT_VALUE)
     flag_web = flag_unroll_loops || flag_peel_loops;
Index: tree-ssa-loop-ivcanon.c
===================================================================
--- tree-ssa-loop-ivcanon.c	(revision 236816)
+++ tree-ssa-loop-ivcanon.c	(working copy)
@@ -951,7 +951,9 @@  try_peel_loop (struct loop *loop,
   if (!flag_peel_loops || PARAM_VALUE (PARAM_MAX_PEEL_TIMES) <= 0)
     return false;
 
-  /* Peel only innermost loops.  */
+  /* Peel only innermost loops.
+     While the code is perfectly capable of peeling non-innermost loops,
+     the heuristics would probably need some improvements. */
   if (loop->inner)
     {
       if (dump_file)
@@ -970,11 +972,22 @@  try_peel_loop (struct loop *loop,
   npeel = estimated_loop_iterations_int (loop);
   if (npeel < 0)
     {
+      npeel = likely_max_loop_iterations_int (loop);
+      if (flag_peel_all_loops
+	  && npeel >= PARAM_VALUE (PARAM_MAX_PEEL_TIMES) - 1)
+	npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES) - 1;
+    }
+  if (npeel < 0 && flag_peel_all_loops)
+    npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES) - 1;
+  if (npeel < 0)
+    {
       if (dump_file)
         fprintf (dump_file, "Not peeling: number of iterations is not "
 	         "estimated\n");
       return false;
     }
+  gcc_assert (maxiter < 0 || maxiter >= npeel
+	      || npeel <= max_loop_iterations_int (loop));
   if (maxiter >= 0 && maxiter <= npeel)
     {
       if (dump_file)
@@ -998,8 +1011,25 @@  try_peel_loop (struct loop *loop,
   /* Check peeled loops size.  */
   tree_estimate_loop_size (loop, exit, NULL, &size,
 			   PARAM_VALUE (PARAM_MAX_PEELED_INSNS));
-  if ((peeled_size = estimated_peeled_sequence_size (&size, (int) npeel))
-      > PARAM_VALUE (PARAM_MAX_PEELED_INSNS))
+  peeled_size = estimated_peeled_sequence_size (&size, (int) npeel);
+
+  /* When asked to peel all loops, try to reduce number of peeled copies to
+     fit in the size bound.  */
+  while (flag_peel_all_loops
+	 && peeled_size > PARAM_VALUE (PARAM_MAX_PEELED_INSNS)
+	 && npeel > 1)
+    {
+      /* Number of peeled copies is capped by PARAM_MAX_PEEL_TIMES and thus this
+	 loop will converge quickly.
+	 Just be sure we won't get a compile time hog when user asks for
+	 insanely many copies by --param parameter.  */
+      if (npeel > 256)
+	npeel /= 2;
+      else
+	npeel--;
+      peeled_size = estimated_peeled_sequence_size (&size, (int) npeel);
+    }
+  if (peeled_size > PARAM_VALUE (PARAM_MAX_PEELED_INSNS))
     {
       if (dump_file)
         fprintf (dump_file, "Not peeling: peeled sequence size is too large "
@@ -1112,8 +1142,8 @@  canonicalize_loop_induction_variables (s
   if (dump_file && (dump_flags & TDF_DETAILS)
       && likely_max_loop_iterations_int (loop) >= 0)
     {
-      fprintf (dump_file, "Loop likely %d iterates at most %i times.\n", loop->num,
-	       (int)likely_max_loop_iterations_int (loop));
+      fprintf (dump_file, "Loop %d likely iterates at most %i times.\n",
+	       loop->num, (int)likely_max_loop_iterations_int (loop));
     }
 
   /* Remove exits that are known to be never taken based on loop bound.