Patchwork Implement simple peeling on tree level; remove RTL peeling code

login
register
mail settings
Submitter Jan Hubicka
Date Nov. 5, 2012, 4:46 p.m.
Message ID <20121105164616.GC30424@kam.mff.cuni.cz>
Download mbox | patch
Permalink /patch/197221/
State New
Headers show

Comments

Jan Hubicka - Nov. 5, 2012, 4:46 p.m.
Hi,
this patch removes the RTL loop peeling code and makes the tree-cunroll pass
also perform simple peeling (i.e. peeling done with profile feedback when the
loop is expected to iterate just a few times).

The motivation is
 1) I want to do some re-tuning of the tree peeling heuristics in the first
    week(s) of stage 3.  The work on making complete unrolling use all loop
    bound estimates actually enabled a lot of extra unrolling opportunities,
    showing that we have to do something about the code growth.
    The fact that both the RTL and the tree peeling code share the same
    --param bounds leads to an apples/oranges problem.
 2) peeling earlier is better because it enables optimization across the
    peeled sequence; the current RTL peeling pass runs too late, for
    historical reasons.
 3) there are some bugs in the RTL level peeling logic that do not seem worth
    fixing.
 4) we may eventually retire the need for ivcanon pass, if it really is around
    just to make RTL niter code work well.

In the last weeks I was looking into the cases where we still peeled on the RTL
level and not on the tree level, and I think I narrowed down the cases enough
that RTL level peeling can be declared obsolete.  The remaining issues are
mostly related to pass ordering, where we are not able to discover induction
variables early enough.  This happens for obscure testcases only and can be
better solved by another tree unrolling pass late in the queue if necessary,
but I do not see a need for it.

So I think I can say that RTL peeling has become obsolete.  There are two
issues:

One thing the RTL pass does is accumulator variable splitting; we do not have a
tree equivalent for this, but I think this should be handled more generally,
and the pass is disabled by default at all levels anyway.

Another thing I dislike about the current situation is that all the complete
unrolling/peeling code ended up illogically in tree-ssa-loop-ivcanon.  I think
it should be moved into a separate file and disentangled from the ivcanon pass,
but I think that should wait for the next stage1.

Bootstrapped/regtested x86_64-linux.  I am doing some additional benchmarking,
but with an earlier version of the patch there were no performance regressions
on SPEC.

Honza

	* loop-unroll.c: Update copyright and toplevel comment.
	(decide_unrolling_and_peeling): Rename to
	(decide_unrolling): ... this one.
	(peel_loops_completely): Remove.
	(decide_peel_simple): Remove.
	(decide_peel_once_rolling): Remove.
	(decide_peel_completely): Remove.
	(peel_loop_simple): Remove.
	(peel_loop_completely): Remove.
	(unroll_and_peel_loops): Rename to ...
	(unroll_loops): ... this one; handle only unrolling.
	* cfgloop.h (lpt_dec): Remove LPT_PEEL_COMPLETELY and
	LPT_PEEL_SIMPLE.
	(UAP_PEEL): Remove.
	(unroll_and_peel_loops): Remove.
	(unroll_loops): New.
	* passes.c (init_optimization_passes): Replace
	pass_rtl_unroll_and_peel_loops by pass_rtl_unroll_loops.
	* loop-init.c (gate_rtl_unroll_and_peel_loops,
	rtl_unroll_and_peel_loops): Rename to ...
	(gate_rtl_unroll_loops, rtl_unroll_loops): ... these; update.
	(pass_rtl_unroll_and_peel_loops): Rename to ...
	(pass_rtl_unroll_loops): ... this one.
	* tree-pass.h (pass_rtl_unroll_and_peel_loops): Remove.
	(pass_rtl_unroll_loops): New.
	* tree-ssa-loop-ivcanon.c: Update toplevel comment.
	(estimated_peeled_sequence_size, try_peel_loop): New.
	(canonicalize_loop_induction_variables): Update.

	* gcc.dg/tree-prof/peel-1.c: Update.
	* gcc.dg/tree-prof/unroll-1.c: Update.

Patch

Index: tree-pass.h
===================================================================
--- tree-pass.h	(revision 193160)
+++ tree-pass.h	(working copy)
@@ -417,7 +417,7 @@  extern struct rtl_opt_pass pass_loop2;
 extern struct rtl_opt_pass pass_rtl_loop_init;
 extern struct rtl_opt_pass pass_rtl_move_loop_invariants;
 extern struct rtl_opt_pass pass_rtl_unswitch;
-extern struct rtl_opt_pass pass_rtl_unroll_and_peel_loops;
+extern struct rtl_opt_pass pass_rtl_unroll_loops;
 extern struct rtl_opt_pass pass_rtl_doloop;
 extern struct rtl_opt_pass pass_rtl_loop_done;
 
Index: testsuite/gcc.dg/tree-prof/unroll-1.c
===================================================================
--- testsuite/gcc.dg/tree-prof/unroll-1.c	(revision 193160)
+++ testsuite/gcc.dg/tree-prof/unroll-1.c	(working copy)
@@ -20,5 +20,5 @@  main()
   return 0;
 }
 /* { dg-final-use { scan-rtl-dump "Considering unrolling loop with constant number of iterations" "loop2_unroll" } } */
-/* { dg-final-use { cleanup-rtl-dump "Not unrolling loop, doesn't roll" } } */
-/* { dg-options "-O3 -fdump-rtl-loop2_unroll -funroll-loops -fno-peel-loops" } */
+/* { dg-final-use { scan-rtl-dump "Not unrolling loop, doesn't roll" "loop2_unroll" } } */
+/* { dg-final-use { cleanup-rtl-dump "loop2_unroll" } } */
Index: testsuite/gcc.dg/tree-prof/peel-1.c
===================================================================
--- testsuite/gcc.dg/tree-prof/peel-1.c	(revision 193160)
+++ testsuite/gcc.dg/tree-prof/peel-1.c	(working copy)
@@ -1,4 +1,4 @@ 
-/* { dg-options "-O3 -fdump-rtl-loop2_unroll -fno-unroll-loops -fpeel-loops" } */
+/* { dg-options "-O3 -fdump-tree-cunroll-details -fno-unroll-loops -fpeel-loops" } */
 void abort();
 
 int a[1000];
@@ -19,7 +19,7 @@  main()
     t();
   return 0;
 }
-/* { dg-final-use { scan-rtl-dump "Considering simply peeling loop" "loop2_unroll" } } */
+/* { dg-final-use { scan-tree-dump "Peeled loop 1. 2 times." "cunrol" } } */
 /* In fact one peeling is enough; we however mispredict number of iterations of the loop
    at least until loop_ch is schedule ahead of profiling pass.  */
-/* { dg-final-use { cleanup-rtl-dump "Decided to simply peel the loop 2 times" } } */
+/* { dg-final-use { cleanup-tree-dump "cunroll" } } */
Index: loop-init.c
===================================================================
--- loop-init.c	(revision 193160)
+++ loop-init.c	(working copy)
@@ -359,13 +359,13 @@  struct rtl_opt_pass pass_rtl_unswitch =
 
 /* Loop unswitching for RTL.  */
 static bool
-gate_rtl_unroll_and_peel_loops (void)
+gate_rtl_unroll_loops (void)
 {
-  return (flag_peel_loops || flag_unroll_loops || flag_unroll_all_loops);
+  return (flag_unroll_loops || flag_unroll_all_loops);
 }
 
 static unsigned int
-rtl_unroll_and_peel_loops (void)
+rtl_unroll_loops (void)
 {
   if (number_of_loops () > 1)
     {
@@ -373,26 +373,24 @@  rtl_unroll_and_peel_loops (void)
       if (dump_file)
 	df_dump (dump_file);
 
-      if (flag_peel_loops)
-	flags |= UAP_PEEL;
       if (flag_unroll_loops)
 	flags |= UAP_UNROLL;
       if (flag_unroll_all_loops)
 	flags |= UAP_UNROLL_ALL;
 
-      unroll_and_peel_loops (flags);
+      unroll_loops (flags);
     }
   return 0;
 }
 
-struct rtl_opt_pass pass_rtl_unroll_and_peel_loops =
+struct rtl_opt_pass pass_rtl_unroll_loops =
 {
  {
   RTL_PASS,
   "loop2_unroll",                        /* name */
   OPTGROUP_LOOP,                        /* optinfo_flags */
-  gate_rtl_unroll_and_peel_loops,       /* gate */
-  rtl_unroll_and_peel_loops,            /* execute */
+  gate_rtl_unroll_loops,   	        /* gate */
+  rtl_unroll_loops,  	                /* execute */
   NULL,                                 /* sub */
   NULL,                                 /* next */
   0,                                    /* static_pass_number */
Index: tree-ssa-loop-ivcanon.c
===================================================================
--- tree-ssa-loop-ivcanon.c	(revision 193160)
+++ tree-ssa-loop-ivcanon.c	(working copy)
@@ -1,5 +1,5 @@ 
-/* Induction variable canonicalization.
-   Copyright (C) 2004, 2005, 2007, 2008, 2010
+/* Induction variable canonicalization and loop peeling.
+   Copyright (C) 2004, 2005, 2007, 2008, 2010, 2012
    Free Software Foundation, Inc.
 
 This file is part of GCC.
@@ -29,9 +29,12 @@  along with GCC; see the file COPYING3.  
    variables.  In that case the created optimization possibilities are likely
    to pay up.
 
-   Additionally in case we detect that it is beneficial to unroll the
-   loop completely, we do it right here to expose the optimization
-   possibilities to the following passes.  */
+   We also perform
+     - complette unrolling (or peeling) when the loops is rolling few enough
+       times
+     - simple peeling (i.e. copying few initial iterations prior the loop)
+       when number of iteration estimate is known (typically by the profile
+       info).  */
 
 #include "config.h"
 #include "system.h"
@@ -571,7 +574,7 @@  try_unroll_loop_completely (struct loop 
 			    enum unroll_level ul,
 			    HOST_WIDE_INT maxiter)
 {
-  unsigned HOST_WIDE_INT n_unroll, ninsns, max_unroll, unr_insns;
+  unsigned HOST_WIDE_INT n_unroll = 0, ninsns, max_unroll, unr_insns;
   gimple cond;
   struct loop_size size;
   bool n_unroll_found = false;
@@ -742,6 +745,132 @@  try_unroll_loop_completely (struct loop 
   return true;
 }
 
+/* Return number of instructions after peeling.  */
+static unsigned HOST_WIDE_INT
+estimated_peeled_sequence_size (struct loop_size *size,
+			        unsigned HOST_WIDE_INT npeel)
+{
+  return MAX (npeel * (HOST_WIDE_INT) (size->overall
+			     	       - size->eliminated_by_peeling), 1);
+}
+
+/* If the loop is expected to iterate N times and is
+   small enough, duplicate the loop body N+1 times before
+   the loop itself.  This way the hot path will never
+   enter the loop.  
+   Parameters are the same as for try_unroll_loops_completely */
+
+static bool
+try_peel_loop (struct loop *loop,
+	       edge exit, tree niter,
+	       HOST_WIDE_INT maxiter)
+{
+  int npeel;
+  struct loop_size size;
+  int peeled_size;
+  sbitmap wont_exit;
+  unsigned i;
+  VEC (edge, heap) *to_remove = NULL;
+  edge e;
+
+  /* If the iteration bound is known and large, then we can safely eliminate
+     the check in peeled copies.  */
+  if (TREE_CODE (niter) != INTEGER_CST)
+    exit = NULL;
+
+  if (!flag_peel_loops || PARAM_VALUE (PARAM_MAX_PEEL_TIMES) <= 0)
+    return false;
+
+  /* Peel only innermost loops.  */
+  if (loop->inner)
+    {
+      if (dump_file)
+        fprintf (dump_file, "Not peeling: outer loop\n");
+      return false;
+    }
+
+  if (!optimize_loop_for_speed_p (loop))
+    {
+      if (dump_file)
+        fprintf (dump_file, "Not peeling: cold loop\n");
+      return false;
+    }
+
+  /* Check if there is an estimate on the number of iterations.  */
+  npeel = estimated_loop_iterations_int (loop);
+  if (npeel < 0)
+    {
+      if (dump_file)
+        fprintf (dump_file, "Not peeling: number of iterations is not "
+	         "estimated\n");
+      return false;
+    }
+  if (maxiter >= 0 && maxiter <= npeel)
+    {
+      if (dump_file)
+        fprintf (dump_file, "Not peeling: upper bound is known so can "
+		 "unroll complettely\n");
+      return false;
+    }
+
+  /* We want to peel estimated number of iterations + 1 (so we never
+     enter the loop on quick path).  Check against PARAM_MAX_PEEL_TIMES
+     and be sure to avoid overflows.  */
+  if (npeel > PARAM_VALUE (PARAM_MAX_PEEL_TIMES) - 1)
+    {
+      if (dump_file)
+        fprintf (dump_file, "Not peeling: rolls too much "
+		 "(%i + 1 > --param max-peel-times)\n", npeel);
+      return false;
+    }
+  npeel++;
+
+  /* Check peeled loops size.  */
+  tree_estimate_loop_size (loop, exit, NULL, &size);
+  if ((peeled_size = estimated_peeled_sequence_size (&size, npeel))
+      > PARAM_VALUE (PARAM_MAX_PEELED_INSNS))
+    {
+      if (dump_file)
+        fprintf (dump_file, "Not peeling: peeled sequence size is too large "
+		 "(%i insns > --param max-peel-insns)", peeled_size);
+      return false;
+    }
+
+  /* Duplicate possibly eliminating the exits.  */
+  initialize_original_copy_tables ();
+  wont_exit = sbitmap_alloc (npeel + 1);
+  bitmap_ones (wont_exit);
+  bitmap_clear_bit (wont_exit, 0);
+  if (!gimple_duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
+					     npeel, wont_exit,
+					     exit, &to_remove,
+					     DLTHE_FLAG_UPDATE_FREQ
+					     | DLTHE_FLAG_COMPLETTE_PEEL))
+    {
+      free_original_copy_tables ();
+      free (wont_exit);
+      return false;
+    }
+  FOR_EACH_VEC_ELT (edge, to_remove, i, e)
+    {
+      bool ok = remove_path (e);
+      gcc_assert (ok);
+    }
+  free (wont_exit);
+  free_original_copy_tables ();
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Peeled loop %d, %i times.\n",
+	       loop->num, npeel);
+    }
+  if (loop->any_upper_bound)
+    loop->nb_iterations_upper_bound -= double_int::from_uhwi (npeel);
+  loop->nb_iterations_estimate = double_int_zero;
+  /* Make sure to mark loop cold so we do not try to peel it more.  */
+  scale_loop_profile (loop, 1, 0);
+  loop->header->count = 0;
+  return true;
+}
 /* Adds a canonical induction variable to LOOP if suitable.
    CREATE_IV is true if we may create a new iv.  UL determines
    which loops we are allowed to completely unroll.  If TRY_EVAL is true, we try
@@ -816,6 +945,9 @@  canonicalize_loop_induction_variables (s
       && exit && just_once_each_iteration_p (loop, exit->src))
     create_canonical_iv (loop, exit, niter);
 
+  if (ul == UL_ALL)
+    modified |= try_peel_loop (loop, exit, niter, maxiter);
+
   return modified;
 }
 
Index: loop-unroll.c
===================================================================
--- loop-unroll.c	(revision 193160)
+++ loop-unroll.c	(working copy)
@@ -1,5 +1,5 @@ 
-/* Loop unrolling and peeling.
-   Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010, 2011
+/* Loop unrolling.
+   Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010, 2011, 2012
    Free Software Foundation, Inc.
 
 This file is part of GCC.
@@ -34,8 +34,8 @@  along with GCC; see the file COPYING3.  
 #include "target.h"
 #include "dumpfile.h"
 
-/* This pass performs loop unrolling and peeling.  We only perform these
-   optimizations on innermost loops (with single exception) because
+/* This pass performs loop unrolling.  We only perform this
+   optimization on innermost loops (with single exception) because
    the impact on performance is greatest here, and we want to avoid
    unnecessary code size growth.  The gain is caused by greater sequentiality
    of code, better code to optimize for further passes and in some cases
@@ -44,12 +44,6 @@  along with GCC; see the file COPYING3.  
 
    What we do:
 
-   -- complete peeling of once-rolling loops; this is the above mentioned
-      exception, as this causes loop to be cancelled completely and
-      does not cause code growth
-   -- complete peeling of loops that roll (small) constant times.
-   -- simple peeling of first iterations of loops that do not roll much
-      (according to profile feedback)
    -- unrolling of loops that roll constant times; this is almost always
       win, as we get rid of exit condition tests.
    -- unrolling of loops that roll number of times that we can compute
@@ -62,7 +56,7 @@  along with GCC; see the file COPYING3.  
    appropriate function below.
 
    There is a lot of parameters (defined and described in params.def) that
-   control how much we unroll/peel.
+   control how much we unroll.
 
    ??? A great problem is that we don't have a good way how to determine
    how many times we should unroll the loop; the experiments I have made
@@ -120,17 +114,11 @@  struct opt_info
   basic_block loop_preheader;      /* The loop preheader basic block.  */
 };
 
-static void decide_unrolling_and_peeling (int);
-static void peel_loops_completely (int);
-static void decide_peel_simple (struct loop *, int);
-static void decide_peel_once_rolling (struct loop *, int);
-static void decide_peel_completely (struct loop *, int);
 static void decide_unroll_stupid (struct loop *, int);
 static void decide_unroll_constant_iterations (struct loop *, int);
 static void decide_unroll_runtime_iterations (struct loop *, int);
-static void peel_loop_simple (struct loop *);
-static void peel_loop_completely (struct loop *);
 static void unroll_loop_stupid (struct loop *);
+static void decide_unrolling (int);
 static void unroll_loop_constant_iterations (struct loop *);
 static void unroll_loop_runtime_iterations (struct loop *);
 static struct opt_info *analyze_insns_in_loop (struct loop *);
@@ -149,18 +137,14 @@  static rtx get_expansion (struct var_to_
 
 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 void
-unroll_and_peel_loops (int flags)
+unroll_loops (int flags)
 {
   struct loop *loop;
   bool check;
   loop_iterator li;
 
-  /* First perform complete loop peeling (it is almost surely a win,
-     and affects parameters for further decision a lot).  */
-  peel_loops_completely (flags);
-
-  /* Now decide rest of unrolling and peeling.  */
-  decide_unrolling_and_peeling (flags);
+  /* Now decide rest of unrolling.  */
+  decide_unrolling (flags);
 
   /* Scan the loops, inner ones first.  */
   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
@@ -169,12 +153,6 @@  unroll_and_peel_loops (int flags)
       /* And perform the appropriate transformations.  */
       switch (loop->lpt_decision.decision)
 	{
-	case LPT_PEEL_COMPLETELY:
-	  /* Already done.  */
-	  gcc_unreachable ();
-	case LPT_PEEL_SIMPLE:
-	  peel_loop_simple (loop);
-	  break;
 	case LPT_UNROLL_CONSTANT:
 	  unroll_loop_constant_iterations (loop);
 	  break;
@@ -209,55 +187,27 @@  loop_exit_at_end_p (struct loop *loop)
   struct niter_desc *desc = get_simple_loop_desc (loop);
   rtx insn;
 
+  /* We should never have conditional in latch block.  */
+  gcc_assert (desc->in_edge->dest != loop->header);
+
   if (desc->in_edge->dest != loop->latch)
     return false;
 
-  /* Check that the latch is empty.  */
+  /* Check that the latch is empty.  
+     forwarder_block_p will not help here since it protects
+     loop latches.  */
   FOR_BB_INSNS (loop->latch, insn)
     {
-      if (NONDEBUG_INSN_P (insn))
+      if (INSN_P (insn) && active_insn_p (insn))
 	return false;
     }
 
   return true;
 }
 
-/* Depending on FLAGS, check whether to peel loops completely and do so.  */
+/* Decide whether unroll (depending on FLAGS) and how much.  */
 static void
-peel_loops_completely (int flags)
-{
-  struct loop *loop;
-  loop_iterator li;
-
-  /* Scan the loops, the inner ones first.  */
-  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
-    {
-      loop->lpt_decision.decision = LPT_NONE;
-
-      if (dump_file)
-	fprintf (dump_file,
-		 "\n;; *** Considering loop %d for complete peeling ***\n",
-		 loop->num);
-
-      loop->ninsns = num_loop_insns (loop);
-
-      decide_peel_once_rolling (loop, flags);
-      if (loop->lpt_decision.decision == LPT_NONE)
-	decide_peel_completely (loop, flags);
-
-      if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
-	{
-	  peel_loop_completely (loop);
-#ifdef ENABLE_CHECKING
-	  verify_loop_structure ();
-#endif
-	}
-    }
-}
-
-/* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
-static void
-decide_unrolling_and_peeling (int flags)
+decide_unrolling (int flags)
 {
   struct loop *loop;
   loop_iterator li;
@@ -270,7 +220,7 @@  decide_unrolling_and_peeling (int flags)
       if (dump_file)
 	fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);
 
-      /* Do not peel cold areas.  */
+      /* Do not unroll cold areas.  */
       if (optimize_loop_for_size_p (loop))
 	{
 	  if (dump_file)
@@ -306,211 +256,9 @@  decide_unrolling_and_peeling (int flags)
 	decide_unroll_runtime_iterations (loop, flags);
       if (loop->lpt_decision.decision == LPT_NONE)
 	decide_unroll_stupid (loop, flags);
-      if (loop->lpt_decision.decision == LPT_NONE)
-	decide_peel_simple (loop, flags);
     }
 }
 
-/* Decide whether the LOOP is once rolling and suitable for complete
-   peeling.  */
-static void
-decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
-{
-  struct niter_desc *desc;
-
-  if (dump_file)
-    fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
-
-  /* Is the loop small enough?  */
-  if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
-    {
-      if (dump_file)
-	fprintf (dump_file, ";; Not considering loop, is too big\n");
-      return;
-    }
-
-  /* Check for simple loops.  */
-  desc = get_simple_loop_desc (loop);
-
-  /* Check number of iterations.  */
-  if (!desc->simple_p
-      || desc->assumptions
-      || desc->infinite
-      || !desc->const_iter
-      || (desc->niter != 0
-	  && max_loop_iterations_int (loop) != 0))
-    {
-      if (dump_file)
-	fprintf (dump_file,
-		 ";; Unable to prove that the loop rolls exactly once\n");
-      return;
-    }
-
-  /* Success.  */
-  if (dump_file)
-    fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
-  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
-}
-
-/* Decide whether the LOOP is suitable for complete peeling.  */
-static void
-decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
-{
-  unsigned npeel;
-  struct niter_desc *desc;
-
-  if (dump_file)
-    fprintf (dump_file, "\n;; Considering peeling completely\n");
-
-  /* Skip non-innermost loops.  */
-  if (loop->inner)
-    {
-      if (dump_file)
-	fprintf (dump_file, ";; Not considering loop, is not innermost\n");
-      return;
-    }
-
-  /* Do not peel cold areas.  */
-  if (optimize_loop_for_size_p (loop))
-    {
-      if (dump_file)
-	fprintf (dump_file, ";; Not considering loop, cold area\n");
-      return;
-    }
-
-  /* Can the loop be manipulated?  */
-  if (!can_duplicate_loop_p (loop))
-    {
-      if (dump_file)
-	fprintf (dump_file,
-		 ";; Not considering loop, cannot duplicate\n");
-      return;
-    }
-
-  /* npeel = number of iterations to peel.  */
-  npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
-  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
-    npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
-
-  /* Is the loop small enough?  */
-  if (!npeel)
-    {
-      if (dump_file)
-	fprintf (dump_file, ";; Not considering loop, is too big\n");
-      return;
-    }
-
-  /* Check for simple loops.  */
-  desc = get_simple_loop_desc (loop);
-
-  /* Check number of iterations.  */
-  if (!desc->simple_p
-      || desc->assumptions
-      || !desc->const_iter
-      || desc->infinite)
-    {
-      if (dump_file)
-	fprintf (dump_file,
-		 ";; Unable to prove that the loop iterates constant times\n");
-      return;
-    }
-
-  if (desc->niter > npeel - 1)
-    {
-      if (dump_file)
-	{
-	  fprintf (dump_file,
-		   ";; Not peeling loop completely, rolls too much (");
-	  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
-	  fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
-	}
-      return;
-    }
-
-  /* Success.  */
-  if (dump_file)
-    fprintf (dump_file, ";; Decided to peel loop completely\n");
-  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
-}
-
-/* Peel all iterations of LOOP, remove exit edges and cancel the loop
-   completely.  The transformation done:
-
-   for (i = 0; i < 4; i++)
-     body;
-
-   ==>
-
-   i = 0;
-   body; i++;
-   body; i++;
-   body; i++;
-   body; i++;
-   */
-static void
-peel_loop_completely (struct loop *loop)
-{
-  sbitmap wont_exit;
-  unsigned HOST_WIDE_INT npeel;
-  unsigned i;
-  VEC (edge, heap) *remove_edges;
-  edge ein;
-  struct niter_desc *desc = get_simple_loop_desc (loop);
-  struct opt_info *opt_info = NULL;
-
-  npeel = desc->niter;
-
-  if (npeel)
-    {
-      bool ok;
-
-      wont_exit = sbitmap_alloc (npeel + 1);
-      bitmap_ones (wont_exit);
-      bitmap_clear_bit (wont_exit, 0);
-      if (desc->noloop_assumptions)
-	bitmap_clear_bit (wont_exit, 1);
-
-      remove_edges = NULL;
-
-      if (flag_split_ivs_in_unroller)
-        opt_info = analyze_insns_in_loop (loop);
-
-      opt_info_start_duplication (opt_info);
-      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
-					  npeel,
-					  wont_exit, desc->out_edge,
-					  &remove_edges,
-					  DLTHE_FLAG_UPDATE_FREQ
-					  | DLTHE_FLAG_COMPLETTE_PEEL
-					  | (opt_info
-					     ? DLTHE_RECORD_COPY_NUMBER : 0));
-      gcc_assert (ok);
-
-      free (wont_exit);
-
-      if (opt_info)
- 	{
- 	  apply_opt_in_copies (opt_info, npeel, false, true);
- 	  free_opt_info (opt_info);
- 	}
-
-      /* Remove the exit edges.  */
-      FOR_EACH_VEC_ELT (edge, remove_edges, i, ein)
-	remove_path (ein);
-      VEC_free (edge, heap, remove_edges);
-    }
-
-  ein = desc->in_edge;
-  free_simple_loop_desc (loop);
-
-  /* Now remove the unreachable part of the last iteration and cancel
-     the loop.  */
-  remove_path (ein);
-
-  if (dump_file)
-    fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
-}
-
 /* Decide whether to unroll LOOP iterating constant number of times
    and how much.  */
 
@@ -1223,164 +971,6 @@  unroll_loop_runtime_iterations (struct l
   VEC_free (basic_block, heap, dom_bbs);
 }
 
-/* Decide whether to simply peel LOOP and how much.  */
-static void
-decide_peel_simple (struct loop *loop, int flags)
-{
-  unsigned npeel;
-  double_int iterations;
-
-  if (!(flags & UAP_PEEL))
-    {
-      /* We were not asked to, just return back silently.  */
-      return;
-    }
-
-  if (dump_file)
-    fprintf (dump_file, "\n;; Considering simply peeling loop\n");
-
-  /* npeel = number of iterations to peel.  */
-  npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
-  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
-    npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
-
-  /* Skip big loops.  */
-  if (!npeel)
-    {
-      if (dump_file)
-	fprintf (dump_file, ";; Not considering loop, is too big\n");
-      return;
-    }
-
-  /* Do not simply peel loops with branches inside -- it increases number
-     of mispredicts.  
-     Exception is when we do have profile and we however have good chance
-     to peel proper number of iterations loop will iterate in practice.
-     TODO: this heuristic needs tunning; while for complette unrolling
-     the branch inside loop mostly eliminates any improvements, for
-     peeling it is not the case.  Also a function call inside loop is
-     also branch from branch prediction POV (and probably better reason
-     to not unroll/peel).  */
-  if (num_loop_branches (loop) > 1
-      && profile_status != PROFILE_READ)
-    {
-      if (dump_file)
-	fprintf (dump_file, ";; Not peeling, contains branches\n");
-      return;
-    }
-
-  /* If we have realistic estimate on number of iterations, use it.  */
-  if (estimated_loop_iterations (loop, &iterations))
-    {
-      if (double_int::from_shwi (npeel).ule (iterations))
-	{
-	  if (dump_file)
-	    {
-	      fprintf (dump_file, ";; Not peeling loop, rolls too much (");
-	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
-		       (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
-	      fprintf (dump_file, " iterations > %d [maximum peelings])\n",
-		       npeel);
-	    }
-	  return;
-	}
-      npeel = iterations.to_shwi () + 1;
-    }
-  /* If we have small enough bound on iterations, we can still peel (completely
-     unroll).  */
-  else if (max_loop_iterations (loop, &iterations)
-           && iterations.ult (double_int::from_shwi (npeel)))
-    npeel = iterations.to_shwi () + 1;
-  else
-    {
-      /* For now we have no good heuristics to decide whether loop peeling
-         will be effective, so disable it.  */
-      if (dump_file)
-	fprintf (dump_file,
-		 ";; Not peeling loop, no evidence it will be profitable\n");
-      return;
-    }
-
-  /* Success.  */
-  loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
-  loop->lpt_decision.times = npeel;
-
-  if (dump_file)
-    fprintf (dump_file, ";; Decided to simply peel the loop %d times.\n",
-	     loop->lpt_decision.times);
-}
-
-/* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
-
-   while (cond)
-     body;
-
-   ==>  (LOOP->LPT_DECISION.TIMES == 3)
-
-   if (!cond) goto end;
-   body;
-   if (!cond) goto end;
-   body;
-   if (!cond) goto end;
-   body;
-   while (cond)
-     body;
-   end: ;
-   */
-static void
-peel_loop_simple (struct loop *loop)
-{
-  sbitmap wont_exit;
-  unsigned npeel = loop->lpt_decision.times;
-  struct niter_desc *desc = get_simple_loop_desc (loop);
-  struct opt_info *opt_info = NULL;
-  bool ok;
-
-  if (flag_split_ivs_in_unroller && npeel > 1)
-    opt_info = analyze_insns_in_loop (loop);
-
-  wont_exit = sbitmap_alloc (npeel + 1);
-  bitmap_clear (wont_exit);
-
-  opt_info_start_duplication (opt_info);
-
-  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
-				      npeel, wont_exit, NULL,
-				      NULL, DLTHE_FLAG_UPDATE_FREQ
-				      | (opt_info
-					 ? DLTHE_RECORD_COPY_NUMBER
-					   : 0));
-  gcc_assert (ok);
-
-  free (wont_exit);
-
-  if (opt_info)
-    {
-      apply_opt_in_copies (opt_info, npeel, false, false);
-      free_opt_info (opt_info);
-    }
-
-  if (desc->simple_p)
-    {
-      if (desc->const_iter)
-	{
-	  desc->niter -= npeel;
-	  desc->niter_expr = GEN_INT (desc->niter);
-	  desc->noloop_assumptions = NULL_RTX;
-	}
-      else
-	{
-	  /* We cannot just update niter_expr, as its value might be clobbered
-	     inside loop.  We could handle this by counting the number into
-	     temporary just like we do in runtime unrolling, but it does not
-	     seem worthwhile.  */
-	  free_simple_loop_desc (loop);
-	}
-    }
-  if (dump_file)
-    fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
-}
-
 /* Decide whether to unroll LOOP stupidly and how much.  */
 static void
 decide_unroll_stupid (struct loop *loop, int flags)
Index: cfgloop.h
===================================================================
--- cfgloop.h	(revision 193160)
+++ cfgloop.h	(working copy)
@@ -32,8 +32,6 @@  along with GCC; see the file COPYING3.  
 enum lpt_dec
 {
   LPT_NONE,
-  LPT_PEEL_COMPLETELY,
-  LPT_PEEL_SIMPLE,
   LPT_UNROLL_CONSTANT,
   LPT_UNROLL_RUNTIME,
   LPT_UNROLL_STUPID
@@ -704,12 +702,11 @@  extern void unswitch_loops (void);
 
 enum
 {
-  UAP_PEEL = 1,		/* Enables loop peeling.  */
-  UAP_UNROLL = 2,	/* Enables unrolling of loops if it seems profitable.  */
-  UAP_UNROLL_ALL = 4	/* Enables unrolling of all loops.  */
+  UAP_UNROLL = 1,	/* Enables unrolling of loops if it seems profitable.  */
+  UAP_UNROLL_ALL = 2	/* Enables unrolling of all loops.  */
 };
 
-extern void unroll_and_peel_loops (int);
+extern void unroll_loops (int);
 extern void doloop_optimize_loops (void);
 extern void move_loop_invariants (void);
 extern bool finite_loop_p (struct loop *);
@@ -719,7 +716,6 @@  extern void scale_loop_profile (struct l
 static inline struct loop *
 loop_outermost (struct loop *loop)
 {
-  
   unsigned n = VEC_length (loop_p, loop->superloops);
 
   if (n <= 1)
Index: passes.c
===================================================================
--- passes.c	(revision 193160)
+++ passes.c	(working copy)
@@ -1597,7 +1597,7 @@  init_optimization_passes (void)
 	  NEXT_PASS (pass_rtl_loop_init);
 	  NEXT_PASS (pass_rtl_move_loop_invariants);
 	  NEXT_PASS (pass_rtl_unswitch);
-	  NEXT_PASS (pass_rtl_unroll_and_peel_loops);
+	  NEXT_PASS (pass_rtl_unroll_loops);
 	  NEXT_PASS (pass_rtl_doloop);
 	  NEXT_PASS (pass_rtl_loop_done);
 	  *p = NULL;