Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi	(revision 187013)
+++ doc/invoke.texi	(working copy)
@@ -8842,6 +8842,12 @@ The maximum number of insns of an unswitched loop.
 @item max-unswitch-level
 The maximum number of branches unswitched in a single loop.
 
+@item min-iter-unroll-with-branches
+Minimum iteration count to ignore branch effects when unrolling.
+
+@item unroll-outer-loop-branch-budget
+Maximum number of branches allowed in hot outer loop region after unroll.
+
 @item lim-expensive
 The minimum cost of an expensive expression in the loop invariant motion.
 
Index: loop-unroll.c
===================================================================
--- loop-unroll.c	(revision 187013)
+++ loop-unroll.c	(working copy)
@@ -152,6 +152,99 @@ static void combine_var_copies_in_loop_exit (struc
 					     basic_block);
 static rtx get_expansion (struct var_to_expand *);
 
+/* Compute the maximum number of times LOOP can be unrolled without exceeding
+   a branch budget, which can increase branch mispredictions. The number of
+   branches is computed by weighting each branch with its expected execution
+   probability through the loop based on profile data. If no profile feedback
+   data exists, simply return the current NUNROLL factor.  */
+
+static unsigned
+max_unroll_with_branches(struct loop *loop, unsigned nunroll)
+{
+  struct loop *outer;
+  struct niter_desc *outer_desc = 0;
+  int outer_niters = 1;
+  int frequent_iteration_threshold;
+  unsigned branch_budget;
+  struct niter_desc *desc = get_simple_loop_desc (loop);
+
+  /* Ignore loops with FP computation as these tend to benefit much more
+     consistently from unrolling.  */
+  if (desc->has_fp)
+    return nunroll;
+
+  frequent_iteration_threshold = PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES);
+  if (expected_loop_iterations (loop) >= (unsigned) frequent_iteration_threshold)
+    return nunroll;
+
+  /* If there was no profile feedback data, av_num_branches will be 0
+     and we won't limit unrolling. If the av_num_branches is at most 1,
+     also don't limit unrolling as the back-edge branch will not be duplicated.  */
+  if (desc->av_num_branches <= 1)
+    return nunroll;
+
+  /* Walk up the loop tree until we find a hot outer loop in which the current
+     loop is nested. At that point we will compute the number of times the
+     current loop can be unrolled based on the number of branches in the hot
+     outer loop.  */
+  outer = loop_outer (loop);
+  /* The loop structure contains a fake outermost loop, so this should always
+     be non-NULL for our current loop.  */
+  gcc_assert (outer);
+
+  /* Walk up the loop tree until we either find a hot outer loop or hit the
+     fake outermost loop at the root.  */
+  while (true)
+    {
+      outer_desc = get_simple_loop_desc (outer);
+
+      /* Stop if we hit the fake outermost loop at the root of the tree,
+         which includes the whole procedure.  */
+      if (!loop_outer (outer))
+        break;
+
+      if (outer_desc->const_iter)
+        outer_niters *= outer_desc->niter;
+      else if (outer->header->count)
+        outer_niters *= expected_loop_iterations (outer);
+
+      /* If the outer loop has enough iterations to be considered hot, then
+         we can stop our upwards loop tree traversal and examine the current
+         outer loop.  */
+      if (outer_niters >= frequent_iteration_threshold)
+        break;
+
+      outer = loop_outer (outer);
+    }
+
+  gcc_assert(outer);
+
+  /* Assume that any call will cause the branch budget to be exceeded,
+     and that we can't unroll the current loop without increasing
+     mispredicts.  */
+  if (outer_desc->has_call)
+    return 0;
+
+  /* Otherwise, compute the maximum number of times current loop can be
+     unrolled without exceeding our branch budget. First we subtract
+     off the outer loop's average branch count from the budget. Note
+     that this includes the branches in the current loop. This yields
+     the number of branches left in the budget for the unrolled copies.
+     We divide this by the number of branches in the current loop that
+     must be duplicated when we unroll, which is the total average
+     number of branches minus the back-edge branch. This yields the
+     number of new loop body copies that can be created by unrolling
+     without exceeding the budget, to which we add 1 to get the unroll
+     factor. Note that the "outermost loop" may be the whole procedure
+     if we did not find a hot enough enclosing loop.  */
+  branch_budget = PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET);
+  if (outer_desc->av_num_branches > branch_budget)
+    return 0;
+  /* We already returned early if desc->av_num_branches <= 1.  */
+  return (branch_budget - outer_desc->av_num_branches)
+      / (desc->av_num_branches - 1) + 1;
+}
+
 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 void
 unroll_and_peel_loops (int flags)
@@ -522,6 +615,7 @@ static void
 decide_unroll_constant_iterations (struct loop *loop, int flags)
 {
   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
+  unsigned nunroll_branches;
   struct niter_desc *desc;
 
   if (!(flags & UAP_UNROLL))
@@ -565,6 +659,21 @@ decide_unroll_constant_iterations (struct loop *lo
       return;
     }
 
+  /* Be careful when unrolling loops with branches inside -- it can increase
+     the number of mispredicts.  */
+  if (desc->num_branches > 1)
+    {
+      nunroll_branches = max_unroll_with_branches (loop, nunroll);
+      if (nunroll > nunroll_branches)
+        nunroll = nunroll_branches;
+      if (nunroll <= 1)
+        {
+          if (dump_file)
+	    fprintf (dump_file, ";; Not unrolling, contains branches\n");
+          return;
+        }
+    }
+
   /* Check whether the loop rolls enough to consider.  */
   if (desc->niter < 2 * nunroll)
     {
@@ -802,7 +911,7 @@ unroll_loop_constant_iterations (struct loop *loop
 static void
 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 {
-  unsigned nunroll, nunroll_by_av, i;
+  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
   struct niter_desc *desc;
 
   if (!(flags & UAP_UNROLL))
@@ -856,6 +965,21 @@ decide_unroll_runtime_iterations (struct loop *loo
       return;
     }
 
+  /* Be careful when unrolling loops with branches inside -- it can increase
+     the number of mispredicts.  */
+  if (desc->num_branches > 1)
+    {
+      nunroll_branches = max_unroll_with_branches (loop, nunroll);
+      if (nunroll > nunroll_branches)
+        nunroll = nunroll_branches;
+      if (nunroll <= 1)
+        {
+          if (dump_file)
+            fprintf (dump_file, ";; Not unrolling, contains branches\n");
+          return;
+        }
+    }
+
   /* If we have profile feedback, check whether the loop rolls.  */
   if ((loop->header->count
        && expected_loop_iterations (loop) < 2 * nunroll)
@@ -1233,7 +1357,7 @@ decide_peel_simple (struct loop *loop, int flags)
 
   /* Do not simply peel loops with branches inside -- it increases number
      of mispredicts.  */
-  if (num_loop_branches (loop) > 1)
+  if (desc->num_branches > 1)
     {
       if (dump_file)
 	fprintf (dump_file, ";; Not peeling, contains branches\n");
@@ -1394,7 +1518,7 @@ decide_unroll_stupid (struct loop *loop, int flags
 
   /* Do not unroll loops with branches inside -- it increases number
      of mispredicts.  */
-  if (num_loop_branches (loop) > 1)
+  if (desc->num_branches > 1)
     {
       if (dump_file)
 	fprintf (dump_file, ";; Not unrolling, contains branches\n");
Index: loop-iv.c
===================================================================
--- loop-iv.c	(revision 187013)
+++ loop-iv.c	(working copy)
@@ -2948,8 +2948,12 @@ get_simple_loop_desc (struct loop *loop)
   /* At least desc->infinite is not always initialized by
      find_simple_loop_exit.  */
   desc = XCNEW (struct niter_desc);
-  iv_analysis_loop_init (loop);
-  find_simple_exit (loop, desc);
+  if (loop->latch != EXIT_BLOCK_PTR)
+    {
+      iv_analysis_loop_init (loop);
+      find_simple_exit (loop, desc);
+    }
+  analyze_loop_insns (loop, desc);
   loop->aux = desc;
 
   if (desc->simple_p && (desc->assumptions || desc->infinite))
Index: cfgloop.c
===================================================================
--- cfgloop.c	(revision 187013)
+++ cfgloop.c	(working copy)
@@ -1156,24 +1156,98 @@ get_loop_exit_edges (const struct loop *loop)
   return edges;
 }
 
-/* Counts the number of conditional branches inside LOOP.  */
+/* Determine if INSN is a floating point set.  */
 
-unsigned
-num_loop_branches (const struct loop *loop)
+static bool
+insn_has_fp_set(rtx insn)
 {
-  unsigned i, n;
-  basic_block * body;
+  int i;
+  rtx pat = PATTERN(insn);
+  if (GET_CODE (pat) == SET)
+    return (FLOAT_MODE_P (GET_MODE (SET_DEST (pat))));
+  else if (GET_CODE (pat) == PARALLEL)
+    {
+      for (i = 0; i < XVECLEN (pat, 0); i++)
+        {
+          rtx sub = XVECEXP (pat, 0, i);
+          if (GET_CODE (sub) == SET)
+            return (FLOAT_MODE_P (GET_MODE (SET_DEST (sub))));
+        }
+    }
+  return false;
+}
 
-  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
+/* Analyzes the instructions inside LOOP, updating the DESC. Currently counts
+   the number of conditional branch instructions, calls and fp instructions,
+   as well as the average number of branches executed per iteration.  */
 
+void
+analyze_loop_insns (const struct loop *loop, struct niter_desc *desc)
+{
+  unsigned i, nbranch;
+  gcov_type weighted_nbranch;
+  bool has_call, has_fp;
+  basic_block * body, bb;
+  rtx insn;
+  gcov_type header_count = loop->header->count;
+
+  nbranch = weighted_nbranch = 0;
+  has_call = has_fp = false;
+
   body = get_loop_body (loop);
-  n = 0;
   for (i = 0; i < loop->num_nodes; i++)
-    if (EDGE_COUNT (body[i]->succs) >= 2)
-      n++;
+    {
+      bb = body[i];
+
+      if (EDGE_COUNT (bb->succs) >= 2)
+        {
+          nbranch++;
+
+          /* If this block is executed less frequently than the header (loop
+             entry), then it is weighted based on its execution count, which
+             will be turned into a ratio compared to the loop header below. */
+          if (bb->count < header_count)
+            weighted_nbranch += bb->count;
+
+          /* When it is executed more frequently than the header (i.e. it is
+             in a nested inner loop), simply weight the branch the same as the
+             header execution count, so that it will contribute 1 branch to
+             the ratio computed below. */
+          else
+            weighted_nbranch += header_count;
+        }
+
+      /* No need to iterate through the instructions below if
+         both flags have already been set.  */
+      if (has_call && has_fp)
+        continue;
+
+      FOR_BB_INSNS (bb, insn)
+        {
+          if (!INSN_P (insn))
+            continue;
+
+          if (!has_call)
+            has_call = CALL_P (insn);
+
+          if (!has_fp)
+            has_fp = insn_has_fp_set (insn);
+        }
+    }
   free (body);
 
-  return n;
+  desc->num_branches = nbranch;
+  /* Now divide the weights computed above by the loop header execution count,
+     to compute the average number of branches through the loop. By adding
+     header_count/2 to the numerator we round to nearest with integer
+     division.  */
+  if (header_count  != 0)
+    desc->av_num_branches
+        = (weighted_nbranch + header_count/2) / header_count;
+  else
+    desc->av_num_branches = 0;
+  desc->has_call = has_call;
+  desc->has_fp = has_fp;
 }
 
 /* Adds basic block BB to LOOP.  */
Index: cfgloop.h
===================================================================
--- cfgloop.h	(revision 187013)
+++ cfgloop.h	(working copy)
@@ -249,7 +249,6 @@ extern basic_block *get_loop_body_in_custom_order
 
 extern VEC (edge, heap) *get_loop_exit_edges (const struct loop *);
 edge single_exit (const struct loop *);
-extern unsigned num_loop_branches (const struct loop *);
 
 extern edge loop_preheader_edge (const struct loop *);
 extern edge loop_latch_edge (const struct loop *);
@@ -359,7 +358,8 @@ struct rtx_iv
 };
 
 /* The description of an exit from the loop and of the number of iterations
-   till we take the exit.  */
+   till we take the exit. Also includes other information used primarily
+   by the loop unroller.  */
 
 struct niter_desc
 {
@@ -400,6 +400,18 @@ struct niter_desc
 
   /* The number of iterations of the loop.  */
   rtx niter_expr;
+
+  /* The number of branches in the loop.  */
+  unsigned num_branches;
+
+  /* The number of executed branches per iteration.  */
+  unsigned av_num_branches;
+
+  /* Whether the loop contains a call instruction.  */
+  bool has_call;
+
+  /* Whether the loop contains fp instructions.  */
+  bool has_fp;
 };
 
 extern void iv_analysis_loop_init (struct loop *);
@@ -413,6 +425,7 @@ extern void iv_analysis_done (void);
 
 extern struct niter_desc *get_simple_loop_desc (struct loop *loop);
 extern void free_simple_loop_desc (struct loop *loop);
+void analyze_loop_insns (const struct loop *, struct niter_desc *desc);
 
 static inline struct niter_desc *
 simple_loop_desc (struct loop *loop)
Index: params.def
===================================================================
--- params.def	(revision 187013)
+++ params.def	(working copy)
@@ -312,6 +312,15 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
 	 "The maximum depth of a loop nest we completely peel",
 	 8, 0, 0)
 
+DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
+	"min-iter-unroll-with-branches",
+	"Minimum iteration count to ignore branch effects when unrolling",
+	50, 0, 0)
+DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
+	"unroll-outer-loop-branch-budget",
+	"Maximum number of branches allowed in hot outer loop region after unroll",
+	25, 0, 0)
+
 /* The maximum number of insns of an unswitched loop.  */
 DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
 	"max-unswitch-insns",
