Index: loop-unroll.c
===================================================================
--- loop-unroll.c       (revision 181902)
+++ loop-unroll.c       (working copy)
@@ -152,6 +152,38 @@ static void combine_var_copies_in_loop_e
                                             basic_block);
 static rtx get_expansion (struct var_to_expand *);

+/* Return true if LOOP contains floating-point computation, i.e. some insn
+   in its body is a single set whose destination mode is FLOAT_MODE_P.  */
+bool
+loop_has_FP_comp (struct loop *loop)
+{
+  basic_block *blocks = get_loop_body (loop);
+  bool fp_seen = false;
+  unsigned idx;
+
+  /* Stop scanning at the first qualifying insn.  */
+  for (idx = 0; idx < loop->num_nodes && !fp_seen; idx++)
+    {
+      basic_block cur_bb = blocks[idx];
+      rtx cur_insn;
+
+      FOR_BB_INSNS (cur_bb, cur_insn)
+        {
+          rtx pat = single_set (cur_insn);
+
+          /* Insns that are not a single set are ignored.  */
+          if (pat && FLOAT_MODE_P (GET_MODE (SET_DEST (pat))))
+            {
+              fp_seen = true;
+              break;
+            }
+        }
+    }
+  /* Release the block array on both the found and not-found paths.  */
+  free (blocks);
+  return fp_seen;
+}
+
 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 void
 unroll_and_peel_loops (int flags)
@@ -547,6 +579,9 @@ decide_unroll_constant_iterations (struc
   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

+  if (targetm.loop_unroll_adjust)
+    nunroll = targetm.loop_unroll_adjust (nunroll, loop);
+
   /* Skip big loops.  */
   if (nunroll <= 1)
     {
Index: cfgloop.h
===================================================================
--- cfgloop.h   (revision 181902)
+++ cfgloop.h   (working copy)
@@ -693,5 +693,6 @@ extern void unroll_and_peel_loops (int);
 extern void doloop_optimize_loops (void);
 extern void move_loop_invariants (void);
 extern bool finite_loop_p (struct loop *);
+extern bool loop_has_FP_comp (struct loop *);  /* In loop-unroll.c.  */

 #endif /* GCC_CFGLOOP_H */
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c  (revision 181902)
+++ config/i386/i386.c  (working copy)
@@ -60,6 +60,7 @@ along with GCC; see the file COPYING3.
 #include "fibheap.h"
 #include "opts.h"
 #include "diagnostic.h"
+#include "cfgloop.h"

 enum upper_128bits_state
 {
@@ -38370,6 +38371,75 @@ ix86_autovectorize_vector_sizes (void)
   return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
 }

+/* If LOOP contains a possible LCP stalling instruction on corei7, return
+   an unroll count no larger than NUNROLL chosen so the unrolled loop will
+   still likely fit into the loop stream detector; else return NUNROLL.  */
+static unsigned
+ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
+{
+  basic_block *body, bb;
+  unsigned i;
+  rtx insn;
+  bool found = false;
+  unsigned newunroll;
+
+  if (ix86_tune != PROCESSOR_COREI7_64
+      && ix86_tune != PROCESSOR_COREI7_32)
+    return nunroll;
+
+  /* Look for instructions that store a constant into HImode (16-bit)
+     memory.  These require a length-changing prefix and on corei7 are
+     prone to LCP stalls.  These stalls can be avoided if the loop
+     is streamed from the loop stream detector.  */
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes && !found; i++)
+    {
+      bb = body[i];
+
+      FOR_BB_INSNS (bb, insn)
+        {
+          rtx set_expr = single_set (insn);
+
+          if (set_expr != NULL_RTX
+              && GET_MODE (SET_DEST (set_expr)) == HImode
+              && CONST_INT_P (SET_SRC (set_expr))
+              && MEM_P (SET_DEST (set_expr)))
+            {
+              found = true;
+              break;
+            }
+        }
+    }
+  free (body);
+
+  if (!found)
+    return nunroll;
+
+  /* Don't reduce unroll factor in loops with floating point
+     computation, which tend to benefit more heavily from
+     larger unroll factors and are less likely to bottleneck
+     at the decoder.  */
+  if (loop_has_FP_comp (loop))
+    return nunroll;
+
+  if (dump_file)
+    {
+      fprintf (dump_file,
+               ";; Loop contains HImode store of const (possible LCP "
+               "stalls),\n");
+      fprintf (dump_file,
+               "   reduce unroll factor to fit into Loop Stream Detector\n");
+    }
+
+  /* On corei7 the loop stream detector can hold about 28 instructions, so
+     don't allow unrolling to exceed that.  A zero av_ninsns leaves
+     NUNROLL unchanged instead of dividing by zero.  */
+  newunroll = loop->av_ninsns ? 28 / loop->av_ninsns : nunroll;
+  if (newunroll < nunroll)
+    return newunroll;
+
+  return nunroll;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_RETURN_IN_MEMORY
