diff mbox

[x86] Implement static static checking

Message ID 201007022330.09610.ebotcazou@adacore.com
State New
Headers show

Commit Message

Eric Botcazou July 2, 2010, 9:30 p.m. UTC
This implements static stack checking in the x86 back-end, i.e. checking of 
the static part of the stack frame in the prologue.  This is modelled on the 
Alpha back-end, but of course only activated by means of -fstack-check.

Tested on Linux, Solaris and Windows 32-bit as well as Linux 64-bit.  OK for 
the mainline?


2010-07-02  Eric Botcazou  <ebotcazou@adacore.com>

	* config/i386/cygming.h (STACK_CHECK_STATIC_BUILTIN): Define to 1.
	* config/i386/linux.h (STACK_CHECK_STATIC_BUILTIN): Likewise.
	* config/i386/linux64.h (STACK_CHECK_STATIC_BUILTIN): Likewise.
	* config/i386/sol2.h (STACK_CHECK_STATIC_BUILTIN): Likewise.
	* config/i386/i386.c (ix86_target_stack_probe): New function.
	(ix86_compute_frame_layout): Force use of push instructions to
	save registers if stack checking with probes is enabled.
	(get_scratch_register_on_entry): New function.
	(release_scratch_register_on_entry): Likewise.
	(ix86_adjust_stack_and_probe): Likewise.
	(output_adjust_stack_and_probe): Likewise.
	(ix86_emit_probe_stack_range): Likewise.
	(output_probe_stack_range): Likewise.
	(ix86_expand_prologue): Emit stack checking code if static built-in
	stack checking is enabled.
	Test ix86_target_stack_probe instead of TARGET_STACK_PROBE.
	* config/i386/i386-protos.h (ix86_target_stack_probe): Declare.
	(output_adjust_stack_and_probe): Likewise.
	(output_probe_stack_range): Likewise.
	* config/i386/i386.md (UNSPECV_PROBE_STACK_RANGE): New constant.
	(allocate_stack_worker_32): Test ix86_target_stack_probe instead of
	TARGET_STACK_PROBE.
	(allocate_stack_worker_64): Likewise.
	(allocate_stack): Likewise.
	(adjust_stack_and_probe): New insn.
	(probe_stack_range): Likewise.

Comments

Gerald Pfeifer July 5, 2010, 5:33 p.m. UTC | #1
On Fri, 2 Jul 2010, Eric Botcazou wrote:
> Tested on Linux, Solaris and Windows 32-bit as well as Linux 64-bit.  
> OK for the mainline?

What will it need to make this work on FreeBSD?  Will just #define-ing 
STACK_CHECK_STATIC_BUILTIN in config/i386/freebsd.h have a chance to work?

(I assume you'll add a note to gcc-4.6/changes.html once you patches have
been approved and applied? ;-)

Gerald
Eric Botcazou July 6, 2010, 3:24 p.m. UTC | #2
> What will it need to make this work on FreeBSD?  Will just #define-ing
> STACK_CHECK_STATIC_BUILTIN in config/i386/freebsd.h have a chance to work?

Yes, static stack checking works fine on FreeBSD as well, I was too lazy to 
test this version of the patch on yet another machine, but it should work.

> (I assume you'll add a note to gcc-4.6/changes.html once you patches have
> been approved and applied? ;-)

Will think about it. ;-)
diff mbox

Patch

Index: config/i386/cygming.h
===================================================================
--- config/i386/cygming.h	(revision 161685)
+++ config/i386/cygming.h	(working copy)
@@ -429,6 +429,9 @@  do {						\
 #define TARGET_CXX_ADJUST_CLASS_AT_DEFINITION i386_pe_adjust_class_at_definition
 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME i386_pe_mangle_decl_assembler_name
 
+/* Static stack checking is supported by means of probes.  */
+#define STACK_CHECK_STATIC_BUILTIN 1
+
 #undef TREE
 
 #ifndef BUFSIZ
Index: config/i386/linux.h
===================================================================
--- config/i386/linux.h	(revision 161685)
+++ config/i386/linux.h	(working copy)
@@ -209,6 +209,9 @@  along with GCC; see the file COPYING3.
 /* The stack pointer needs to be moved while checking the stack.  */
 #define STACK_CHECK_MOVING_SP 1
 
+/* Static stack checking is supported by means of probes.  */
+#define STACK_CHECK_STATIC_BUILTIN 1
+
 /* This macro may be overridden in i386/k*bsd-gnu.h.  */
 #define REG_NAME(reg) reg
 
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 161685)
+++ config/i386/i386.md	(working copy)
@@ -231,6 +231,7 @@  (define_c_enum "unspec" [
 (define_c_enum "unspecv" [
   UNSPECV_BLOCKAGE
   UNSPECV_STACK_PROBE
+  UNSPECV_PROBE_STACK_RANGE
   UNSPECV_EMMS
   UNSPECV_LDMXCSR
   UNSPECV_STMXCSR
@@ -16432,7 +16433,7 @@  (define_insn "allocate_stack_worker_32"
 			    UNSPECV_STACK_PROBE))
    (set (reg:SI SP_REG) (minus:SI (reg:SI SP_REG) (match_dup 1)))
    (clobber (reg:CC FLAGS_REG))]
-  "!TARGET_64BIT && TARGET_STACK_PROBE"
+  "!TARGET_64BIT && ix86_target_stack_probe ()"
   "call\t___chkstk"
   [(set_attr "type" "multi")
    (set_attr "length" "5")])
@@ -16445,7 +16446,7 @@  (define_insn "allocate_stack_worker_64"
    (clobber (reg:DI R10_REG))
    (clobber (reg:DI R11_REG))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_64BIT && TARGET_STACK_PROBE"
+  "TARGET_64BIT && ix86_target_stack_probe ()"
   "call\t___chkstk"
   [(set_attr "type" "multi")
    (set_attr "length" "5")])
@@ -16453,7 +16454,7 @@  (define_insn "allocate_stack_worker_64"
 (define_expand "allocate_stack"
   [(match_operand 0 "register_operand" "")
    (match_operand 1 "general_operand" "")]
-  "TARGET_STACK_PROBE"
+  "ix86_target_stack_probe ()"
 {
   rtx x;
 
@@ -16495,6 +16496,28 @@  (define_expand "probe_stack"
   DONE;
 })
 
+(define_insn "adjust_stack_and_probe<P:mode>"
+  [(set (match_operand:P 0 "register_operand" "=r")
+	(unspec_volatile:P [(match_operand:P 1 "register_operand" "0")]
+			    UNSPECV_PROBE_STACK_RANGE))
+   (set (reg:P SP_REG)
+        (minus:P (reg:P SP_REG) (match_operand:P 2 "const_int_operand" "n")))
+   (clobber (reg:CC FLAGS_REG))
+   (clobber (mem:BLK (scratch)))]
+  ""
+  "* return output_adjust_stack_and_probe (operands[0]);"
+  [(set_attr "type" "multi")])
+
+(define_insn "probe_stack_range<P:mode>"
+  [(set (match_operand:P 0 "register_operand" "=r")
+	(unspec_volatile:P [(match_operand:P 1 "register_operand" "0")
+			    (match_operand:P 2 "const_int_operand" "n")]
+			    UNSPECV_PROBE_STACK_RANGE))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "* return output_probe_stack_range (operands[0], operands[2]);"
+  [(set_attr "type" "multi")])
+
 (define_expand "builtin_setjmp_receiver"
   [(label_ref (match_operand 0 "" ""))]
   "!TARGET_64BIT && flag_pic"
Index: config/i386/sol2.h
===================================================================
--- config/i386/sol2.h	(revision 161685)
+++ config/i386/sol2.h	(working copy)
@@ -154,6 +154,9 @@  along with GCC; see the file COPYING3.
 #undef X86_FILE_START_VERSION_DIRECTIVE
 #define X86_FILE_START_VERSION_DIRECTIVE false
 
+/* Static stack checking is supported by means of probes.  */
+#define STACK_CHECK_STATIC_BUILTIN 1
+
 /* Only recent versions of Solaris 11 ld properly support hidden .gnu.linkonce
    sections, so don't use them.  */
 #ifndef TARGET_GNU_LD
Index: config/i386/linux64.h
===================================================================
--- config/i386/linux64.h	(revision 161685)
+++ config/i386/linux64.h	(working copy)
@@ -112,6 +112,9 @@  see the files COPYING3 and COPYING.RUNTI
 /* The stack pointer needs to be moved while checking the stack.  */
 #define STACK_CHECK_MOVING_SP 1
 
+/* Static stack checking is supported by means of probes.  */
+#define STACK_CHECK_STATIC_BUILTIN 1
+
 /* This macro may be overridden in i386/k*bsd-gnu.h.  */
 #define REG_NAME(reg) reg
 
Index: config/i386/i386-protos.h
===================================================================
--- config/i386/i386-protos.h	(revision 161685)
+++ config/i386/i386-protos.h	(working copy)
@@ -24,6 +24,7 @@  extern void override_options (bool);
 extern void optimization_options (int, int);
 extern void ix86_conditional_register_usage (void);
 
+extern bool ix86_target_stack_probe (void);
 extern int ix86_can_use_return_insn_p (void);
 extern void ix86_setup_frame_addresses (void);
 
@@ -71,6 +72,8 @@  extern const char *output_387_binary_op
 extern const char *output_387_reg_move (rtx, rtx*);
 extern const char *output_fix_trunc (rtx, rtx*, int);
 extern const char *output_fp_compare (rtx, rtx*, int, int);
+extern const char *output_adjust_stack_and_probe (rtx);
+extern const char *output_probe_stack_range (rtx, rtx);
 
 extern void ix86_expand_clear (rtx);
 extern void ix86_expand_move (enum machine_mode, rtx[]);
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 161685)
+++ config/i386/i386.c	(working copy)
@@ -4444,6 +4444,20 @@  optimization_options (int level, int siz
   SUBTARGET_OPTIMIZATION_OPTIONS;
 #endif
 }
+
+/* Decide whether we must probe the stack before any space allocation
+   on this target.  It's essentially TARGET_STACK_PROBE except when
+   -fstack-check causes the stack to be already probed differently.  */
+
+bool
+ix86_target_stack_probe (void)
+{
+  /* Do not probe the stack twice if static stack checking is enabled.  */
+  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
+    return false;
+
+  return TARGET_STACK_PROBE;
+}
 
 /* Decide whether we can make a sibling call to a function.  DECL is the
    declaration of the function being targeted by the call and EXP is the
@@ -8229,6 +8243,11 @@  ix86_compute_frame_layout (struct ix86_f
   else
     frame->save_regs_using_mov = false;
 
+  /* If static stack checking is enabled and done with probes, the registers
+     need to be saved before allocating the frame.  */
+  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
+    frame->save_regs_using_mov = false;
+
   /* Skip return address.  */
   offset = UNITS_PER_WORD;
 
@@ -8638,6 +8657,388 @@  ix86_internal_arg_pointer (void)
   return virtual_incoming_args_rtx;
 }
 
+struct scratch_reg {
+  rtx reg;
+  bool saved;
+};
+
+/* Return a short-lived scratch register for use on function entry.
+   In 32-bit mode, it is valid only after the registers are saved
+   in the prologue.  This register must be released by means of
+   release_scratch_register_on_entry once it is dead.  */
+
+static void
+get_scratch_register_on_entry (struct scratch_reg *sr)
+{
+  int regno;
+
+  sr->saved = false;
+
+  if (TARGET_64BIT)
+    {
+      /* We always use R11 in 64-bit mode.  */
+      regno = R11_REG;
+    }
+  else
+    {
+      tree decl = current_function_decl, fntype = TREE_TYPE (decl);
+      bool fastcall_p
+	= lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
+      bool static_chain_p = DECL_STATIC_CHAIN (decl);
+      int regparm = ix86_function_regparm (fntype, decl);
+      int drap_regno
+	= crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
+
+      /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
+	  for the static chain register.  */
+      if ((regparm < 1 || (fastcall_p && !static_chain_p))
+	  && drap_regno != AX_REG)
+	regno = AX_REG;
+      else if (regparm < 2 && drap_regno != DX_REG)
+	regno = DX_REG;
+      /* ecx is the static chain register.  */
+      else if (regparm < 3 && !fastcall_p && !static_chain_p
+	       && drap_regno != CX_REG)
+	regno = CX_REG;
+      else if (ix86_save_reg (BX_REG, true))
+	regno = BX_REG;
+      /* esi is the static chain register.  */
+      else if (!(regparm == 3 && static_chain_p)
+	       && ix86_save_reg (SI_REG, true))
+	regno = SI_REG;
+      else if (ix86_save_reg (DI_REG, true))
+	regno = DI_REG;
+      else
+	{
+	  regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
+	  sr->saved = true;
+	}
+    }
+
+  sr->reg = gen_rtx_REG (Pmode, regno);
+  if (sr->saved)
+    {
+      rtx insn = emit_insn (gen_push (sr->reg));
+      RTX_FRAME_RELATED_P (insn) = 1;
+    }
+}
+
+/* Release a scratch register obtained from the preceding function.  */
+
+static void
+release_scratch_register_on_entry (struct scratch_reg *sr)
+{
+  if (sr->saved)
+    {
+      rtx insn, x;
+
+      if (TARGET_64BIT)
+	insn = emit_insn (gen_popdi1 (sr->reg));
+      else
+	insn = emit_insn (gen_popsi1 (sr->reg));
+
+      /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop.  */
+      RTX_FRAME_RELATED_P (insn) = 1;
+      x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
+      x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
+      add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
+    }
+}
+
+#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
+
+/* Emit code to adjust the stack pointer by SIZE bytes while probing it.  */
+
+static void
+ix86_adjust_stack_and_probe (HOST_WIDE_INT size)
+{
+  /* We skip the probe for the first interval + a small dope of 4 words and
+     probe that many bytes past the specified size to maintain a protection
+     area at the botton of the stack.  */
+  const int dope = 4 * UNITS_PER_WORD;
+  rtx size_rtx = GEN_INT (size);
+
+  /* See if we have a constant small number of probes to generate.  If so,
+     that's the easy case.  The run-time loop is made up of 11 insns in the
+     generic case while the compile-time loop is made up of 3+2*(n-1) insns
+     for n # of intervals.  */
+  if (size <= 5 * PROBE_INTERVAL)
+    {
+      HOST_WIDE_INT i, adjust;
+      bool first_probe = true;
+
+      /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
+	 values of N from 1 until it exceeds SIZE.  If only one probe is
+	 needed, this will not generate any code.  Then adjust and probe
+	 to PROBE_INTERVAL + SIZE.  */
+      for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
+	{
+	  if (first_probe)
+	    {
+	      adjust = 2 * PROBE_INTERVAL + dope;
+	      first_probe = false;
+	    }
+	  else
+	    adjust = PROBE_INTERVAL;
+
+	  emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+				  plus_constant (stack_pointer_rtx, -adjust)));
+	  emit_stack_probe (stack_pointer_rtx);
+	}
+
+      if (first_probe)
+	adjust = size + PROBE_INTERVAL + dope;
+      else
+        adjust = size + PROBE_INTERVAL - i;
+
+      emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+			      plus_constant (stack_pointer_rtx, -adjust)));
+      emit_stack_probe (stack_pointer_rtx);
+
+      /* Adjust back to account for the additional first interval.  */
+      emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+			      plus_constant (stack_pointer_rtx,
+					     PROBE_INTERVAL + dope)));
+    }
+
+  /* In the variable case, do the same as above, but in a loop.  Note that we
+     must be extra careful with variables wrapping around because we might be
+     at the very top (or the very bottom) of the address space and we have to
+     be able to handle this case properly; in particular, we use an equality
+     test for the loop condition.  */
+  else
+    {
+      HOST_WIDE_INT rounded_size;
+      struct scratch_reg sr;
+
+      get_scratch_register_on_entry (&sr);
+
+
+      /* Step 1: round SIZE to the previous multiple of the interval.  */
+
+      rounded_size = size & -PROBE_INTERVAL;
+
+
+      /* Step 2: compute initial and final value of the loop counter.  */
+
+      /* SP = SP_0 + PROBE_INTERVAL.  */
+      emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+			      plus_constant (stack_pointer_rtx,
+					     - (PROBE_INTERVAL + dope))));
+
+      /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE.  */
+      emit_move_insn (sr.reg, GEN_INT (-rounded_size));
+      emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
+			      gen_rtx_PLUS (Pmode, sr.reg,
+					    stack_pointer_rtx)));
+
+
+      /* Step 3: the loop
+
+	 while (SP != LAST_ADDR)
+	   {
+	     SP = SP + PROBE_INTERVAL
+	     probe at SP
+	   }
+
+	 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
+	 values of N from 1 until it is equal to ROUNDED_SIZE.  */
+
+      if (TARGET_64BIT)
+	emit_insn (gen_adjust_stack_and_probedi (sr.reg, sr.reg, size_rtx));
+      else
+	emit_insn (gen_adjust_stack_and_probesi (sr.reg, sr.reg, size_rtx));
+
+
+      /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
+	 assert at compile-time that SIZE is equal to ROUNDED_SIZE.  */
+
+      if (size != rounded_size)
+	{
+	  emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+			          plus_constant (stack_pointer_rtx,
+						 rounded_size - size)));
+	  emit_stack_probe (stack_pointer_rtx);
+	}
+
+      /* Adjust back to account for the additional first interval.  */
+      emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+			      plus_constant (stack_pointer_rtx,
+					     PROBE_INTERVAL + dope)));
+
+      release_scratch_register_on_entry (&sr);
+    }
+
+  gcc_assert (ix86_cfa_state->reg != stack_pointer_rtx);
+
+  /* Make sure nothing is scheduled before we are done.  */
+  emit_insn (gen_blockage ());
+}
+
+/* Adjust the stack pointer up to REG while probing it.  */
+
+const char *
+output_adjust_stack_and_probe (rtx reg)
+{
+  static int labelno = 0;
+  char loop_lab[32], end_lab[32];
+  rtx xops[2];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
+
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
+
+  /* Jump to END_LAB if SP == LAST_ADDR.  */
+  xops[0] = stack_pointer_rtx;
+  xops[1] = reg;
+  output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
+  fputs ("\tje\t", asm_out_file);
+  assemble_name_raw (asm_out_file, end_lab);
+  fputc ('\n', asm_out_file);
+
+  /* SP = SP + PROBE_INTERVAL.  */
+  xops[1] = GEN_INT (PROBE_INTERVAL);
+  output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
+
+  /* Probe at SP.  */
+  xops[1] = const0_rtx;
+  output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
+
+  fprintf (asm_out_file, "\tjmp\t");
+  assemble_name_raw (asm_out_file, loop_lab);
+  fputc ('\n', asm_out_file);
+
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
+
+  return "";
+}
+
+/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
+   inclusive.  These are offsets from the current stack pointer.  */
+
+static void
+ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
+{
+  /* See if we have a constant small number of probes to generate.  If so,
+     that's the easy case.  The run-time loop is made up of 7 insns in the
+     generic case while the compile-time loop is made up of n insns for n #
+     of intervals.  */
+  if (size <= 7 * PROBE_INTERVAL)
+    {
+      HOST_WIDE_INT i;
+
+      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
+	 it exceeds SIZE.  If only one probe is needed, this will not
+	 generate any code.  Then probe at FIRST + SIZE.  */
+      for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
+	emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
+
+      emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
+    }
+
+  /* Otherwise, do the same as above, but in a loop.  Note that we must be
+     extra careful with variables wrapping around because we might be at
+     the very top (or the very bottom) of the address space and we have
+     to be able to handle this case properly; in particular, we use an
+     equality test for the loop condition.  */
+  else
+    {
+      HOST_WIDE_INT rounded_size, last;
+      struct scratch_reg sr;
+
+      get_scratch_register_on_entry (&sr);
+
+
+      /* Step 1: round SIZE to the previous multiple of the interval.  */
+
+      rounded_size = size & -PROBE_INTERVAL;
+
+
+      /* Step 2: compute initial and final value of the loop counter.  */
+
+      /* TEST_OFFSET = FIRST.  */
+      emit_move_insn (sr.reg, GEN_INT (-first));
+
+      /* LAST_OFFSET = FIRST + ROUNDED_SIZE.  */
+
+
+      /* Step 3: the loop
+
+	 while (TEST_ADDR != LAST_ADDR)
+	   {
+	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
+	     probe at TEST_ADDR
+	   }
+
+         probes at FIRST + N * PROBE_INTERVAL for values of N from 1
+         until it is equal to ROUNDED_SIZE.  */
+
+      last = first + rounded_size;
+      if (TARGET_64BIT)
+	emit_insn (gen_probe_stack_rangedi (sr.reg, sr.reg, GEN_INT (-last)));
+      else
+	emit_insn (gen_probe_stack_rangesi (sr.reg, sr.reg, GEN_INT (-last)));
+
+
+      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
+	 that SIZE is equal to ROUNDED_SIZE.  */
+
+      if (size != rounded_size)
+	emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
+						       stack_pointer_rtx,
+						       sr.reg),
+					 rounded_size - size));
+
+      release_scratch_register_on_entry (&sr);
+    }
+
+  /* Make sure nothing is scheduled before we are done.  */
+  emit_insn (gen_blockage ());
+}
+
+/* Probe a range of stack addresses from REG to END, inclusive.  These are
+   offsets from the current stack pointer.  */
+
+const char *
+output_probe_stack_range (rtx reg, rtx end)
+{
+  static int labelno = 0;
+  char loop_lab[32], end_lab[32];
+  rtx xops[3];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
+
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
+
+  /* Jump to END_LAB if TEST_ADDR == LAST_ADDR.  */
+  xops[0] = reg;
+  xops[1] = end;
+  output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
+  fputs ("\tje\t", asm_out_file);
+  assemble_name_raw (asm_out_file, end_lab);
+  fputc ('\n', asm_out_file);
+
+  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
+  xops[1] = GEN_INT (PROBE_INTERVAL);
+  output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
+
+  /* Probe at TEST_ADDR.  */
+  xops[0] = stack_pointer_rtx;
+  xops[1] = reg;
+  xops[2] = const0_rtx;
+  output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
+
+  fprintf (asm_out_file, "\tjmp\t");
+  assemble_name_raw (asm_out_file, loop_lab);
+  fputc ('\n', asm_out_file);
+
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
+
+  return "";
+}
+
 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
    to be generated in correct form.  */
 static void
@@ -8830,6 +9231,32 @@  ix86_expand_prologue (void)
   else
     allocate += frame.nregs * UNITS_PER_WORD;
 
+  /* The stack has already been decremented by the instruction calling us
+     so we need to probe unconditionally to preserve the protection area.  */
+  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
+    {
+      /* We expect the registers to be saved when probes are used.  */
+      gcc_assert (!frame.save_regs_using_mov);
+
+      if (STACK_CHECK_MOVING_SP)
+	{
+	  ix86_adjust_stack_and_probe (allocate);
+	  allocate = 0;
+	}
+      else
+	{
+	  HOST_WIDE_INT size = allocate;
+
+	  if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
+	    size = 0x80000000 - STACK_CHECK_PROTECT - 1;
+
+	  if (TARGET_STACK_PROBE)
+	    ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
+	  else
+	    ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
+	}
+    }
+
   /* When using red zone we may start register saving before allocating
      the stack frame saving one cycle of the prologue. However I will
      avoid doing this if I am going to have to probe the stack since
@@ -8845,7 +9272,7 @@  ix86_expand_prologue (void)
 
   if (allocate == 0)
     ;
-  else if (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT)
+  else if (!ix86_target_stack_probe () || allocate < CHECK_STACK_LIMIT)
     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
 			       GEN_INT (-allocate), -1,
 			       ix86_cfa_state->reg == stack_pointer_rtx);