Patchwork PATCH RFA: Split stack [5/7]: x86 backend support

login
register
mail settings
Submitter Ian Taylor
Date Sept. 23, 2010, 6:22 p.m.
Message ID <mcr1v8k5ngv.fsf@google.com>
Download mbox | patch
Permalink /patch/65564/
State New
Headers show

Comments

Ian Taylor - Sept. 23, 2010, 6:22 p.m.
Uros Bizjak <ubizjak@gmail.com> writes:

>> +
>> +;; Handle -fsplit-stack.
>> +
>
> The insertion has separated comment away from bit manipulation instructions.

Gah.  Fixed.

> These two patterns can be merged into one, so "split_stack_return"
> will conditionally output "ret" or "ret N", depending on operand[0] !=
> 0.

Done.

> Ideally, INVALID_REGNUM should be returned here to signal to the
> expander that something went wrong. The generic supporting code should
> detect that expander has FAILed and error out without generation of
> the stack prologue code.

I'm not sure that is all that useful, since the compiler already gives
an error message, but I implemented it.

> No negative logic, please.

Done.  My general rule is to put the simpler of the then/else blocks
first.  But I switched it to your preference.

> Probably, we can directly set scratch_reg, without move. In this case,
> the pattern will directly match lea pattern.

Done.

I discovered that I had not properly tested 32-bit mode, and that led to
the discovery that a couple of tweaks were needed to make the varargs
code work in 32-bit mode.  Those are in this updated patch as well.

Thanks for looking at it.

Ian


2010-09-21  Ian Lance Taylor  <iant@google.com>

	* config/i386/i386.c (ix86_option_override_internal): Don't set
	expand_builtin_va_start to NULL if -fsplit-stack.
	(ix86_function_regparm): Reduce local regparm by 1 for 32-bit
	-fsplit-stack.
	(ix86_va_start): If -fsplit-stack, get overflow pointer	from
	scratch register set by prologue.
	(ix86_code_end): If -fsplit-stack, call
	file_end_indicate_split_stack.
	(ix86_supports_split_stack): New static function.
	(SPLIT_STACK_AVAILABLE): Define.
	(split_stack_prologue_scratch_regno): New static function.
	(split_stack_fn): New static variable.
	(ix86_expand_split_stack_prologue): New function.
	(ix86_live_on_entry): New static function.
	(ix86_legitimate_address_p): Handle UNSPEC_STACK_CHECK.
	(output_pic_addr_const): Likewise.
	(i386_asm_output_addr_const_extra): Likewise.
	(ix86_expand_call): Change return type to rtx.  Return the new
	call instruction.
	(TARGET_SUPPORTS_SPLIT_STACK): Define.
	(TARGET_EXTRA_LIVE_ON_ENTRY): Define.
	* config/i386/i386.md (UNSPEC_STACK_CHECK): Define.
	(split_stack_prologue, split_stack_return): New insns.
	(split_stack_space_check): New insn.
	* config/i386/i386.h (struct machine_function): Add
	split_stack_varargs_pointer field.
	* config/i386/linux.h (TARGET_CAN_SPLIT_STACK): Define.
	(TARGET_THREAD_SPLIT_STACK_OFFSET): Define.
	* config/i386/linux64.h (TARGET_CAN_SPLIT_STACK): Define.
	(TARGET_THREAD_SPLIT_STACK_OFFSET): Define.
	* config/i386/i386-protos.h (ix86_expand_split_stack_prologue):
	Declare.
	(ix86_expand_call): Update declaration.

Patch

Index: gcc/config/i386/i386.h
===================================================================
--- gcc/config/i386/i386.h	(revision 164490)
+++ gcc/config/i386/i386.h	(working copy)
@@ -2281,6 +2281,13 @@  struct GTY(()) machine_function {
      has been computed for.  */
   int use_fast_prologue_epilogue_nregs;
 
+  /* For -fsplit-stack support: A stack local which holds a pointer to
+     the stack arguments for a function with a variable number of
+     arguments.  This is set at the start of the function and is used
+     to initialize the overflow_arg_area field of the va_list
+     structure.  */
+  rtx split_stack_varargs_pointer;
+
   /* This value is used for amd64 targets and specifies the current abi
      to be used. MS_ABI means ms abi. Otherwise SYSV_ABI means sysv abi.  */
   ENUM_BITFIELD(calling_abi) call_abi : 8;
Index: gcc/config/i386/linux.h
===================================================================
--- gcc/config/i386/linux.h	(revision 164490)
+++ gcc/config/i386/linux.h	(working copy)
@@ -218,4 +218,8 @@  along with GCC; see the file COPYING3.  
 #ifdef TARGET_LIBC_PROVIDES_SSP
 /* i386 glibc provides __stack_chk_guard in %gs:0x14.  */
 #define TARGET_THREAD_SSP_OFFSET	0x14
+
+/* We steal the last transactional memory word.  */
+#define TARGET_CAN_SPLIT_STACK
+#define TARGET_THREAD_SPLIT_STACK_OFFSET 0x30
 #endif
Index: gcc/config/i386/i386.md
===================================================================
--- gcc/config/i386/i386.md	(revision 164490)
+++ gcc/config/i386/i386.md	(working copy)
@@ -85,6 +85,7 @@ 
   UNSPEC_SET_RIP
   UNSPEC_SET_GOT_OFFSET
   UNSPEC_MEMORY_BLOCKAGE
+  UNSPEC_STACK_CHECK
 
   ;; TLS support
   UNSPEC_TP
@@ -11626,6 +11627,64 @@ 
   "leave"
   [(set_attr "type" "leave")])
 
+;; Handle -fsplit-stack.
+
+(define_expand "split_stack_prologue"
+  [(const_int 0)]
+  ""
+{
+  ix86_expand_split_stack_prologue ();
+  DONE;
+})
+
+;; In order to support the call/return predictor, we use a return
+;; instruction which the middle-end doesn't see.
+(define_insn "split_stack_return"
+  [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
+		    UNSPEC_STACK_CHECK)]
+  ""
+{
+  if (operands[0] == const0_rtx)
+    return "ret";
+  else
+    return "ret\t%0";
+}
+  [(set_attr "atom_unit" "jeu")
+   (set_attr "modrm" "0")
+   (set (attr "length")
+	(if_then_else (match_operand:SI 0 "const0_operand" "")
+		      (const_int 1)
+		      (const_int 3)))
+   (set (attr "length_immediate")
+	(if_then_else (match_operand:SI 0 "const0_operand" "")
+		      (const_int 0)
+		      (const_int 2)))])
+
+;; If there are operand 0 bytes available on the stack, jump to
+;; operand 1.
+
+(define_expand "split_stack_space_check"
+  [(set (pc) (if_then_else
+	      (ltu (minus (reg SP_REG)
+			  (match_operand 0 "register_operand" ""))
+		   (unspec [(const_int 0)] UNSPEC_STACK_CHECK))
+	      (label_ref (match_operand 1 "" ""))
+	      (pc)))]
+  ""
+{
+  rtx reg, size, limit;
+
+  reg = gen_reg_rtx (Pmode);
+  size = force_reg (Pmode, operands[0]);
+  emit_insn (gen_sub3_insn (reg, stack_pointer_rtx, size));
+  limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
+			  UNSPEC_STACK_CHECK);
+  limit = gen_rtx_MEM (Pmode, gen_rtx_CONST (Pmode, limit));
+  ix86_expand_branch (GEU, reg, limit, operands[1]);
+
+  DONE;
+})
+
 ;; Bit manipulation instructions.
 
 (define_expand "ffs<mode>2"
Index: gcc/config/i386/linux64.h
===================================================================
--- gcc/config/i386/linux64.h	(revision 164490)
+++ gcc/config/i386/linux64.h	(working copy)
@@ -123,4 +123,8 @@  see the files COPYING3 and COPYING.RUNTI
 /* i386 glibc provides __stack_chk_guard in %gs:0x14,
    x86_64 glibc provides it in %fs:0x28.  */
 #define TARGET_THREAD_SSP_OFFSET	(TARGET_64BIT ? 0x28 : 0x14)
+
+/* We steal the last transactional memory word.  */
+#define TARGET_CAN_SPLIT_STACK
+#define TARGET_THREAD_SPLIT_STACK_OFFSET (TARGET_64BIT ? 0x70 : 0x30)
 #endif
Index: gcc/config/i386/i386-protos.h
===================================================================
--- gcc/config/i386/i386-protos.h	(revision 164490)
+++ gcc/config/i386/i386-protos.h	(working copy)
@@ -29,6 +29,7 @@  extern void ix86_setup_frame_addresses (
 extern HOST_WIDE_INT ix86_initial_elimination_offset (int, int);
 extern void ix86_expand_prologue (void);
 extern void ix86_expand_epilogue (int);
+extern void ix86_expand_split_stack_prologue (void);
 
 extern void ix86_output_addr_vec_elt (FILE *, int);
 extern void ix86_output_addr_diff_elt (FILE *, int, int);
@@ -117,7 +118,7 @@  extern bool ix86_expand_int_vcond (rtx[]
 extern void ix86_expand_sse_unpack (rtx[], bool, bool);
 extern void ix86_expand_sse4_unpack (rtx[], bool, bool);
 extern bool ix86_expand_int_addcc (rtx[]);
-extern void ix86_expand_call (rtx, rtx, rtx, rtx, rtx, int);
+extern rtx ix86_expand_call (rtx, rtx, rtx, rtx, rtx, int);
 extern void x86_initialize_trampoline (rtx, rtx, rtx);
 extern rtx ix86_zero_extend_to_Pmode (rtx);
 extern void ix86_split_long_move (rtx[]);
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c	(revision 164490)
+++ gcc/config/i386/i386.c	(working copy)
@@ -1989,6 +1989,8 @@  static void ix86_add_new_builtins (int);
 static rtx ix86_expand_vec_perm_builtin (tree);
 static tree ix86_canonical_va_list_type (tree);
 static void predict_jump (int);
+static unsigned int split_stack_prologue_scratch_regno (void);
+static bool i386_asm_output_addr_const_extra (FILE *, rtx);
 
 enum ix86_function_specific_strings
 {
@@ -3649,7 +3651,7 @@  ix86_option_override_internal (bool main
 
   /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
      can be optimized to ap = __builtin_next_arg (0).  */
-  if (!TARGET_64BIT)
+  if (!TARGET_64BIT && !flag_split_stack)
     targetm.expand_builtin_va_start = NULL;
 
   if (TARGET_64BIT)
@@ -4890,6 +4892,10 @@  ix86_function_regparm (const_tree type, 
 	  if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
 	    local_regparm = 2;
 
+	  /* In 32-bit mode save a register for the split stack.  */
+	  if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
+	    local_regparm = 2;
+
 	  /* Each fixed register usage increases register pressure,
 	     so less registers should be used for argument passing.
 	     This functionality can be overriden by an explicit
@@ -7248,10 +7254,56 @@  ix86_va_start (tree valist, rtx nextarg)
   tree gpr, fpr, ovf, sav, t;
   tree type;
 
+  rtx ovf_rtx;
+
+  if (flag_split_stack
+      && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
+    {
+      unsigned int scratch_regno;
+
+      /* When we are splitting the stack, we can't refer to the stack
+	 arguments using internal_arg_pointer, because they may be on
+	 the old stack.  The split stack prologue will arrange to
+	 leave a pointer to the old stack arguments in a scratch
+	 register, which we here copy to a pseudo-register.  The split
+	 stack prologue can't set the pseudo-register directly because
+	 it (the prologue) runs before any registers have been saved.  */
+
+      scratch_regno = split_stack_prologue_scratch_regno ();
+      if (scratch_regno != INVALID_REGNUM)
+	{
+	  rtx reg, seq;
+
+	  reg = gen_reg_rtx (Pmode);
+	  cfun->machine->split_stack_varargs_pointer = reg;
+
+	  start_sequence ();
+	  emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
+	  seq = get_insns ();
+	  end_sequence ();
+
+	  push_topmost_sequence ();
+	  emit_insn_after (seq, entry_of_function ());
+	  pop_topmost_sequence ();
+	}
+    }
+
   /* Only 64bit target needs something special.  */
   if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
     {
-      std_expand_builtin_va_start (valist, nextarg);
+      if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
+	std_expand_builtin_va_start (valist, nextarg);
+      else
+	{
+	  rtx va_r, next;
+
+	  va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
+	  next = expand_binop (ptr_mode, add_optab,
+			       cfun->machine->split_stack_varargs_pointer,
+			       crtl->args.arg_offset_rtx,
+			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
+	  convert_move (va_r, next, 0);
+	}
       return;
     }
 
@@ -7297,7 +7349,11 @@  ix86_va_start (tree valist, rtx nextarg)
 
   /* Find the overflow area.  */
   type = TREE_TYPE (ovf);
-  t = make_tree (type, crtl->args.internal_arg_pointer);
+  if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
+    ovf_rtx = crtl->args.internal_arg_pointer;
+  else
+    ovf_rtx = cfun->machine->split_stack_varargs_pointer;
+  t = make_tree (type, ovf_rtx);
   if (words != 0)
     t = build2 (POINTER_PLUS_EXPR, type, t,
 	        size_int (words * UNITS_PER_WORD));
@@ -8037,6 +8093,9 @@  ix86_code_end (void)
       set_cfun (NULL);
       current_function_decl = NULL;
     }
+
+  if (flag_split_stack)
+    file_end_indicate_split_stack ();
 }
 
 /* Emit code for the SET_GOT patterns.  */
@@ -8339,6 +8398,29 @@  ix86_builtin_setjmp_frame_value (void)
   return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
 }
 
+/* On the x86 -fsplit-stack and -fstack-protector both use the same
+   field in the TCB, so they can not be used together.  */
+
+static bool
+ix86_supports_split_stack (bool report ATTRIBUTE_UNUSED)
+{
+  bool ret = true;
+
+#ifndef TARGET_THREAD_SPLIT_STACK_OFFSET
+  if (report)
+    error ("%<-fsplit-stack%> currently only supported on GNU/Linux");
+  ret = false;
+#endif
+
+  return ret;
+}
+
+/* When using -fsplit-stack, the allocation routines set a field in
+   the TCB to the bottom of the stack plus this much space, measured
+   in bytes.  */
+
+#define SPLIT_STACK_AVAILABLE 256
+
 /* Fill structure ix86_frame about frame of currently computed function.  */
 
 static void
@@ -9692,7 +9774,7 @@  ix86_expand_prologue (void)
     }
   else
     {
-      rtx eax = gen_rtx_REG (Pmode, AX_REG);;
+      rtx eax = gen_rtx_REG (Pmode, AX_REG);
       rtx r10 = NULL;
       bool eax_live = false;
       bool r10_live = false;
@@ -10317,6 +10399,287 @@  ix86_output_function_epilogue (FILE *fil
 #endif
 
 }
+
+/* Return a scratch register to use in the split stack prologue.  The
+   split stack prologue is used for -fsplit-stack.  It is the first
+   instructions in the function, even before the regular prologue.
+   The scratch register can be any caller-saved register which is not
+   used for parameters or for the static chain.  */
+
+static unsigned int
+split_stack_prologue_scratch_regno (void)
+{
+  if (TARGET_64BIT)
+    return R11_REG;
+  else
+    {
+      bool is_fastcall;
+      int regparm;
+
+      is_fastcall = (lookup_attribute ("fastcall",
+				       TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
+		     != NULL);
+      regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
+
+      if (is_fastcall)
+	{
+	  if (DECL_STATIC_CHAIN (cfun->decl))
+	    {
+	      sorry ("-fsplit-stack does not support fastcall with "
+		     "nested function");
+	      return INVALID_REGNUM;
+	    }
+	  return AX_REG;
+	}
+      else if (regparm < 3)
+	{
+	  if (!DECL_STATIC_CHAIN (cfun->decl))
+	    return CX_REG;
+	  else
+	    {
+	      if (regparm >= 2)
+		{
+		  sorry ("-fsplit-stack does not support 2 register "
+			 " parameters for a nested function");
+		  return INVALID_REGNUM;
+		}
+	      return DX_REG;
+	    }
+	}
+      else
+	{
+	  /* FIXME: We could make this work by pushing a register
+	     around the addition and comparison.  */
+	  sorry ("-fsplit-stack does not support 3 register parameters");
+	  return INVALID_REGNUM;
+	}
+    }
+}
+
+/* A SYMBOL_REF for the function which allocates new stackspace for
+   -fsplit-stack.  */
+
+static GTY(()) rtx split_stack_fn;
+
+/* Handle -fsplit-stack.  These are the first instructions in the
+   function, even before the regular prologue.  */
+
+void
+ix86_expand_split_stack_prologue (void)
+{
+  struct ix86_frame frame;
+  HOST_WIDE_INT allocate;
+  int args_size;
+  rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
+  rtx scratch_reg = NULL_RTX;
+  rtx varargs_label = NULL_RTX;
+
+  gcc_assert (flag_split_stack && reload_completed);
+
+  ix86_finalize_stack_realign_flags ();
+  ix86_compute_frame_layout (&frame);
+  allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
+
+  /* This is the label we will branch to if we have enough stack
+     space.  We expect the basic block reordering pass to reverse this
+     branch if optimizing, so that we branch in the unlikely case.  */
+  label = gen_label_rtx ();
+
+  /* We need to compare the stack pointer minus the frame size with
+     the stack boundary in the TCB.  The stack boundary always gives
+     us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
+     can compare directly.  Otherwise we need to do an addition.  */
+
+  limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
+			  UNSPEC_STACK_CHECK);
+  limit = gen_rtx_CONST (Pmode, limit);
+  limit = gen_rtx_MEM (Pmode, limit);
+  if (allocate < SPLIT_STACK_AVAILABLE)
+    current = stack_pointer_rtx;
+  else
+    {
+      unsigned int scratch_regno;
+      rtx offset;
+
+      /* We need a scratch register to hold the stack pointer minus
+	 the required frame size.  Since this is the very start of the
+	 function, the scratch register can be any caller-saved
+	 register which is not used for parameters.  */
+      offset = GEN_INT (- allocate);
+      scratch_regno = split_stack_prologue_scratch_regno ();
+      if (scratch_regno == INVALID_REGNUM)
+	return;
+      scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
+      if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
+	{
+	  /* We don't use ix86_gen_add3 in this case because it will
+	     want to split to lea, but when not optimizing the insn
+	     will not be split after this point.  */
+	  emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
+				  gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+						offset)));
+	}
+      else
+	{
+	  emit_move_insn (scratch_reg, offset);
+	  emit_insn (gen_adddi3 (scratch_reg, scratch_reg,
+				 stack_pointer_rtx));
+	}
+      current = scratch_reg;
+    }
+
+  ix86_expand_branch (GEU, current, limit, label);
+  jump_insn = get_last_insn ();
+  JUMP_LABEL (jump_insn) = label;
+
+  /* Mark the jump as very likely to be taken.  */
+  add_reg_note (jump_insn, REG_BR_PROB,
+		GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
+
+  /* Get more stack space.  We pass in the desired stack space and the
+     size of the arguments to copy to the new stack.  In 32-bit mode
+     we push the parameters; __morestack will return on a new stack
+     anyhow.  In 64-bit mode we pass the parameters in r10 and
+     r11.  */
+  allocate_rtx = GEN_INT (allocate);
+  args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
+  call_fusage = NULL_RTX;
+  if (TARGET_64BIT)
+    {
+      rtx reg;
+
+      reg = gen_rtx_REG (Pmode, R10_REG);
+
+      /* If this function uses a static chain, it will be in %r10.
+	 Preserve it across the call to __morestack.  */
+      if (DECL_STATIC_CHAIN (cfun->decl))
+	{
+	  rtx rax;
+
+	  rax = gen_rtx_REG (Pmode, AX_REG);
+	  emit_move_insn (rax, reg);
+	  use_reg (&call_fusage, rax);
+	}
+
+      emit_move_insn (reg, allocate_rtx);
+      use_reg (&call_fusage, reg);
+      reg = gen_rtx_REG (Pmode, R11_REG);
+      emit_move_insn (reg, GEN_INT (args_size));
+      use_reg (&call_fusage, reg);
+    }
+  else
+    {
+      emit_insn (gen_push (GEN_INT (args_size)));
+      emit_insn (gen_push (allocate_rtx));
+    }
+  if (split_stack_fn == NULL_RTX)
+    split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
+  call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, split_stack_fn),
+				GEN_INT (UNITS_PER_WORD), constm1_rtx,
+				NULL_RTX, 0);
+  add_function_usage_to (call_insn, call_fusage);
+
+  /* In order to make call/return prediction work right, we now need
+     to execute a return instruction.  See
+     libgcc/config/i386/morestack.S for the details on how this works.
+
+     In order to support backtracing, we need to set the CFA around
+     the call, so that the unwinder knows how to correctly pick up the
+     return address.  We set the CFA around the call because the
+     unwinder looks up to the point of the call but not after the
+     call.  */
+  add_reg_note (call_insn, REG_CFA_TEMPORARY,
+		gen_rtx_PLUS (Pmode, gen_rtx_REG (Pmode, SP_REG),
+			      GEN_INT (UNITS_PER_WORD)));
+  RTX_FRAME_RELATED_P (call_insn) = 1;
+
+  /* For flow purposes gcc must not see this as a return
+     instruction--we need control flow to continue at the subsequent
+     label.  Therefore, we use an unspec.  */
+  gcc_assert (crtl->args.pops_args < 65536);
+  emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
+
+  /* If we are in 64-bit mode and this function uses a static chain,
+     we saved %r10 in %rax before calling _morestack.  */
+  if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
+    emit_move_insn (gen_rtx_REG (Pmode, R10_REG),
+		    gen_rtx_REG (Pmode, AX_REG));
+
+  /* If this function calls va_start, we need to store a pointer to
+     the arguments on the old stack, because they may not have been
+     all copied to the new stack.  At this point the old stack can be
+     found at the frame pointer value used by __morestack, because
+     __morestack has set that up before calling back to us.  Here we
+     store that pointer in a scratch register, and in
+     ix86_expand_prologue we store the scratch register in a stack
+     slot.  */
+  if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
+    {
+      unsigned int scratch_regno;
+      rtx frame_reg;
+      int words;
+
+      scratch_regno = split_stack_prologue_scratch_regno ();
+      scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
+      frame_reg = gen_rtx_REG (Pmode, BP_REG);
+
+      /* 64-bit:
+	 fp -> old fp value
+	       return address within this function
+	       return address of caller of this function
+	       stack arguments
+	 So we add three words to get to the stack arguments.
+
+	 32-bit:
+	 fp -> old fp value
+	       return address within this function
+               first argument to __morestack
+               second argument to __morestack
+               return address of caller of this function
+               stack arguments
+         So we add five words to get to the stack arguments.
+      */
+      words = TARGET_64BIT ? 3 : 5;
+      emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
+			      gen_rtx_PLUS (Pmode, frame_reg,
+					    GEN_INT (words * UNITS_PER_WORD))));
+
+      varargs_label = gen_label_rtx ();
+      emit_jump_insn (gen_jump (varargs_label));
+      JUMP_LABEL (get_last_insn ()) = varargs_label;
+
+      emit_barrier ();
+    }
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  /* If this function calls va_start, we now have to set the scratch
+     register for the case where we do not call __morestack.  In this
+     case we need to set it based on the stack pointer.  */
+  if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
+    {
+      emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
+			      gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+					    GEN_INT (UNITS_PER_WORD))));
+
+      emit_label (varargs_label);
+      LABEL_NUSES (varargs_label) = 1;
+    }
+}
+
+/* We may have to tell the dataflow pass that the split stack prologue
+   is initializing a scratch register.  */
+
+static void
+ix86_live_on_entry (bitmap regs)
+{
+  if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
+    {
+      gcc_assert (flag_split_stack);
+      bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
+    }
+}
 
 /* Extract the parts of an RTL expression that is a valid memory address
    for an instruction.  Return 0 if the structure of the address is
@@ -10962,6 +11325,10 @@  ix86_legitimate_address_p (enum machine_
 	  case UNSPEC_DTPOFF:
 	    break;
 
+	  case UNSPEC_STACK_CHECK:
+	    gcc_assert (flag_split_stack);
+	    break;
+
 	  default:
 	    /* Invalid address unspec.  */
 	    return false;
@@ -11853,6 +12220,13 @@  output_pic_addr_const (FILE *file, rtx x
       break;
 
      case UNSPEC:
+       if (XINT (x, 1) == UNSPEC_STACK_CHECK)
+	 {
+	   bool f = i386_asm_output_addr_const_extra (file, x);
+	   gcc_assert (f);
+	   break;
+	 }
+
        gcc_assert (XVECLEN (x, 0) == 1);
        output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
        switch (XINT (x, 1))
@@ -13256,6 +13630,22 @@  i386_asm_output_addr_const_extra (FILE *
       break;
 #endif
 
+    case UNSPEC_STACK_CHECK:
+      {
+	int offset;
+
+	gcc_assert (flag_split_stack);
+
+#ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
+	offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
+#else
+	gcc_unreachable ();
+#endif
+
+	fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
+      }
+      break;
+
     default:
       return false;
     }
@@ -20342,7 +20732,7 @@  construct_plt_address (rtx symbol)
   return tmp;
 }
 
-void
+rtx
 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
 		  rtx callarg2,
 		  rtx pop, int sibcall)
@@ -20433,6 +20823,8 @@  ix86_expand_call (rtx retval, rtx fnaddr
   call = emit_call_insn (call);
   if (use)
     CALL_INSN_FUNCTION_USAGE (call) = use;
+
+  return call;
 }
 
 
@@ -32839,6 +33231,9 @@  ix86_units_per_simd_word (enum machine_m
 #undef TARGET_STACK_PROTECT_FAIL
 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
 
+#undef TARGET_SUPPORTS_SPLIT_STACK
+#define TARGET_SUPPORTS_SPLIT_STACK ix86_supports_split_stack
+
 #undef TARGET_FUNCTION_VALUE
 #define TARGET_FUNCTION_VALUE ix86_function_value
 
@@ -32897,6 +33292,9 @@  ix86_units_per_simd_word (enum machine_m
 #undef TARGET_CAN_ELIMINATE
 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
 
+#undef TARGET_EXTRA_LIVE_ON_ENTRY
+#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
+
 #undef TARGET_ASM_CODE_END
 #define TARGET_ASM_CODE_END ix86_code_end