Index: generic-morestack.c
===================================================================
--- generic-morestack.c	(revision 193263)
+++ generic-morestack.c	(working copy)
@@ -549,6 +549,7 @@ __generic_morestack (size_t *pframe_size
   char *to;
   void *ret;
   size_t i;
+  size_t aligned;
 
   current = __morestack_current_segment;
 
@@ -580,15 +581,19 @@ __generic_morestack (size_t *pframe_size
 
   *pframe_size = current->size - param_size;
 
+  /* Align the returned stack to a 32-byte boundary.  */
+  aligned = (param_size + 31) & ~ (size_t) 31;
+
 #ifdef STACK_GROWS_DOWNWARD
   {
     char *bottom = (char *) (current + 1) + current->size;
-    to = bottom - param_size;
-    ret = bottom - param_size;
+    to = bottom - aligned;
+    ret = bottom - aligned;
   }
 #else
   to = current + 1;
-  ret = (char *) (current + 1) + param_size;
+  to += aligned - param_size;
+  ret = (char *) (current + 1) + aligned;
 #endif
 
   /* We don't call memcpy to avoid worrying about the dynamic linker
Index: config/i386/morestack.S
===================================================================
--- config/i386/morestack.S	(revision 193263)
+++ config/i386/morestack.S	(working copy)
@@ -200,18 +200,19 @@ __morestack_non_split:
 
 	jb	2f			# Get more space if we need it.
 
-	# This breaks call/return prediction, as described above.
-	incq	8(%rsp)			# Increment the return address.
-
 	# If the instruction that we return to is
 	#   leaq  24(%rbp), %r11n
 	# then we have been called by a varargs function that expects
 	# %ebp to hold a real value.  That can only work if we do the
 	# full stack split routine.  FIXME: This is fragile.
 	movq	8(%rsp),%rax
+	incq	%rax			# Skip ret instruction in caller.
 	cmpl	$0x185d8d4c,(%rax)
 	je	2f
 
+	# This breaks call/return prediction, as described above.
+	incq	8(%rsp)			# Increment the return address.
+
 	popq	%rax			# Restore register.
 
 	.cfi_adjust_cfa_offset -8	# Adjust for popped register.
@@ -296,9 +297,13 @@ __morestack:
 	# argument size is pushed then the new stack frame size is
 	# pushed.
 
-	# Align stack to 16-byte boundary with enough space for saving
-	# registers and passing parameters to functions we call.
-	subl	$40,%esp
+	# In the body of a non-leaf function, the stack pointer will
+	# be aligned to a 16-byte boundary.  That is CFA + 12 in the
+	# stack picture above: (CFA + 12) % 16 == 0.  At this point we
+	# have %esp == CFA - 8, so %esp % 16 == 12.  We need some
+	# space for saving registers and passing parameters, and we
+	# need to wind up with %esp % 16 == 0.
+	subl	$44,%esp
 
 	# Because our cleanup code may need to clobber %ebx, we need
 	# to save it here so the unwinder can restore the value used
@@ -393,13 +398,15 @@ __morestack:
 
 	movl	%ebp,%esp		# Restore stack pointer.
 
+	# As before, we now have %esp % 16 == 12.
+
 	pushl	%eax			# Push return value on old stack.
 	pushl	%edx
-	subl	$8,%esp			# Align stack to 16-byte boundary.
+	subl	$4,%esp			# Align stack to 16-byte boundary.
 
 	call	__morestack_unblock_signals
 
-	addl	$8,%esp
+	addl	$4,%esp
 	popl	%edx			# Restore return value.
 	popl	%eax
 
@@ -485,15 +492,21 @@ __morestack:
 	pushq	%r9
 
 	pushq	%r11
-	pushq	$0			# For alignment.
+
+	# We entered morestack with the stack pointer aligned to a
+	# 16-byte boundary (the call to morestack's caller used 8
+	# bytes, and the call to morestack used 8 bytes).  We have now
+	# pushed 10 registers, so we are still aligned to a 16-byte
+	# boundary.
 
 	call	__morestack_block_signals
 
 	leaq	-8(%rbp),%rdi		# Address of new frame size.
 	leaq	24(%rbp),%rsi		# The caller's parameters.
-	addq	$8,%rsp
 	popq	%rdx			# The size of the parameters.
 
+	subq	$8,%rsp			# Align stack.
+
 	call	__generic_morestack
 
 	movq	-8(%rbp),%r10		# Reload modified frame size
@@ -564,6 +577,9 @@ __morestack:
 
 	movq	%rbp,%rsp		# Restore stack pointer.
 
+	# Now (%rsp & 16) == 8.
+
+	subq	$8,%rsp			# For alignment.
 	pushq	%rax			# Push return value on old stack.
 	pushq	%rdx
 
@@ -571,6 +587,7 @@ __morestack:
 
 	popq	%rdx			# Restore return value.
 	popq	%rax
+	addq	$8,%rsp
 
 	.cfi_remember_state
 	popq	%rbp
