Patchwork [RFA/ARM,02/02] : ARM epilogue in RTL

login
register
mail settings
Submitter Sameera Deshpande
Date Dec. 1, 2011, 11:57 a.m.
Message ID <1322740649.20873.61.camel@e102549-lin.cambridge.arm.com>
Download mbox | patch
Permalink /patch/128686/
State New
Headers show

Comments

Sameera Deshpande - Dec. 1, 2011, 11:57 a.m.
On Tue, 2011-11-22 at 10:37 +0000, Ramana Radhakrishnan wrote:
> >>
> >> Though I don't fully understand this patch, I think it is unnecessary to consider turning on Thumb2 for iwmmxt,
> >> because there is no chip that supports WMMX instructions in Thumb2 mode.
> 
> Xinyu: I seem to have mis-remembered that one of your patches was
> turning on Thumb2 for wMMX.
> 
> >
> > Ramana, in that case, should I add the change you suggested in ARM RTL
> > epilogue patch only?
> 
> The comment in Thumb2 epilogues should remain and yes - it should be
> added to the ARM RTL epilogue patch only. 

Ramana,

Please find attached updated patch for ARM RTL epilogue.
The iwmmxt register restore was already implemented in the patch. Added
CFA restore notes for iwmmxt and FPA registers.

Bootstrap and gcc-check are passing without regression.

--
Ramana Radhakrishnan - Dec. 9, 2011, 12:06 p.m.
>
> Bootstrap and gcc-check are passing without regression.

Sorry, it's taken me a while to get to this, but I looked at this for
some time last night and think this needs a bit of work.

1. Can the code for TARGET_APCS_FRAME be factored out into a separate
function from arm_expand_epilogue?

   Essentially factor out from arm_expand_epilogue:

 if (TARGET_APCS_FRAME && frame_pointer_needed && TARGET_ARM)
+    {
.......

+    }

2. I'm not happy about the amount of very similar code between ARM and
Thumb2 state as far as arm_expand_return / thumb2_expand_return and
arm_expand_epilogue / thumb2_expand_epilogue go. Could you try to
merge the two together into the same functions?

3. The return sequence of loading into PC from the stack has
disappeared. A simple test appeared to generate:

	@ sp needed for prologue
	ldr	lr, [sp], #4
	bx	lr

instead of ldmfd sp!, {pc}, at least in the single return cases.

Which is still correct but sub-optimal.

4. A number of cases around *cond_return and *cond_return_inverted are
removed. This removes the chance of getting conditional returns from
the compiler: marking the return pattern as predicable doesn't really
generate conditional returns; you really need the if_then_else form.


5. This hunk here :

@@ -22741,6 +22646,7 @@ thumb2_expand_epilogue (void)
              return-address' instruction.  Instead, pop LR in PC.  */
           if (ARM_FUNC_TYPE (func_type) == ARM_FT_NORMAL
               && !IS_STACKALIGN (func_type)
+              && !is_sibling
               && crtl->args.pretend_args_size == 0
               && saved_regs_mask & (1 << LR_REGNUM)
               && !crtl->calls_eh_return)

This would be better off in the original Thumb2 RTL epilogues patch?


6. This has missed out the hunk that deals with an ARM10 VFPr1 bug --

-  /* Workaround ARM10 VFPr1 bug.  */
-  if (count == 2 && !arm_arch6)
-    {
-      if (reg == 15)
-       reg--;
-      count++;
-    }

I'm not sure how many cores had this issue in the wild, but the
compiler has never generated things that went across from d15-d16 - We
seem to have always ended up loading something more into one of the
registers.

Next time please also submit a changelog entry - it makes it easier
to check and review as well. It's largely similar to your original
changelog, but it's easier to find that in the mail with the patch
rather than ferret around in older mails.


cheers
Ramana

Patch

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 2c38883..67fcdac 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -28,7 +28,8 @@  extern int use_return_insn (int, rtx);
 extern enum reg_class arm_regno_class (int);
 extern void arm_load_pic_register (unsigned long);
 extern int arm_volatile_func (void);
-extern const char *arm_output_epilogue (rtx);
+extern void arm_expand_epilogue (bool);
+extern void arm_expand_return (bool);
 extern void arm_expand_prologue (void);
 extern const char *arm_strip_name_encoding (const char *);
 extern void arm_asm_output_labelref (FILE *, const char *);
@@ -181,7 +182,7 @@  extern const char *thumb1_unexpanded_epilogue (void);
 extern void thumb1_expand_prologue (void);
 extern void thumb1_expand_epilogue (void);
 extern const char *thumb1_output_interwork (void);
-extern void thumb2_expand_epilogue (void);
+extern void thumb2_expand_epilogue (bool);
 extern void thumb2_output_return (rtx);
 extern void thumb2_expand_return (void);
 #ifdef TREE_CODE
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 40c8b44..29ed9e9 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -13507,86 +13507,6 @@  fp_const_from_val (REAL_VALUE_TYPE *r)
   gcc_unreachable ();
 }
 
-/* Output the operands of a LDM/STM instruction to STREAM.
-   MASK is the ARM register set mask of which only bits 0-15 are important.
-   REG is the base register, either the frame pointer or the stack pointer,
-   INSTR is the possibly suffixed load or store instruction.
-   RFE is nonzero if the instruction should also copy spsr to cpsr.  */
-
-static void
-print_multi_reg (FILE *stream, const char *instr, unsigned reg,
-		 unsigned long mask, int rfe)
-{
-  unsigned i;
-  bool not_first = FALSE;
-
-  gcc_assert (!rfe || (mask & (1 << PC_REGNUM)));
-  fputc ('\t', stream);
-  asm_fprintf (stream, instr, reg);
-  fputc ('{', stream);
-
-  for (i = 0; i <= LAST_ARM_REGNUM; i++)
-    if (mask & (1 << i))
-      {
-	if (not_first)
-	  fprintf (stream, ", ");
-
-	asm_fprintf (stream, "%r", i);
-	not_first = TRUE;
-      }
-
-  if (rfe)
-    fprintf (stream, "}^\n");
-  else
-    fprintf (stream, "}\n");
-}
-
-
-/* Output a FLDMD instruction to STREAM.
-   BASE if the register containing the address.
-   REG and COUNT specify the register range.
-   Extra registers may be added to avoid hardware bugs.
-
-   We output FLDMD even for ARMv5 VFP implementations.  Although
-   FLDMD is technically not supported until ARMv6, it is believed
-   that all VFP implementations support its use in this context.  */
-
-static void
-vfp_output_fldmd (FILE * stream, unsigned int base, int reg, int count)
-{
-  int i;
-
-  /* Workaround ARM10 VFPr1 bug.  */
-  if (count == 2 && !arm_arch6)
-    {
-      if (reg == 15)
-	reg--;
-      count++;
-    }
-
-  /* FLDMD may not load more than 16 doubleword registers at a time. Split the
-     load into multiple parts if we have to handle more than 16 registers.  */
-  if (count > 16)
-    {
-      vfp_output_fldmd (stream, base, reg, 16);
-      vfp_output_fldmd (stream, base, reg + 16, count - 16);
-      return;
-    }
-
-  fputc ('\t', stream);
-  asm_fprintf (stream, "fldmfdd\t%r!, {", base);
-
-  for (i = reg; i < reg + count; i++)
-    {
-      if (i > reg)
-	fputs (", ", stream);
-      asm_fprintf (stream, "d%d", i);
-    }
-  fputs ("}\n", stream);
-
-}
-
-
 /* Output the assembly for a store multiple.  */
 
 const char *
@@ -15289,10 +15209,7 @@  output_return_instruction (rtx operand, int really_return, int reverse)
 {
   char conditional[10];
   char instr[100];
-  unsigned reg;
-  unsigned long live_regs_mask;
   unsigned long func_type;
-  arm_stack_offsets *offsets;
 
   func_type = arm_current_func_type ();
 
@@ -15319,147 +15236,10 @@  output_return_instruction (rtx operand, int really_return, int reverse)
       return "";
     }
 
-  gcc_assert (!cfun->calls_alloca || really_return);
-
   sprintf (conditional, "%%?%%%c0", reverse ? 'D' : 'd');
 
   cfun->machine->return_used_this_function = 1;
 
-  offsets = arm_get_frame_offsets ();
-  live_regs_mask = offsets->saved_regs_mask;
-
-  if (live_regs_mask)
-    {
-      const char * return_reg;
-
-      /* If we do not have any special requirements for function exit
-	 (e.g. interworking) then we can load the return address
-	 directly into the PC.  Otherwise we must load it into LR.  */
-      if (really_return
-	  && (IS_INTERRUPT (func_type) || !TARGET_INTERWORK))
-	return_reg = reg_names[PC_REGNUM];
-      else
-	return_reg = reg_names[LR_REGNUM];
-
-      if ((live_regs_mask & (1 << IP_REGNUM)) == (1 << IP_REGNUM))
-	{
-	  /* There are three possible reasons for the IP register
-	     being saved.  1) a stack frame was created, in which case
-	     IP contains the old stack pointer, or 2) an ISR routine
-	     corrupted it, or 3) it was saved to align the stack on
-	     iWMMXt.  In case 1, restore IP into SP, otherwise just
-	     restore IP.  */
-	  if (frame_pointer_needed)
-	    {
-	      live_regs_mask &= ~ (1 << IP_REGNUM);
-	      live_regs_mask |=   (1 << SP_REGNUM);
-	    }
-	  else
-	    gcc_assert (IS_INTERRUPT (func_type) || TARGET_REALLY_IWMMXT);
-	}
-
-      /* On some ARM architectures it is faster to use LDR rather than
-	 LDM to load a single register.  On other architectures, the
-	 cost is the same.  In 26 bit mode, or for exception handlers,
-	 we have to use LDM to load the PC so that the CPSR is also
-	 restored.  */
-      for (reg = 0; reg <= LAST_ARM_REGNUM; reg++)
-	if (live_regs_mask == (1U << reg))
-	  break;
-
-      if (reg <= LAST_ARM_REGNUM
-	  && (reg != LR_REGNUM
-	      || ! really_return
-	      || ! IS_INTERRUPT (func_type)))
-	{
-	  sprintf (instr, "ldr%s\t%%|%s, [%%|sp], #4", conditional,
-		   (reg == LR_REGNUM) ? return_reg : reg_names[reg]);
-	}
-      else
-	{
-	  char *p;
-	  int first = 1;
-
-	  /* Generate the load multiple instruction to restore the
-	     registers.  Note we can get here, even if
-	     frame_pointer_needed is true, but only if sp already
-	     points to the base of the saved core registers.  */
-	  if (live_regs_mask & (1 << SP_REGNUM))
-	    {
-	      unsigned HOST_WIDE_INT stack_adjust;
-
-	      stack_adjust = offsets->outgoing_args - offsets->saved_regs;
-	      gcc_assert (stack_adjust == 0 || stack_adjust == 4);
-
-	      if (stack_adjust && arm_arch5 && TARGET_ARM)
-		if (TARGET_UNIFIED_ASM)
-		  sprintf (instr, "ldmib%s\t%%|sp, {", conditional);
-		else
-		  sprintf (instr, "ldm%sib\t%%|sp, {", conditional);
-	      else
-		{
-		  /* If we can't use ldmib (SA110 bug),
-		     then try to pop r3 instead.  */
-		  if (stack_adjust)
-		    live_regs_mask |= 1 << 3;
-
-		  if (TARGET_UNIFIED_ASM)
-		    sprintf (instr, "ldmfd%s\t%%|sp, {", conditional);
-		  else
-		    sprintf (instr, "ldm%sfd\t%%|sp, {", conditional);
-		}
-	    }
-	  else
-	    if (TARGET_UNIFIED_ASM)
-	      sprintf (instr, "pop%s\t{", conditional);
-	    else
-	      sprintf (instr, "ldm%sfd\t%%|sp!, {", conditional);
-
-	  p = instr + strlen (instr);
-
-	  for (reg = 0; reg <= SP_REGNUM; reg++)
-	    if (live_regs_mask & (1 << reg))
-	      {
-		int l = strlen (reg_names[reg]);
-
-		if (first)
-		  first = 0;
-		else
-		  {
-		    memcpy (p, ", ", 2);
-		    p += 2;
-		  }
-
-		memcpy (p, "%|", 2);
-		memcpy (p + 2, reg_names[reg], l);
-		p += l + 2;
-	      }
-
-	  if (live_regs_mask & (1 << LR_REGNUM))
-	    {
-	      sprintf (p, "%s%%|%s}", first ? "" : ", ", return_reg);
-	      /* If returning from an interrupt, restore the CPSR.  */
-	      if (IS_INTERRUPT (func_type))
-		strcat (p, "^");
-	    }
-	  else
-	    strcpy (p, "}");
-	}
-
-      output_asm_insn (instr, & operand);
-
-      /* See if we need to generate an extra instruction to
-	 perform the actual function return.  */
-      if (really_return
-	  && func_type != ARM_FT_INTERWORKED
-	  && (live_regs_mask & (1 << LR_REGNUM)) != 0)
-	{
-	  /* The return has already been handled
-	     by loading the LR into the PC.  */
-	  really_return = 0;
-	}
-    }
-
   if (really_return)
     {
       switch ((int) ARM_FUNC_TYPE (func_type))
@@ -15598,451 +15378,6 @@  arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size)
 
 }
 
-const char *
-arm_output_epilogue (rtx sibling)
-{
-  int reg;
-  unsigned long saved_regs_mask;
-  unsigned long func_type;
-  /* Floats_offset is the offset from the "virtual" frame.  In an APCS
-     frame that is $fp + 4 for a non-variadic function.  */
-  int floats_offset = 0;
-  rtx operands[3];
-  FILE * f = asm_out_file;
-  unsigned int lrm_count = 0;
-  int really_return = (sibling == NULL);
-  int start_reg;
-  arm_stack_offsets *offsets;
-
-  /* If we have already generated the return instruction
-     then it is futile to generate anything else.  */
-  if (use_return_insn (FALSE, sibling) &&
-      (cfun->machine->return_used_this_function != 0))
-    return "";
-
-  func_type = arm_current_func_type ();
-
-  if (IS_NAKED (func_type))
-    /* Naked functions don't have epilogues.  */
-    return "";
-
-  if (IS_VOLATILE (func_type) && TARGET_ABORT_NORETURN)
-    {
-      rtx op;
-
-      /* A volatile function should never return.  Call abort.  */
-      op = gen_rtx_SYMBOL_REF (Pmode, NEED_PLT_RELOC ? "abort(PLT)" : "abort");
-      assemble_external_libcall (op);
-      output_asm_insn ("bl\t%a0", &op);
-
-      return "";
-    }
-
-  /* If we are throwing an exception, then we really must be doing a
-     return, so we can't tail-call.  */
-  gcc_assert (!crtl->calls_eh_return || really_return);
-
-  offsets = arm_get_frame_offsets ();
-  saved_regs_mask = offsets->saved_regs_mask;
-
-  if (TARGET_IWMMXT)
-    lrm_count = bit_count (saved_regs_mask);
-
-  floats_offset = offsets->saved_args;
-  /* Compute how far away the floats will be.  */
-  for (reg = 0; reg <= LAST_ARM_REGNUM; reg++)
-    if (saved_regs_mask & (1 << reg))
-      floats_offset += 4;
-
-  if (TARGET_APCS_FRAME && frame_pointer_needed && TARGET_ARM)
-    {
-      /* This variable is for the Virtual Frame Pointer, not VFP regs.  */
-      int vfp_offset = offsets->frame;
-
-      if (TARGET_FPA_EMU2)
-	{
-	  for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--)
-	    if (df_regs_ever_live_p (reg) && !call_used_regs[reg])
-	      {
-		floats_offset += 12;
-		asm_fprintf (f, "\tldfe\t%r, [%r, #-%d]\n",
-			     reg, FP_REGNUM, floats_offset - vfp_offset);
-	      }
-	}
-      else
-	{
-	  start_reg = LAST_FPA_REGNUM;
-
-	  for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--)
-	    {
-	      if (df_regs_ever_live_p (reg) && !call_used_regs[reg])
-		{
-		  floats_offset += 12;
-
-		  /* We can't unstack more than four registers at once.  */
-		  if (start_reg - reg == 3)
-		    {
-		      asm_fprintf (f, "\tlfm\t%r, 4, [%r, #-%d]\n",
-			           reg, FP_REGNUM, floats_offset - vfp_offset);
-		      start_reg = reg - 1;
-		    }
-		}
-	      else
-		{
-		  if (reg != start_reg)
-		    asm_fprintf (f, "\tlfm\t%r, %d, [%r, #-%d]\n",
-				 reg + 1, start_reg - reg,
-				 FP_REGNUM, floats_offset - vfp_offset);
-		  start_reg = reg - 1;
-		}
-	    }
-
-	  /* Just in case the last register checked also needs unstacking.  */
-	  if (reg != start_reg)
-	    asm_fprintf (f, "\tlfm\t%r, %d, [%r, #-%d]\n",
-			 reg + 1, start_reg - reg,
-			 FP_REGNUM, floats_offset - vfp_offset);
-	}
-
-      if (TARGET_HARD_FLOAT && TARGET_VFP)
-	{
-	  int saved_size;
-
-	  /* The fldmd insns do not have base+offset addressing
-             modes, so we use IP to hold the address.  */
-	  saved_size = arm_get_vfp_saved_size ();
-
-	  if (saved_size > 0)
-	    {
-	      floats_offset += saved_size;
-	      asm_fprintf (f, "\tsub\t%r, %r, #%d\n", IP_REGNUM,
-			   FP_REGNUM, floats_offset - vfp_offset);
-	    }
-	  start_reg = FIRST_VFP_REGNUM;
-	  for (reg = FIRST_VFP_REGNUM; reg < LAST_VFP_REGNUM; reg += 2)
-	    {
-	      if ((!df_regs_ever_live_p (reg) || call_used_regs[reg])
-		  && (!df_regs_ever_live_p (reg + 1) || call_used_regs[reg + 1]))
-		{
-		  if (start_reg != reg)
-		    vfp_output_fldmd (f, IP_REGNUM,
-				      (start_reg - FIRST_VFP_REGNUM) / 2,
-				      (reg - start_reg) / 2);
-		  start_reg = reg + 2;
-		}
-	    }
-	  if (start_reg != reg)
-	    vfp_output_fldmd (f, IP_REGNUM,
-			      (start_reg - FIRST_VFP_REGNUM) / 2,
-			      (reg - start_reg) / 2);
-	}
-
-      if (TARGET_IWMMXT)
-	{
-	  /* The frame pointer is guaranteed to be non-double-word aligned.
-	     This is because it is set to (old_stack_pointer - 4) and the
-	     old_stack_pointer was double word aligned.  Thus the offset to
-	     the iWMMXt registers to be loaded must also be non-double-word
-	     sized, so that the resultant address *is* double-word aligned.
-	     We can ignore floats_offset since that was already included in
-	     the live_regs_mask.  */
-	  lrm_count += (lrm_count % 2 ? 2 : 1);
-
-	  for (reg = LAST_IWMMXT_REGNUM; reg >= FIRST_IWMMXT_REGNUM; reg--)
-	    if (df_regs_ever_live_p (reg) && !call_used_regs[reg])
-	      {
-		asm_fprintf (f, "\twldrd\t%r, [%r, #-%d]\n",
-			     reg, FP_REGNUM, lrm_count * 4);
-		lrm_count += 2;
-	      }
-	}
-
-      /* saved_regs_mask should contain the IP, which at the time of stack
-	 frame generation actually contains the old stack pointer.  So a
-	 quick way to unwind the stack is just pop the IP register directly
-	 into the stack pointer.  */
-      gcc_assert (saved_regs_mask & (1 << IP_REGNUM));
-      saved_regs_mask &= ~ (1 << IP_REGNUM);
-      saved_regs_mask |=   (1 << SP_REGNUM);
-
-      /* There are two registers left in saved_regs_mask - LR and PC.  We
-	 only need to restore the LR register (the return address), but to
-	 save time we can load it directly into the PC, unless we need a
-	 special function exit sequence, or we are not really returning.  */
-      if (really_return
-	  && ARM_FUNC_TYPE (func_type) == ARM_FT_NORMAL
-	  && !crtl->calls_eh_return)
-	/* Delete the LR from the register mask, so that the LR on
-	   the stack is loaded into the PC in the register mask.  */
-	saved_regs_mask &= ~ (1 << LR_REGNUM);
-      else
-	saved_regs_mask &= ~ (1 << PC_REGNUM);
-
-      /* We must use SP as the base register, because SP is one of the
-         registers being restored.  If an interrupt or page fault
-         happens in the ldm instruction, the SP might or might not
-         have been restored.  That would be bad, as then SP will no
-         longer indicate the safe area of stack, and we can get stack
-         corruption.  Using SP as the base register means that it will
-         be reset correctly to the original value, should an interrupt
-         occur.  If the stack pointer already points at the right
-         place, then omit the subtraction.  */
-      if (offsets->outgoing_args != (1 + (int) bit_count (saved_regs_mask))
-	  || cfun->calls_alloca)
-	asm_fprintf (f, "\tsub\t%r, %r, #%d\n", SP_REGNUM, FP_REGNUM,
-		     4 * bit_count (saved_regs_mask));
-      print_multi_reg (f, "ldmfd\t%r, ", SP_REGNUM, saved_regs_mask, 0);
-
-      if (IS_INTERRUPT (func_type))
-	/* Interrupt handlers will have pushed the
-	   IP onto the stack, so restore it now.  */
-	print_multi_reg (f, "ldmfd\t%r!, ", SP_REGNUM, 1 << IP_REGNUM, 0);
-    }
-  else
-    {
-      /* This branch is executed for ARM mode (non-apcs frames) and
-	 Thumb-2 mode. Frame layout is essentially the same for those
-	 cases, except that in ARM mode frame pointer points to the
-	 first saved register, while in Thumb-2 mode the frame pointer points
-	 to the last saved register.
-
-	 It is possible to make frame pointer point to last saved
-	 register in both cases, and remove some conditionals below.
-	 That means that fp setup in prologue would be just "mov fp, sp"
-	 and sp restore in epilogue would be just "mov sp, fp", whereas
-	 now we have to use add/sub in those cases. However, the value
-	 of that would be marginal, as both mov and add/sub are 32-bit
-	 in ARM mode, and it would require extra conditionals
-	 in arm_expand_prologue to distingish ARM-apcs-frame case
-	 (where frame pointer is required to point at first register)
-	 and ARM-non-apcs-frame. Therefore, such change is postponed
-	 until real need arise.  */
-      unsigned HOST_WIDE_INT amount;
-      int rfe;
-      /* Restore stack pointer if necessary.  */
-      if (TARGET_ARM && frame_pointer_needed)
-	{
-	  operands[0] = stack_pointer_rtx;
-	  operands[1] = hard_frame_pointer_rtx;
-
-	  operands[2] = GEN_INT (offsets->frame - offsets->saved_regs);
-	  output_add_immediate (operands);
-	}
-      else
-	{
-	  if (frame_pointer_needed)
-	    {
-	      /* For Thumb-2 restore sp from the frame pointer.
-		 Operand restrictions mean we have to incrememnt FP, then copy
-		 to SP.  */
-	      amount = offsets->locals_base - offsets->saved_regs;
-	      operands[0] = hard_frame_pointer_rtx;
-	    }
-	  else
-	    {
-	      unsigned long count;
-	      operands[0] = stack_pointer_rtx;
-	      amount = offsets->outgoing_args - offsets->saved_regs;
-	      /* pop call clobbered registers if it avoids a
-	         separate stack adjustment.  */
-	      count = offsets->saved_regs - offsets->saved_args;
-	      if (optimize_size
-		  && count != 0
-		  && !crtl->calls_eh_return
-		  && bit_count(saved_regs_mask) * 4 == count
-		  && !IS_INTERRUPT (func_type)
-		  && !IS_STACKALIGN (func_type)
-		  && !crtl->tail_call_emit)
-		{
-		  unsigned long mask;
-                  /* Preserve return values, of any size.  */
-		  mask = (1 << ((arm_size_return_regs() + 3) / 4)) - 1;
-		  mask ^= 0xf;
-		  mask &= ~saved_regs_mask;
-		  reg = 0;
-		  while (bit_count (mask) * 4 > amount)
-		    {
-		      while ((mask & (1 << reg)) == 0)
-			reg++;
-		      mask &= ~(1 << reg);
-		    }
-		  if (bit_count (mask) * 4 == amount) {
-		      amount = 0;
-		      saved_regs_mask |= mask;
-		  }
-		}
-	    }
-
-	  if (amount)
-	    {
-	      operands[1] = operands[0];
-	      operands[2] = GEN_INT (amount);
-	      output_add_immediate (operands);
-	    }
-	  if (frame_pointer_needed)
-	    asm_fprintf (f, "\tmov\t%r, %r\n",
-			 SP_REGNUM, HARD_FRAME_POINTER_REGNUM);
-	}
-
-      if (TARGET_FPA_EMU2)
-	{
-	  for (reg = FIRST_FPA_REGNUM; reg <= LAST_FPA_REGNUM; reg++)
-	    if (df_regs_ever_live_p (reg) && !call_used_regs[reg])
-	      asm_fprintf (f, "\tldfe\t%r, [%r], #12\n",
-			   reg, SP_REGNUM);
-	}
-      else
-	{
-	  start_reg = FIRST_FPA_REGNUM;
-
-	  for (reg = FIRST_FPA_REGNUM; reg <= LAST_FPA_REGNUM; reg++)
-	    {
-	      if (df_regs_ever_live_p (reg) && !call_used_regs[reg])
-		{
-		  if (reg - start_reg == 3)
-		    {
-		      asm_fprintf (f, "\tlfmfd\t%r, 4, [%r]!\n",
-				   start_reg, SP_REGNUM);
-		      start_reg = reg + 1;
-		    }
-		}
-	      else
-		{
-		  if (reg != start_reg)
-		    asm_fprintf (f, "\tlfmfd\t%r, %d, [%r]!\n",
-				 start_reg, reg - start_reg,
-				 SP_REGNUM);
-
-		  start_reg = reg + 1;
-		}
-	    }
-
-	  /* Just in case the last register checked also needs unstacking.  */
-	  if (reg != start_reg)
-	    asm_fprintf (f, "\tlfmfd\t%r, %d, [%r]!\n",
-			 start_reg, reg - start_reg, SP_REGNUM);
-	}
-
-      if (TARGET_HARD_FLOAT && TARGET_VFP)
-	{
-	  int end_reg = LAST_VFP_REGNUM + 1;
-
-	  /* Scan the registers in reverse order.  We need to match
-	     any groupings made in the prologue and generate matching
-	     pop operations.  */
-	  for (reg = LAST_VFP_REGNUM - 1; reg >= FIRST_VFP_REGNUM; reg -= 2)
-	    {
-	      if ((!df_regs_ever_live_p (reg) || call_used_regs[reg])
-		  && (!df_regs_ever_live_p (reg + 1)
-		      || call_used_regs[reg + 1]))
-		{
-		  if (end_reg > reg + 2)
-		    vfp_output_fldmd (f, SP_REGNUM,
-				      (reg + 2 - FIRST_VFP_REGNUM) / 2,
-				      (end_reg - (reg + 2)) / 2);
-		  end_reg = reg;
-		}
-	    }
-	  if (end_reg > reg + 2)
-	    vfp_output_fldmd (f, SP_REGNUM, 0,
-			      (end_reg - (reg + 2)) / 2);
-	}
-
-      if (TARGET_IWMMXT)
-	for (reg = FIRST_IWMMXT_REGNUM; reg <= LAST_IWMMXT_REGNUM; reg++)
-	  if (df_regs_ever_live_p (reg) && !call_used_regs[reg])
-	    asm_fprintf (f, "\twldrd\t%r, [%r], #8\n", reg, SP_REGNUM);
-
-      /* If we can, restore the LR into the PC.  */
-      if (ARM_FUNC_TYPE (func_type) != ARM_FT_INTERWORKED
-	  && (TARGET_ARM || ARM_FUNC_TYPE (func_type) == ARM_FT_NORMAL)
-	  && !IS_STACKALIGN (func_type)
-	  && really_return
-	  && crtl->args.pretend_args_size == 0
-	  && saved_regs_mask & (1 << LR_REGNUM)
-	  && !crtl->calls_eh_return)
-	{
-	  saved_regs_mask &= ~ (1 << LR_REGNUM);
-	  saved_regs_mask |=   (1 << PC_REGNUM);
-	  rfe = IS_INTERRUPT (func_type);
-	}
-      else
-	rfe = 0;
-
-      /* Load the registers off the stack.  If we only have one register
-	 to load use the LDR instruction - it is faster.  For Thumb-2
-	 always use pop and the assembler will pick the best instruction.*/
-      if (TARGET_ARM && saved_regs_mask == (1 << LR_REGNUM)
-	  && !IS_INTERRUPT(func_type))
-	{
-	  asm_fprintf (f, "\tldr\t%r, [%r], #4\n", LR_REGNUM, SP_REGNUM);
-	}
-      else if (saved_regs_mask)
-	{
-	  if (saved_regs_mask & (1 << SP_REGNUM))
-	    /* Note - write back to the stack register is not enabled
-	       (i.e. "ldmfd sp!...").  We know that the stack pointer is
-	       in the list of registers and if we add writeback the
-	       instruction becomes UNPREDICTABLE.  */
-	    print_multi_reg (f, "ldmfd\t%r, ", SP_REGNUM, saved_regs_mask,
-			     rfe);
-	  else if (TARGET_ARM)
-	    print_multi_reg (f, "ldmfd\t%r!, ", SP_REGNUM, saved_regs_mask,
-			     rfe);
-	  else
-	    print_multi_reg (f, "pop\t", SP_REGNUM, saved_regs_mask, 0);
-	}
-
-      if (crtl->args.pretend_args_size)
-	{
-	  /* Unwind the pre-pushed regs.  */
-	  operands[0] = operands[1] = stack_pointer_rtx;
-	  operands[2] = GEN_INT (crtl->args.pretend_args_size);
-	  output_add_immediate (operands);
-	}
-    }
-
-  /* We may have already restored PC directly from the stack.  */
-  if (!really_return || saved_regs_mask & (1 << PC_REGNUM))
-    return "";
-
-  /* Stack adjustment for exception handler.  */
-  if (crtl->calls_eh_return)
-    asm_fprintf (f, "\tadd\t%r, %r, %r\n", SP_REGNUM, SP_REGNUM,
-		 ARM_EH_STACKADJ_REGNUM);
-
-  /* Generate the return instruction.  */
-  switch ((int) ARM_FUNC_TYPE (func_type))
-    {
-    case ARM_FT_ISR:
-    case ARM_FT_FIQ:
-      asm_fprintf (f, "\tsubs\t%r, %r, #4\n", PC_REGNUM, LR_REGNUM);
-      break;
-
-    case ARM_FT_EXCEPTION:
-      asm_fprintf (f, "\tmovs\t%r, %r\n", PC_REGNUM, LR_REGNUM);
-      break;
-
-    case ARM_FT_INTERWORKED:
-      asm_fprintf (f, "\tbx\t%r\n", LR_REGNUM);
-      break;
-
-    default:
-      if (IS_STACKALIGN (func_type))
-	{
-	  /* See comment in arm_expand_prologue.  */
-	  asm_fprintf (f, "\tmov\t%r, %r\n", SP_REGNUM, 0);
-	}
-      if (arm_arch5 || arm_arch4t)
-	asm_fprintf (f, "\tbx\t%r\n", LR_REGNUM);
-      else
-	asm_fprintf (f, "\tmov\t%r, %r\n", PC_REGNUM, LR_REGNUM);
-      break;
-    }
-
-  return "";
-}
-
 static void
 arm_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
 			      HOST_WIDE_INT frame_size ATTRIBUTE_UNUSED)
@@ -16235,13 +15570,9 @@  emit_multi_reg_push (unsigned long mask)
 
    Unfortunately, since this insn does not reflect very well the actual
    semantics of the operation, we need to annotate the insn for the benefit
-   of DWARF2 frame unwind information.
-
-   There's no reason why this couldn't be used for Thumb-1 or ARM, in theory,
-   but currently the pattern that matches this in the MD file is only enabled
-   for Thumb-2.  */
+   of DWARF2 frame unwind information.  */
 static void
-thumb2_emit_multi_reg_pop (unsigned long saved_regs_mask, bool really_return)
+arm_emit_multi_reg_pop (unsigned long saved_regs_mask, bool really_return)
 {
   int num_regs = 0;
   int i, j;
@@ -16310,7 +15641,7 @@  thumb2_emit_multi_reg_pop (unsigned long saved_regs_mask, bool really_return)
    semantics of the operation, we need to annotate the insn for the benefit
    of DWARF2 frame unwind information.  */
 static void
-thumb2_emit_vfp_multi_reg_pop (int first_reg, int num_regs)
+arm_emit_vfp_multi_reg_pop (int first_reg, int num_regs, rtx base_reg)
 {
   int i, j;
   rtx par;
@@ -16321,8 +15652,8 @@  thumb2_emit_vfp_multi_reg_pop (int first_reg, int num_regs)
 
   if (num_regs > 16)
     {
-      thumb2_emit_vfp_multi_reg_pop (first_reg, 16);
-      thumb2_emit_vfp_multi_reg_pop (first_reg + 16, num_regs - 16);
+      arm_emit_vfp_multi_reg_pop (first_reg, 16, base_reg);
+      arm_emit_vfp_multi_reg_pop (first_reg + 16, num_regs - 16, base_reg);
       return;
     }
 
@@ -16333,8 +15664,8 @@  thumb2_emit_vfp_multi_reg_pop (int first_reg, int num_regs)
   /* Increment the stack pointer, based on there being
      num_regs 8-byte registers to restore.  */
   tmp = gen_rtx_SET (VOIDmode,
-                     stack_pointer_rtx,
-                     plus_constant (stack_pointer_rtx, 8 * num_regs));
+                     base_reg,
+                     plus_constant (base_reg, 8 * num_regs));
   RTX_FRAME_RELATED_P (tmp) = 1;
   XVECEXP (par, 0, 0) = tmp;
 
@@ -16347,8 +15678,7 @@  thumb2_emit_vfp_multi_reg_pop (int first_reg, int num_regs)
                          reg,
                          gen_frame_mem
                          (DFmode,
-                          plus_constant (stack_pointer_rtx,
-                                         8 * j)));
+                          plus_constant (base_reg, 8 * j)));
       RTX_FRAME_RELATED_P (tmp) = 1;
       XVECEXP (par, 0, j + 1) = tmp;
 
@@ -22514,6 +21844,139 @@  thumb1_expand_epilogue (void)
 /* Generate pattern *pop_multiple_with_stack_update_and_return if single
    POP instruction can be generated.  LR should be replaced by PC.  All
    the checks required are already done by  USE_RETURN_INSN ().  Hence,
+   all we need to do here is generate multi-reg pop and let return pattern
+   handle other instructions to be generated, if any.  */
+void
+arm_expand_return (bool really_return)
+{
+  int i, num_regs, return_reg;
+  unsigned long saved_regs_mask, func_type;
+  arm_stack_offsets *offsets;
+  unsigned HOST_WIDE_INT stack_adjust;
+
+  func_type = arm_current_func_type ();
+
+  if (IS_NAKED (func_type)
+      || (IS_VOLATILE (func_type) && TARGET_ABORT_NORETURN))
+    {
+      emit_jump_insn (ret_rtx);
+      return;
+    }
+
+  gcc_assert (!cfun->calls_alloca || really_return);
+
+  offsets = arm_get_frame_offsets ();
+  saved_regs_mask = offsets->saved_regs_mask;
+  for (i = 0, num_regs = 0; i <= LAST_ARM_REGNUM; i++)
+    if (saved_regs_mask & (1 << i))
+      num_regs++;
+
+  if (saved_regs_mask)
+    {
+      /* If we do not have any special requirements for function exit (e.g.
+         interworking), then we can load the return address directly into
+         the PC.  Otherwise we must load it into LR.  */
+      if (really_return
+          && (IS_INTERRUPT (func_type) || !TARGET_INTERWORK)
+          && (saved_regs_mask & (1 << LR_REGNUM)))
+        return_reg = PC_REGNUM;
+      else
+        return_reg = LR_REGNUM;
+
+      if (saved_regs_mask & (1 << IP_REGNUM))
+        {
+           /* There are three possible reasons for the IP register being saved.
+              1) a stack frame was created, in which case IP contains the old
+                 stack pointer, or
+              2) an ISR routine corrupted it, or
+              3) it was saved to align the stack on iWMMXt.
+              In case 1, restore IP into SP, otherwise just restore IP.  */
+           if (frame_pointer_needed)
+             {
+               saved_regs_mask &= ~ (1 << IP_REGNUM);
+               saved_regs_mask |=   (1 << SP_REGNUM);
+             }
+           else
+             {
+               gcc_assert (IS_INTERRUPT (func_type) || TARGET_REALLY_IWMMXT);
+             }
+        }
+
+      for (i = 0, num_regs = 0; i <= LAST_ARM_REGNUM; i++)
+        /*  On some ARM architectures it is faster to use LDR rather than LDM
+            to load a single register.  So identify single-register-restore
+            case.  */
+        if (saved_regs_mask == (1 << i))
+          break;
+
+      if (i <= LAST_ARM_REGNUM
+          && (i != LR_REGNUM
+              || ! really_return
+              || ! IS_INTERRUPT (func_type)))
+        {
+          rtx addr;
+          rtx reg;
+
+          reg = gen_rtx_REG (SImode, ((i == LR_REGNUM) ? return_reg : i));
+          if (saved_regs_mask & (1 << SP_REGNUM))
+            addr = gen_rtx_MEM (SImode, stack_pointer_rtx);
+          else
+            addr = gen_rtx_MEM (SImode,
+                                gen_rtx_POST_INC (SImode,
+                                                  stack_pointer_rtx));
+
+          set_mem_alias_set (addr, get_frame_alias_set ());
+
+          if (saved_regs_mask & (1 << PC_REGNUM) || return_reg == PC_REGNUM)
+            {
+              rtx insn = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
+              XVECEXP (insn, 0, 0) = ret_rtx;
+              XVECEXP (insn, 0, 1) = gen_rtx_SET (SImode, reg, addr);
+              RTX_FRAME_RELATED_P (XVECEXP (insn, 0, 1)) = 1;
+              emit_jump_insn (insn);
+            }
+          else
+            {
+              rtx insn = gen_rtx_SET (SImode, reg, addr);
+              RTX_FRAME_RELATED_P (insn) = 1;
+              emit_insn (insn);
+            }
+        }
+      else
+        {
+          if (saved_regs_mask & (1 << SP_REGNUM))
+            {
+              stack_adjust = offsets->outgoing_args - offsets->saved_regs;
+              gcc_assert (stack_adjust == 0 || stack_adjust == 4);
+
+              if (stack_adjust && !(arm_arch5 && TARGET_ARM))
+                saved_regs_mask |= 1 << 3;
+            }
+
+          arm_emit_multi_reg_pop (((saved_regs_mask & (~ (1 << LR_REGNUM)))
+                                   | (1 << return_reg)),
+                                  (saved_regs_mask & (1 << PC_REGNUM)
+                                   || return_reg == PC_REGNUM));
+        }
+
+      /* Check if we need to generate extra instruction to perform actual
+         function return.  */
+      if (really_return
+          && func_type != ARM_FT_INTERWORKED
+          && (saved_regs_mask & (1 << LR_REGNUM)) != 0)
+        /* The return has already been handled by loading LR in PC.  */
+        really_return = false;
+    }
+
+    /* output_return_instruction () will take care of actual instruction to be
+       emited to return from function.  */
+    if (really_return)
+      emit_jump_insn (ret_rtx);
+}
+
+/* Generate pattern *pop_multiple_with_stack_update_and_return if single
+   POP instruction can be generated.  LR should be replaced by PC.  All
+   the checks required are already done by  USE_RETURN_INSN ().  Hence,
    all we really need to check here is if single register is to be
    returned, or multiple register return.  */
 void
@@ -22549,7 +22012,7 @@  thumb2_expand_return (void)
         {
           saved_regs_mask &= ~ (1 << LR_REGNUM);
           saved_regs_mask |=   (1 << PC_REGNUM);
-          thumb2_emit_multi_reg_pop (saved_regs_mask, true);
+          arm_emit_multi_reg_pop (saved_regs_mask, true);
         }
     }
   else
@@ -22558,13 +22021,452 @@  thumb2_expand_return (void)
     }
 }
 
+/* Generate RTL to represent ARM epilogue.  Really_return is true if the
+   function is not a sibcall.  */
+void
+arm_expand_epilogue (bool really_return)
+{
+  unsigned long func_type;
+  unsigned long saved_regs_mask;
+  int num_regs = 0;
+  int i;
+  int amount;
+  int floats_from_frame = 0;
+  arm_stack_offsets *offsets;
+
+  func_type = arm_current_func_type ();
+
+  /* Naked functions don't have epilogue.  Hence, generate return pattern, and
+     let output_return_instruction take care of instruction emission if any.  */
+  if (IS_NAKED (func_type)
+      || (IS_VOLATILE (func_type) && TARGET_ABORT_NORETURN))
+    {
+      emit_jump_insn (ret_rtx);
+      return;
+    }
+
+  gcc_assert (!crtl->calls_eh_return || really_return);
+
+  /* Get frame offsets for ARM.  */
+  offsets = arm_get_frame_offsets ();
+  saved_regs_mask = offsets->saved_regs_mask;
+
+  /* Find offset of floating point register from frame pointer.
+     The initialization is done in this way to take care of frame pointer and
+     static-chain register, if stored.  */
+  floats_from_frame = offsets->saved_regs - offsets->frame;
+
+  for (i = 0; i <= LAST_ARM_REGNUM; i++)
+    if (saved_regs_mask & (1 << i))
+      {
+        num_regs++;
+        floats_from_frame += 4;
+      }
+
+  if (TARGET_APCS_FRAME && frame_pointer_needed && TARGET_ARM)
+    {
+      if (TARGET_FPA_EMU2)
+        {
+          for (i = LAST_FPA_REGNUM; i >= FIRST_FPA_REGNUM; i--)
+            if (df_regs_ever_live_p (i) && !call_used_regs[i])
+              {
+                rtx addr;
+                rtx insn;
+                floats_from_frame += 12;
+                addr = gen_rtx_MEM (XFmode,
+                                        gen_rtx_PLUS (SImode,
+                                              hard_frame_pointer_rtx,
+                                              GEN_INT (- floats_from_frame)));
+                set_mem_alias_set (addr, get_frame_alias_set ());
+                insn = emit_insn (gen_rtx_SET (XFmode,
+                                               gen_rtx_REG (XFmode, i),
+                                               addr));
+                REG_NOTES (insn) = alloc_reg_note (REG_CFA_RESTORE,
+                                                   gen_rtx_REG (XFmode, i),
+                                                   NULL_RTX);
+              }
+        }
+      else
+        {
+          int idx = 0;
+          rtx load_seq[4];
+          rtx dwarf = NULL_RTX;
+          rtx par;
+
+          for (i = LAST_FPA_REGNUM; i >= FIRST_FPA_REGNUM; i--)
+            {
+              floats_from_frame += 12;
+
+              if (idx == 4)
+                {
+                  par = emit_insn (gen_rtx_PARALLEL (VOIDmode,
+                                               gen_rtvec_v (idx, load_seq)));
+                  REG_NOTES (par) = dwarf;
+                  dwarf = NULL_RTX;
+                  idx = 0;
+                }
+
+              if (df_regs_ever_live_p (i) && !call_used_regs[i])
+                {
+                  load_seq[idx] = gen_rtx_SET (VOIDmode,
+                                      gen_rtx_REG (XFmode, i),
+                                      gen_frame_mem (XFmode,
+                                          plus_constant (hard_frame_pointer_rtx,
+                                                         - floats_from_frame)));
+                  dwarf = alloc_reg_note (REG_CFA_RESTORE,
+                                          gen_rtx_REG (XFmode, i),
+                                          dwarf);
+                  idx++;
+                }
+              else
+                {
+                  if (idx)
+                    {
+                      /* Create parallel and emit.  */
+                      par = emit_insn (gen_rtx_PARALLEL (VOIDmode,
+                                                 gen_rtvec_v (idx, load_seq)));
+                      REG_NOTES (par) = dwarf;
+                      dwarf = NULL_RTX;
+                      idx = 0;
+                    }
+                }
+            }
+
+          if (idx)
+            {
+              par = emit_insn (gen_rtx_PARALLEL (VOIDmode,
+                                                 gen_rtvec_v (idx, load_seq)));
+              REG_NOTES (par) = dwarf;
+            }
+        }
+
+      if (TARGET_HARD_FLOAT && TARGET_VFP)
+        {
+          int start_reg;
+          /* The offset is from IP_REGNUM.  */
+          emit_insn (gen_addsi3 (gen_rtx_REG (SImode, IP_REGNUM),
+                                 hard_frame_pointer_rtx,
+                                 GEN_INT (- floats_from_frame)));
+
+          /* Generate VFP register multi-pop.  */
+          start_reg = FIRST_VFP_REGNUM;
+
+          for (i = FIRST_VFP_REGNUM; i < LAST_VFP_REGNUM; i += 2)
+            /* Look for a case where a reg does not need restoring.  */
+            if ((!df_regs_ever_live_p (i) || call_used_regs[i])
+                && (!df_regs_ever_live_p (i + 1)
+                    || call_used_regs[i + 1]))
+              {
+                if (start_reg != i)
+                  arm_emit_vfp_multi_reg_pop (start_reg,
+                                              ((i - start_reg)) / 2,
+                                              gen_rtx_REG (SImode,
+                                                           IP_REGNUM));
+                start_reg = i;
+              }
+
+          /* Restore the remaining regs that we have discovered (or possibly
+             even all of them, if the conditional in the for loop never
+             fired).  */
+          if (start_reg != i)
+            arm_emit_vfp_multi_reg_pop (start_reg,
+                                        (i - start_reg) / 2,
+                                        gen_rtx_REG (SImode, IP_REGNUM));
+        }
+
+      if (TARGET_IWMMXT)
+        {
+          /* The frame pointer is guaranteed to be non-double-word aligned, as
+             it is set to double-word-aligned old_stack_pointer - 4.  */
+          rtx insn;
+          int lrm_count = (num_regs % 2) ? (num_regs + 2) : (num_regs + 1);
+          for (i = LAST_IWMMXT_REGNUM; i >= FIRST_IWMMXT_REGNUM; i--)
+            if (df_regs_ever_live_p (i) && !call_used_regs[i])
+              {
+                rtx addr = gen_frame_mem (V2SImode,
+                                     plus_constant (hard_frame_pointer_rtx,
+                                                    - lrm_count * 4));
+                insn = emit_insn (gen_movsi (gen_rtx_REG (V2SImode, i), addr));
+                REG_NOTES (insn) = alloc_reg_note (REG_CFA_RESTORE,
+                                                   gen_rtx_REG (V2SImode, i),
+                                                   NULL_RTX);
+                lrm_count += 2;
+              }
+        }
+
+      /* saved_regs_mask should contain the IP which contains old stack pointer
+         at the time of activation creation.  To unwind stack quickly, pop IP
+         in SP.  */
+      gcc_assert (saved_regs_mask & (1 << IP_REGNUM));
+      saved_regs_mask &= ~ (1 << IP_REGNUM);
+      saved_regs_mask |=   (1 << SP_REGNUM);
+
+      if (really_return
+          && ARM_FUNC_TYPE (func_type) == ARM_FT_NORMAL
+          && !crtl->calls_eh_return)
+        saved_regs_mask &= ~ (1 << LR_REGNUM);
+      else
+        saved_regs_mask &= ~ (1 << PC_REGNUM);
+
+      /* Recount: one of LR/PC was dropped, so SP must sit below fewer words
+         than were pushed.  bit_count is arm.c's existing static helper.  */
+      num_regs = bit_count (saved_regs_mask);
+      if (offsets->outgoing_args != (1 + num_regs) || cfun->calls_alloca)
+        /* Unwind the stack till saved registers.  */
+        emit_insn (gen_addsi3 (stack_pointer_rtx,
+                               hard_frame_pointer_rtx,
+                               GEN_INT (- 4 * num_regs)));
+
+      arm_emit_multi_reg_pop (saved_regs_mask, really_return);
+
+      if (IS_INTERRUPT (func_type))
+        {
+          rtx insn;
+          rtx addr = gen_rtx_MEM (SImode,
+                                  gen_rtx_POST_INC (SImode,
+                                  stack_pointer_rtx));
+          set_mem_alias_set (addr, get_frame_alias_set ());
+          insn = emit_insn (gen_movsi (gen_rtx_REG (SImode, IP_REGNUM), addr));
+          REG_NOTES (insn) = alloc_reg_note (REG_CFA_RESTORE,
+                                             gen_rtx_REG (SImode, IP_REGNUM),
+                                             NULL_RTX);
+        }
+
+      if (!really_return || saved_regs_mask & (1 << PC_REGNUM))
+        return;
+
+      if (crtl->calls_eh_return)
+        emit_insn (gen_addsi3 (stack_pointer_rtx,
+                               stack_pointer_rtx,
+                               gen_rtx_REG (SImode, ARM_EH_STACKADJ_REGNUM)));
+
+      emit_jump_insn (ret_rtx);
+    }
+  else
+    {
+      if (frame_pointer_needed) /* Handles case for TARGET_ARM only.  */
+        {
+          /* In ARM mode, frame pointer points to first saved register.
+             Restore stack pointer to last saved register.  */
+          emit_insn (gen_addsi3 (hard_frame_pointer_rtx,
+                             hard_frame_pointer_rtx,
+                             GEN_INT (offsets->frame - offsets->saved_regs)));
+          emit_insn (gen_movsi (stack_pointer_rtx, hard_frame_pointer_rtx));
+          /* Emit USE(stack_pointer_rtx) to ensure that stack adjustment is not
+             deleted.  */
+          emit_insn (gen_prologue_use (stack_pointer_rtx));
+        }
+      else
+        {
+          /* Pop off outgoing args and local frame to adjust stack pointer to
+             last saved register.  */
+          amount = offsets->outgoing_args - offsets->saved_regs;
+          if (amount)
+            {
+              emit_insn (gen_addsi3 (stack_pointer_rtx,
+                                     stack_pointer_rtx,
+                                     GEN_INT (amount)));
+              /* Emit USE(stack_pointer_rtx) to ensure that stack adjustment is
+                 not deleted.  */
+              emit_insn (gen_prologue_use (stack_pointer_rtx));
+            }
+        }
+
+      if (TARGET_FPA_EMU2)
+        {
+          for (i = FIRST_FPA_REGNUM; i <= LAST_FPA_REGNUM; i++)
+            if (df_regs_ever_live_p (i) && !call_used_regs[i])
+              {
+                /* Generate memory reference with write-back to SP.  */
+                rtx insn;
+                rtx addr = gen_rtx_MEM (XFmode,
+                                        gen_rtx_POST_INC (SImode,
+                                                          stack_pointer_rtx));
+                set_mem_alias_set (addr, get_frame_alias_set ());
+                insn = emit_insn (gen_movxf (gen_rtx_REG (XFmode, i), addr));
+                REG_NOTES (insn) = alloc_reg_note (REG_CFA_RESTORE,
+                                                   gen_rtx_REG (XFmode, i),
+                                                   NULL_RTX);
+              }
+        }
+      else
+        {
+          int idx = 0;
+          rtx load_seq[5];
+          rtx par;
+          rtx dwarf = NULL_RTX;
+
+          for (i = FIRST_FPA_REGNUM; i <= LAST_FPA_REGNUM; i++)
+            {
+              if (idx == 4)
+                {
+                  load_seq[0] = gen_rtx_SET (VOIDmode,
+                                             stack_pointer_rtx,
+                                             plus_constant (stack_pointer_rtx,
+                                                            12 * idx));
+                  par = emit_insn (gen_rtx_PARALLEL (VOIDmode,
+                                               gen_rtvec_v (idx + 1, load_seq)));
+                  REG_NOTES (par) = dwarf;
+                  dwarf = NULL_RTX;
+                  idx = 0;
+                }
+
+              if (df_regs_ever_live_p (i) && !call_used_regs[i])
+                {
+                  load_seq[idx + 1] = gen_rtx_SET (VOIDmode,
+                                          gen_rtx_REG (XFmode, i),
+                                          gen_frame_mem (XFmode,
+                                              plus_constant (stack_pointer_rtx,
+                                                             12 * idx)));
+                  dwarf = alloc_reg_note (REG_CFA_RESTORE,
+                                          gen_rtx_REG (XFmode, i),
+                                          dwarf);
+                  idx++;
+                }
+              else
+                {
+                  if (idx)
+                    {
+                      /* Create parallel and emit.  */
+                      load_seq[0] = gen_rtx_SET (VOIDmode,
+                                        stack_pointer_rtx,
+                                        plus_constant (stack_pointer_rtx,
+                                                       12 * idx));
+                      par = emit_insn (gen_rtx_PARALLEL (VOIDmode,
+                                                   gen_rtvec_v (idx + 1,
+                                                                load_seq)));
+                      REG_NOTES (par) = dwarf;
+                      dwarf = NULL_RTX;
+                      idx = 0;
+                    }
+                }
+            }
+
+          if (idx)
+            {
+              load_seq[0] = gen_rtx_SET (VOIDmode,
+                                         stack_pointer_rtx,
+                                         plus_constant (stack_pointer_rtx,
+                                                        12 * idx));
+              par = emit_insn (gen_rtx_PARALLEL (VOIDmode,
+                                           gen_rtvec_v (idx + 1, load_seq)));
+              REG_NOTES (par) = dwarf;
+            }
+        }
+
+        if (TARGET_HARD_FLOAT && TARGET_VFP)
+          {
+            /* Generate VFP register multi-pop.  */
+            int end_reg = LAST_VFP_REGNUM + 1;
+
+            /* Scan the registers in reverse order.  We need to match
+               any groupings made in the prologue and generate matching
+               fldmdd operations.  The need to match groups is because,
+               unlike pop, fldmdd can only do consecutive regs.  */
+            for (i = LAST_VFP_REGNUM - 1; i >= FIRST_VFP_REGNUM; i -= 2)
+              /* Look for a case where a reg does not need restoring.  */
+              if ((!df_regs_ever_live_p (i) || call_used_regs[i])
+                  && (!df_regs_ever_live_p (i + 1)
+                      || call_used_regs[i + 1]))
+                {
+                  /* Restore the regs discovered so far (from reg+2 to
+                     end_reg).  */
+                  if (end_reg > i + 2)
+                    arm_emit_vfp_multi_reg_pop (i + 2,
+                                                (end_reg - (i + 2)) / 2,
+                                                stack_pointer_rtx);
+                  end_reg = i;
+                }
+
+            /* Restore the remaining regs that we have discovered (or possibly
+               even all of them, if the conditional in the for loop never
+               fired).  */
+            if (end_reg > i + 2)
+              arm_emit_vfp_multi_reg_pop (i + 2,
+                                          (end_reg - (i + 2)) / 2,
+                                          stack_pointer_rtx);
+          }
+
+        if (TARGET_IWMMXT)
+          for (i = FIRST_IWMMXT_REGNUM; i <= LAST_IWMMXT_REGNUM; i++)
+            if (df_regs_ever_live_p (i) && !call_used_regs[i])
+              {
+                rtx insn;
+                rtx addr = gen_rtx_MEM (V2SImode,
+                                    gen_rtx_POST_INC (SImode,
+                                                      stack_pointer_rtx));
+                set_mem_alias_set (addr, get_frame_alias_set ());
+                insn = emit_insn (gen_movsi (gen_rtx_REG (V2SImode, i), addr));
+                REG_NOTES (insn) = alloc_reg_note (REG_CFA_RESTORE,
+                                                   gen_rtx_REG (V2SImode, i),
+                                                   NULL_RTX);
+              }
+
+        if (saved_regs_mask)
+          {
+            rtx insn;
+            bool return_in_pc = false;
+
+            if (num_regs == 1)
+              {
+                for (i = 0; i <= LAST_ARM_REGNUM; i++)
+                  if (saved_regs_mask & (1 << i))
+                    {
+                      rtx addr = gen_rtx_MEM (SImode,
+                                              gen_rtx_POST_INC (SImode,
+                                                            stack_pointer_rtx));
+                      set_mem_alias_set (addr, get_frame_alias_set ());
+                      insn = emit_insn (gen_movsi (gen_rtx_REG (SImode, i), addr));
+                      REG_NOTES (insn) = alloc_reg_note (REG_CFA_RESTORE,
+                                                         gen_rtx_REG (SImode, i),
+                                                         NULL_RTX);
+                    }
+              }
+            else
+              {
+                if (ARM_FUNC_TYPE (func_type) != ARM_FT_INTERWORKED
+                    && TARGET_ARM
+                    && !IS_STACKALIGN (func_type)
+                    && really_return
+                    && crtl->args.pretend_args_size == 0
+                    && saved_regs_mask & (1 << LR_REGNUM)
+                    && !crtl->calls_eh_return)
+                  {
+                    saved_regs_mask &= ~ (1 << LR_REGNUM);
+                    saved_regs_mask |=   (1 << PC_REGNUM);
+                    return_in_pc = true;
+                  }
+
+                arm_emit_multi_reg_pop (saved_regs_mask, return_in_pc);
+                if (return_in_pc == true)
+                  return;
+              }
+          }
+
+        if (crtl->args.pretend_args_size)
+          emit_insn (gen_addsi3 (stack_pointer_rtx,
+                                 stack_pointer_rtx,
+                                 GEN_INT (crtl->args.pretend_args_size)));
+
+        if (!really_return)
+          return;
+
+        if (crtl->calls_eh_return)
+          emit_insn (gen_addsi3 (stack_pointer_rtx,
+                                 stack_pointer_rtx,
+                                 gen_rtx_REG (SImode, ARM_EH_STACKADJ_REGNUM)));
+
+        emit_jump_insn (ret_rtx);
+    }
+}
+
+
 /* Generate RTL to represent a Thumb-2 epilogue.
 
    Note that this RTL does not include the
    Return insn, which is created separately and
    handled in thumb2_output_return.  */
 void
-thumb2_expand_epilogue (void)
+thumb2_expand_epilogue (bool is_sibling)
 {
   HOST_WIDE_INT amount;
   int reg;
@@ -22633,20 +22535,23 @@  thumb2_expand_epilogue (void)
           {
             /* Restore the regs discovered so far (from reg+2 to end_reg).  */
             if (end_reg > reg + 2)
-              thumb2_emit_vfp_multi_reg_pop (reg + 2,
-                                             (end_reg - (reg + 2)) / 2);
+              arm_emit_vfp_multi_reg_pop (reg + 2,
+                                          (end_reg - (reg + 2)) / 2,
+                                          stack_pointer_rtx);
             end_reg = reg;
           }
 
       /* Restore the remaining regs that we have discovered (or possibly
          even all of them, if the conditional in the for loop never fired).  */
       if (end_reg > reg + 2)
-        thumb2_emit_vfp_multi_reg_pop (reg + 2, (end_reg - (reg + 2)) / 2);
+        arm_emit_vfp_multi_reg_pop (reg + 2,
+                                    (end_reg - (reg + 2)) / 2,
+                                    stack_pointer_rtx);
     }
 
   /* iWMMXt is not supported when Thumb-2 in use.  If it were, we would
      want to be restoring the appropriate iWMMXt regs here, in a similar
-     way to arm_output_epilogue.  */
+     way to arm_expand_epilogue.  */
 
   /* If there are registers to restore, make it happen.  */
   if (saved_regs_mask)
@@ -22676,6 +22581,7 @@  thumb2_expand_epilogue (void)
              return-address' instruction.  Instead, pop LR in PC.  */
           if (ARM_FUNC_TYPE (func_type) == ARM_FT_NORMAL
               && !IS_STACKALIGN (func_type)
+              && !is_sibling
               && crtl->args.pretend_args_size == 0
               && saved_regs_mask & (1 << LR_REGNUM)
               && !crtl->calls_eh_return)
@@ -22685,7 +22591,7 @@  thumb2_expand_epilogue (void)
               really_return = true;
             }
 
-          thumb2_emit_multi_reg_pop (saved_regs_mask, really_return);
+          arm_emit_multi_reg_pop (saved_regs_mask, really_return);
           if (really_return == true)
             return;
         }
@@ -22697,6 +22603,9 @@  thumb2_expand_epilogue (void)
                            stack_pointer_rtx,
                            GEN_INT (crtl->args.pretend_args_size)));
 
+  if (is_sibling)
+    return;
+
   /* Stack adjustment for exception handler.  */
   if (crtl->calls_eh_return)
     emit_insn (gen_addsi3 (stack_pointer_rtx,
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 6cfb2da..9ba7823 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -6682,26 +6682,43 @@ 
           (plus:SI (match_dup 1)
                    (match_operand:SI 2 "const_int_operand" "I")))
         ])]
-  "TARGET_THUMB2"
+  "TARGET_32BIT"
   "*
   {
     int i;
     char pattern[100];
     int num_saves = XVECLEN (operands[0], 0);
 
-    if (REGNO (operands[1]) == SP_REGNUM)
+    if (TARGET_THUMB2)
       {
-        strcpy (pattern, \"pop\\t{\");
+        if (REGNO (operands[1]) == SP_REGNUM)
+          {
+            strcpy (pattern, \"pop%?\\t{\");
+          }
+        else
+          {
+            strcpy (pattern, \"ldm%(ia%)\\t\");
+            strcat (pattern, reg_names[REGNO (operands[1])]);
+            strcat (pattern, \"!, {\");
+          }
       }
     else
       {
-        strcpy (pattern, \"ldm%(ia%)\\t\");
+        strcpy (pattern, \"ldm%(fd%)\\t\");
         strcat (pattern, reg_names[REGNO (operands[1])]);
-        strcat (pattern, \"!, {\");
+
+        for (i = 2; i < num_saves; i++)
+          if (REGNO (XEXP (XVECEXP (operands[0], 0, i), 0)) == SP_REGNUM)
+            break;
+
+        if (i != num_saves)
+          strcat (pattern, \", {\");
+        else
+          strcat (pattern, \"!, {\");
       }
 
     strcat (pattern,
-                reg_names[REGNO (XEXP (XVECEXP (operands[0], 0, 1), 0))]);
+            reg_names[REGNO (XEXP (XVECEXP (operands[0], 0, 1), 0))]);
 
     /* Skip over the first element and the one we just generated.  */
     for (i = 2; i < (num_saves); i++)
@@ -6717,7 +6734,8 @@ 
     return \"\";
   }
   "
-  [(set_attr "type" "load4")]
+  [(set_attr "type" "load4")
+   (set_attr "predicable" "yes")]
 )
 
 ;; Pop with return (as used in epilogue RTL)
@@ -6733,14 +6751,31 @@ 
           (plus:SI (match_dup 1)
                    (match_operand:SI 2 "const_int_operand" "I")))
         ])]
-  "TARGET_THUMB2"
+  "TARGET_32BIT"
   "*
   {
     int i;
     char pattern[100];
     int num_saves = XVECLEN (operands[0], 0);
 
-    strcpy (pattern, \"pop\\t{\");
+    if (TARGET_THUMB2)
+      {
+        strcpy (pattern, \"pop%?\\t{\");
+      }
+    else
+      {
+        strcpy (pattern, \"ldm%(fd%)\\t\");
+        strcat (pattern, reg_names[REGNO (operands[1])]);
+        for (i = 3; i < num_saves; i++)
+          if (REGNO (XEXP (XVECEXP (operands[0], 0, i), 0)) == SP_REGNUM)
+            break;
+
+        if (i != num_saves)
+          strcat (pattern, \", {\");
+        else
+          strcat (pattern, \"!, {\");
+      }
+
     strcat (pattern,
                 reg_names[REGNO (XEXP (XVECEXP (operands[0], 0, 2), 0))]);
 
@@ -6753,19 +6788,24 @@ 
       }
 
     strcat (pattern, \"}\");
+
+    if (IS_INTERRUPT (arm_current_func_type ()))
+      strcat (pattern, \"^\");
+
     output_asm_insn (pattern, operands);
 
     return \"\";
   }
   "
-  [(set_attr "type" "load4")]
+  [(set_attr "type" "load4")
+   (set_attr "predicable" "yes")]
 )
 
-(define_insn "*thumb2_ldr_with_return"
+(define_insn "*ldr_with_return"
   [(return)
    (set (reg:SI PC_REGNUM)
         (mem:SI (post_inc:SI (match_operand:SI 0 "s_register_operand" "+k"))))]
-  "TARGET_THUMB2"
+  "TARGET_32BIT"
   "ldr%?\t%|pc, [%0], #4"
   [(set_attr "type" "load1")
    (set_attr "predicable" "yes")]
@@ -6773,12 +6813,12 @@ 
 
 (define_insn "*vfp_pop_multiple_with_stack_update"
   [(match_parallel 0 "load_multiple_operation_fp"
-    [(set (match_operand:SI 1 "s_register_operand" "+k")
+    [(set (match_operand:SI 1 "s_register_operand" "+rk")
           (plus:SI (match_dup 1)
                    (match_operand:SI 2 "const_int_operand" "I")))
      (set (match_operand:DF 3 "arm_hard_register_operand" "")
           (mem:DF (match_dup 1)))])]
-  "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_VFP"
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP"
   "*
   {
     int num_regs = XVECLEN (operands[0], 0);
@@ -6801,7 +6841,9 @@ 
     return \"\";
   }
   "
-  [(set_attr "type" "load4")]
+  [(set_attr "type" "load4")
+   (set_attr "conds" "unconditional")
+   (set_attr "predicable" "no")]
 )
 
 (define_expand "store_multiple"
@@ -8632,13 +8674,18 @@ 
         thumb2_expand_return ();
         DONE;
       }
+    else if (TARGET_ARM)
+      {
+        arm_expand_return (true);
+        DONE;
+      }
   }
   ")
 
 ;; Often the return insn will be the same as loading from memory, so set attr
 (define_insn "*arm_return"
   [(return)]
-  "TARGET_ARM && USE_RETURN_INSN (FALSE)"
+  "TARGET_ARM"
   "*
   {
     if (arm_ccfsm_state == 2)
@@ -8648,53 +8695,11 @@ 
       }
     return output_return_instruction (const_true_rtx, TRUE, FALSE);
   }"
-  [(set_attr "type" "load1")
-   (set_attr "length" "12")
+  [(set_attr "type" "branch")
+   (set_attr "length" "4")
    (set_attr "predicable" "yes")]
 )
 
-(define_insn "*cond_return"
-  [(set (pc)
-        (if_then_else (match_operator 0 "arm_comparison_operator"
-		       [(match_operand 1 "cc_register" "") (const_int 0)])
-                      (return)
-                      (pc)))]
-  "TARGET_ARM && USE_RETURN_INSN (TRUE)"
-  "*
-  {
-    if (arm_ccfsm_state == 2)
-      {
-        arm_ccfsm_state += 2;
-        return \"\";
-      }
-    return output_return_instruction (operands[0], TRUE, FALSE);
-  }"
-  [(set_attr "conds" "use")
-   (set_attr "length" "12")
-   (set_attr "type" "load1")]
-)
-
-(define_insn "*cond_return_inverted"
-  [(set (pc)
-        (if_then_else (match_operator 0 "arm_comparison_operator"
-		       [(match_operand 1 "cc_register" "") (const_int 0)])
-                      (pc)
-		      (return)))]
-  "TARGET_ARM && USE_RETURN_INSN (TRUE)"
-  "*
-  {
-    if (arm_ccfsm_state == 2)
-      {
-        arm_ccfsm_state += 2;
-        return \"\";
-      }
-    return output_return_instruction (operands[0], TRUE, TRUE);
-  }"
-  [(set_attr "conds" "use")
-   (set_attr "length" "12")
-   (set_attr "type" "load1")]
-)
-
 ;; Generate a sequence of instructions to determine if the processor is
 ;; in 26-bit or 32-bit mode, and return the appropriate return address
 ;; mask.
@@ -10674,20 +10679,27 @@ 
   if (crtl->calls_eh_return)
     emit_insn (gen_prologue_use (gen_rtx_REG (Pmode, 2)));
   if (TARGET_THUMB1)
+  {
     thumb1_expand_epilogue ();
+    emit_jump_insn (gen_rtx_UNSPEC_VOLATILE (VOIDmode,
+             gen_rtvec (1, ret_rtx), VUNSPEC_EPILOGUE));
+    DONE;
+  }
   else if (TARGET_THUMB2)
   {
-    thumb2_expand_epilogue ();
+    thumb2_expand_epilogue (false);
     DONE;
   }
   else if (USE_RETURN_INSN (FALSE))
-    {
-      emit_jump_insn (gen_return ());
-      DONE;
-    }
-  emit_jump_insn (gen_rtx_UNSPEC_VOLATILE (VOIDmode,
-	gen_rtvec (1, ret_rtx), VUNSPEC_EPILOGUE));
-  DONE;
+  {
+    arm_expand_return (true);
+    DONE;
+  }
+  else if (TARGET_ARM)
+  {
+    arm_expand_epilogue (true);
+    DONE;
+  }
   "
 )
 
@@ -10703,31 +10715,23 @@ 
 ;; to add an unspec of the link register to ensure that flow
 ;; does not think that it is unused by the sibcall branch that
 ;; will replace the standard function epilogue.
-(define_insn "sibcall_epilogue"
+(define_expand "sibcall_epilogue"
   [(parallel [(unspec:SI [(reg:SI LR_REGNUM)] UNSPEC_PROLOGUE_USE)
               (unspec_volatile [(return)] VUNSPEC_EPILOGUE)])]
   "TARGET_32BIT"
-  "*
-  if (use_return_insn (FALSE, next_nonnote_insn (insn)))
-    return output_return_instruction (const_true_rtx, FALSE, FALSE);
-  return arm_output_epilogue (next_nonnote_insn (insn));
   "
-;; Length is absolute worst case
-  [(set_attr "length" "44")
-   (set_attr "type" "block")
-   ;; We don't clobber the conditions, but the potential length of this
-   ;; operation is sufficient to make conditionalizing the sequence 
-   ;; unlikely to be profitable.
-   (set_attr "conds" "clob")]
+    if (TARGET_ARM)
+      arm_expand_epilogue (false);
+    else if (TARGET_THUMB2)
+      thumb2_expand_epilogue (true);
+    DONE;
+  "
 )
 
 (define_insn "*epilogue_insns"
   [(unspec_volatile [(return)] VUNSPEC_EPILOGUE)]
-  "TARGET_ARM || TARGET_THUMB1"
+  "TARGET_THUMB1"
   "*
-  if (TARGET_32BIT)
-    return arm_output_epilogue (NULL);
-  else /* TARGET_THUMB1 */
     return thumb1_unexpanded_epilogue ();
   "
   ; Length is absolute worst case