diff mbox

ARM patch: Improve Thumb epilogues (PR40657)

Message ID 4C36E5F2.7000307@codesourcery.com
State New
Headers show

Commit Message

Bernd Schmidt July 9, 2010, 9:03 a.m. UTC
On 07/09/2010 01:06 AM, Richard Earnshaw wrote:
> On 08/07/10 23:47, Bernd Schmidt wrote:
>>
>> On 07/09/2010 12:46 AM, Richard Earnshaw wrote:
>>> You also need a test that the optimization isn't used when the result
>>> would be clobbered.  For example, if the function returned long long and
>>> 12 bytes needed to be popped.
>>>
>>> OK with that change.
>>
>> Surely that's covered by all the existing execute test?
>>
>>
>> Bernd
>>
>>
> Such as?

I'd expect something in gcc.c-torture/execute or maybe libstdc++ would
fail if we clobbered return values.

> I'd like to be sure that this is clearly tested.

I've added a new execute test.  Committed.


Bernd
diff mbox

Patch

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 161977)
+++ ChangeLog	(working copy)
@@ -1,3 +1,12 @@ 
+2010-07-09  Bernd Schmidt  <bernds@codesourcery.com>
+
+	PR target/40657
+	* config/arm/arm.c (thumb1_extra_regs_pushed): New arg FOR_PROLOGUE.
+	All callers changed.
+	Handle the case when we're called for the epilogue.
+	(thumb_unexpanded_epilogue): Use it.
+	(thumb1_expand_epilogue): Likewise.
+
 2010-07-08  Andi Kleen <ak@linux.intel.com>
 
 	* lto-section-in.c (lto_section_name): Add missing comma.
Index: testsuite/ChangeLog
===================================================================
--- testsuite/ChangeLog	(revision 161977)
+++ testsuite/ChangeLog	(working copy)
@@ -1,3 +1,10 @@ 
+2010-07-09  Bernd Schmidt  <bernds@codesourcery.com>
+
+	PR target/40657
+	* gcc.target/arm/pr40657-1.c: New test.
+	* gcc.target/arm/pr40657-2.c: New test.
+	* gcc.c-torture/execute/pr40657.c: New test.
+
 2010-07-08  Janus Weil  <janus@gcc.gnu.org>
 
 	PR fortran/44649
Index: config/arm/arm.c
===================================================================
--- config/arm/arm.c	(revision 161831)
+++ config/arm/arm.c	(working copy)
@@ -19564,6 +19564,81 @@  is_called_in_ARM_mode (tree func)
 #endif
 }
 
+/* Given the stack offsets and register mask in OFFSETS, decide how
+   many additional registers to push instead of subtracting a constant
+   from SP.  For epilogues the principle is the same except we use pop.
+   FOR_PROLOGUE indicates which we're generating.  */
+static int
+thumb1_extra_regs_pushed (arm_stack_offsets *offsets, bool for_prologue)
+{
+  HOST_WIDE_INT amount;
+  unsigned long live_regs_mask = offsets->saved_regs_mask;
+  /* Extract a mask of the ones we can give to the Thumb's push/pop
+     instruction.  */
+  unsigned long l_mask = live_regs_mask & (for_prologue ? 0x40ff : 0xff);
+  /* Then count how many other high registers will need to be pushed.  */
+  unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
+  int n_free, reg_base;
+
+  if (!for_prologue && frame_pointer_needed)
+    amount = offsets->locals_base - offsets->saved_regs;
+  else
+    amount = offsets->outgoing_args - offsets->saved_regs;
+
+  /* If the stack frame size is 512 exactly, we can save one load
+     instruction, which should make this a win even when optimizing
+     for speed.  */
+  if (!optimize_size && amount != 512)
+    return 0;
+
+  /* Can't do this if there are high registers to push.  */
+  if (high_regs_pushed != 0)
+    return 0;
+
+  /* Shouldn't do it in the prologue if no registers would normally
+     be pushed at all.  In the epilogue, also allow it if we'll have
+     a pop insn for the PC.  */
+  if  (l_mask == 0
+       && (for_prologue
+	   || TARGET_BACKTRACE
+	   || (live_regs_mask & 1 << LR_REGNUM) == 0
+	   || TARGET_INTERWORK
+	   || crtl->args.pretend_args_size != 0))
+    return 0;
+
+  /* Don't do this if thumb_expand_prologue wants to emit instructions
+     between the push and the stack frame allocation.  */
+  if (for_prologue
+      && ((flag_pic && arm_pic_register != INVALID_REGNUM)
+	  || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0)))
+    return 0;
+
+  reg_base = 0;
+  n_free = 0;
+  if (!for_prologue)
+    {
+      reg_base = arm_size_return_regs () / UNITS_PER_WORD;
+      live_regs_mask >>= reg_base;
+    }
+
+  while (reg_base + n_free < 8 && !(live_regs_mask & 1)
+	 && (for_prologue || call_used_regs[reg_base + n_free]))
+    {
+      live_regs_mask >>= 1;
+      n_free++;
+    }
+
+  if (n_free == 0)
+    return 0;
+  gcc_assert (amount / 4 * 4 == amount);
+
+  if (amount >= 512 && (amount - n_free * 4) < 512)
+    return (amount - 508) / 4;
+  if (amount <= n_free * 4)
+    return amount / 4;
+  return 0;
+}
+
 /* The bits which aren't usefully expanded as rtl.  */
 const char *
 thumb_unexpanded_epilogue (void)
@@ -19572,6 +19647,7 @@  thumb_unexpanded_epilogue (void)
   int regno;
   unsigned long live_regs_mask = 0;
   int high_regs_pushed = 0;
+  int extra_pop;
   int had_to_push_lr;
   int size;
 
@@ -19591,6 +19667,13 @@  thumb_unexpanded_epilogue (void)
      the register is used to hold a return value.  */
   size = arm_size_return_regs ();
 
+  extra_pop = thumb1_extra_regs_pushed (offsets, false);
+  if (extra_pop > 0)
+    {
+      unsigned long extra_mask = (1 << extra_pop) - 1;
+      live_regs_mask |= extra_mask << (size / UNITS_PER_WORD);
+    }
+
   /* The prolog may have pushed some high registers to use as
      work registers.  e.g. the testsuite file:
      gcc/testsuite/gcc/gcc.c-torture/execute/complex-2.c
@@ -19674,7 +19757,9 @@  thumb_unexpanded_epilogue (void)
 		       live_regs_mask);
 
       /* We have either just popped the return address into the
-	 PC or it is was kept in LR for the entire function.  */
+	 PC or it is was kept in LR for the entire function.
+	 Note that thumb_pushpop has already called thumb_exit if the
+	 PC was in the list.  */
       if (!had_to_push_lr)
 	thumb_exit (asm_out_file, LR_REGNUM);
     }
@@ -19820,51 +19905,6 @@  thumb_compute_initial_elimination_offset
     }
 }
 
-/* Given the stack offsets and register mask in OFFSETS, decide
-   how many additional registers to push instead of subtracting
-   a constant from SP.  */
-static int
-thumb1_extra_regs_pushed (arm_stack_offsets *offsets)
-{
-  HOST_WIDE_INT amount = offsets->outgoing_args - offsets->saved_regs;
-  unsigned long live_regs_mask = offsets->saved_regs_mask;
-  /* Extract a mask of the ones we can give to the Thumb's push instruction.  */
-  unsigned long l_mask = live_regs_mask & 0x40ff;
-  /* Then count how many other high registers will need to be pushed.  */
-  unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
-  int n_free;
-
-  /* If the stack frame size is 512 exactly, we can save one load
-     instruction, which should make this a win even when optimizing
-     for speed.  */
-  if (!optimize_size && amount != 512)
-    return 0;
-
-  /* Can't do this if there are high registers to push, or if we
-     are not going to do a push at all.  */
-  if (high_regs_pushed != 0 || l_mask == 0)
-    return 0;
-
-  /* Don't do this if thumb1_expand_prologue wants to emit instructions
-     between the push and the stack frame allocation.  */
-  if ((flag_pic && arm_pic_register != INVALID_REGNUM)
-      || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0))
-    return 0;
-
-  for (n_free = 0; n_free < 8 && !(live_regs_mask & 1); live_regs_mask >>= 1)
-    n_free++;
-
-  if (n_free == 0)
-    return 0;
-  gcc_assert (amount / 4 * 4 == amount);
-
-  if (amount >= 512 && (amount - n_free * 4) < 512)
-    return (amount - 508) / 4;
-  if (amount <= n_free * 4)
-    return amount / 4;
-  return 0;
-}
-
 /* Generate the rest of a function's prologue.  */
 void
 thumb1_expand_prologue (void)
@@ -19901,7 +19941,7 @@  thumb1_expand_prologue (void)
 		    stack_pointer_rtx);
 
   amount = offsets->outgoing_args - offsets->saved_regs;
-  amount -= 4 * thumb1_extra_regs_pushed (offsets);
+  amount -= 4 * thumb1_extra_regs_pushed (offsets, true);
   if (amount)
     {
       if (amount < 512)
@@ -19986,6 +20026,7 @@  thumb1_expand_epilogue (void)
       emit_insn (gen_movsi (stack_pointer_rtx, hard_frame_pointer_rtx));
       amount = offsets->locals_base - offsets->saved_regs;
     }
+  amount -= 4 * thumb1_extra_regs_pushed (offsets, false);
 
   gcc_assert (amount >= 0);
   if (amount)
@@ -20208,7 +20249,7 @@  thumb1_output_function_prologue (FILE *f
 	   || (high_regs_pushed == 0 && l_mask))
     {
       unsigned long mask = l_mask;
-      mask |= (1 << thumb1_extra_regs_pushed (offsets)) - 1;
+      mask |= (1 << thumb1_extra_regs_pushed (offsets, true)) - 1;
       thumb_pushpop (f, mask, 1, &cfa_offset, mask);
     }
 
Index: testsuite/gcc.c-torture/execute/pr40657.c
===================================================================
--- testsuite/gcc.c-torture/execute/pr40657.c	(revision 0)
+++ testsuite/gcc.c-torture/execute/pr40657.c	(revision 0)
@@ -0,0 +1,23 @@ 
+/* Verify that that Thumb-1 epilogue size optimization does not clobber the
+   return value.  */
+
+long long v = 0x123456789abc;
+
+__attribute__((noinline)) void bar (int *x)
+{
+  asm volatile ("" : "=m" (x) ::);
+}
+
+__attribute__((noinline)) long long foo()
+{
+  int x;
+  bar(&x);
+  return v;
+}
+
+int main ()
+{
+  if (foo () != v)
+    abort ();
+  exit (0);
+}
Index: testsuite/gcc.target/arm/pr40657-1.c
===================================================================
--- testsuite/gcc.target/arm/pr40657-1.c	(revision 0)
+++ testsuite/gcc.target/arm/pr40657-1.c	(revision 0)
@@ -0,0 +1,13 @@ 
+/* { dg-options "-Os -march=armv5te -mthumb" }  */
+/* { dg-require-effective-target arm_thumb1_ok } */
+/* { dg-final { scan-assembler "pop.*r1.*pc" } } */
+/* { dg-final { scan-assembler-not "sub\[\\t \]*sp,\[\\t \]*sp" } } */
+/* { dg-final { scan-assembler-not "add\[\\t \]*sp,\[\\t \]*sp" } } */
+
+extern void bar(int*);
+int foo()
+{
+  int x;
+  bar(&x);
+  return x;
+}
Index: testsuite/gcc.target/arm/pr40657-2.c
===================================================================
--- testsuite/gcc.target/arm/pr40657-2.c	(revision 0)
+++ testsuite/gcc.target/arm/pr40657-2.c	(revision 0)
@@ -0,0 +1,20 @@ 
+/* { dg-options "-Os -march=armv4t -mthumb" }  */
+/* { dg-require-effective-target arm_thumb1_ok } */
+/* { dg-final { scan-assembler-not "sub\[\\t \]*sp,\[\\t \]*sp" } } */
+/* { dg-final { scan-assembler-not "add\[\\t \]*sp,\[\\t \]*sp" } } */
+
+/* Here, we test that if there's a pop of r[4567] in the epilogue,
+   add sp,sp,#12 is removed and replaced by three additional pops
+   of lower-numbered regs.  */
+
+extern void bar(int*);
+
+int t1, t2, t3, t4, t5;
+int foo()
+{
+  int i,j,k,x = 0;
+  for (i = 0; i < t1; i++)
+    for (j = 0; j < t2; j++)
+	  bar(&x);
+  return x;
+}