Patchwork ARM patch: Improve Thumb epilogues (PR40657)

login
register
mail settings
Submitter Bernd Schmidt
Date July 8, 2010, 10:20 p.m.
Message ID <4C364F2A.7040104@codesourcery.com>
Download mbox | patch
Permalink /patch/58307/
State New
Headers show

Comments

Bernd Schmidt - July 8, 2010, 10:20 p.m.
When optimizing for size, the Thumb-1 epilogue

        add     sp, sp, #12
        pop     {pc}

can be improved to

        pop     {r1-r3, pc}

I previously added this kind of optimization for prologues, and somehow
overlooked that the PR also mentions epilogues.

Regression tested with

Schedule of variations:
    qemu-system-armv7-2/arch=armv5te/thumb
    qemu-system-armv7-2/thumb

(as v4 and v5 apparently have slightly different epilogues)

Ok?


Bernd
* config/arm/arm.c (thumb1_extra_regs_pushed): New arg FOR_PROLOGUE.
	All callers changed.
	Handle the case when we're called for the epilogue.
	(thumb_unexpanded_epilogue): Use it.
	(thumb1_expand_epilogue): Likewise.

testsuite/
	* gcc.target/arm/pr40657-1.c: New test.
	* gcc.target/arm/pr40657-2.c: New test.
Richard Earnshaw - July 8, 2010, 10:46 p.m.
On 08/07/10 23:20, Bernd Schmidt wrote:
> When optimizing for size, the Thumb-1 epilogue
>
>          add     sp, sp, #12
>          pop     {pc}
>
> can be improved to
>
>          pop     {r1-r3, pc}
>
> I previously added this kind of optimization for prologues, and somehow
> overlooked that the PR also mentions epilogues.
>
> Regression tested with
>
> Schedule of variations:
>      qemu-system-armv7-2/arch=armv5te/thumb
>      qemu-system-armv7-2/thumb
>
> (as v4 and v5 apparently have slightly different epilogues)
>
> Ok?
>
>
> Bernd


You also need a test that the optimization isn't used when the result 
would be clobbered.  For example, if the function returned long long and 
12 bytes needed to be popped.

OK with that change.

R.
Bernd Schmidt - July 8, 2010, 10:47 p.m.
On 07/09/2010 12:46 AM, Richard Earnshaw wrote:
> You also need a test that the optimization isn't used when the result
> would be clobbered.  For example, if the function returned long long and
> 12 bytes needed to be popped.
> 
> OK with that change.

Surely that's covered by all the existing execute test?


Bernd
Richard Earnshaw - July 8, 2010, 11:06 p.m.
On 08/07/10 23:47, Bernd Schmidt wrote:
>
> On 07/09/2010 12:46 AM, Richard Earnshaw wrote:
>> You also need a test that the optimization isn't used when the result
>> would be clobbered.  For example, if the function returned long long and
>> 12 bytes needed to be popped.
>>
>> OK with that change.
>
> Surely that's covered by all the existing execute test?
>
>
> Bernd
>
>
Such as?

I'd like to be sure that this is clearly tested.

R.

Patch

Index: testsuite/gcc.target/arm/pr40657-1.c
===================================================================
--- testsuite/gcc.target/arm/pr40657-1.c	(revision 0)
+++ testsuite/gcc.target/arm/pr40657-1.c	(revision 0)
@@ -0,0 +1,13 @@ 
+/* { dg-options "-Os -march=armv5te -mthumb" }  */
+/* { dg-require-effective-target arm_thumb1_ok } */
+/* { dg-final { scan-assembler "pop.*r1.*pc" } } */
+/* { dg-final { scan-assembler-not "sub\[\\t \]*sp,\[\\t \]*sp" } } */
+/* { dg-final { scan-assembler-not "add\[\\t \]*sp,\[\\t \]*sp" } } */
+
+extern void bar(int*);
+int foo()
+{
+  int x;
+  bar(&x);
+  return x;
+}
Index: testsuite/gcc.target/arm/pr40657-2.c
===================================================================
--- testsuite/gcc.target/arm/pr40657-2.c	(revision 0)
+++ testsuite/gcc.target/arm/pr40657-2.c	(revision 0)
@@ -0,0 +1,20 @@ 
+/* { dg-options "-Os -march=armv4t -mthumb" }  */
+/* { dg-require-effective-target arm_thumb1_ok } */
+/* { dg-final { scan-assembler-not "sub\[\\t \]*sp,\[\\t \]*sp" } } */
+/* { dg-final { scan-assembler-not "add\[\\t \]*sp,\[\\t \]*sp" } } */
+
+/* Here, we test that if there's a pop of r[4567] in the epilogue,
+   add sp,sp,#12 is removed and replaced by three additional pops
+   of lower-numbered regs.  */
+
+extern void bar(int*);
+
+int t1, t2, t3, t4, t5;
+int foo()
+{
+  int i,j,k,x = 0;
+  for (i = 0; i < t1; i++)
+    for (j = 0; j < t2; j++)
+	  bar(&x);
+  return x;
+}
Index: config/arm/arm.c
===================================================================
--- config/arm/arm.c	(revision 161831)
+++ config/arm/arm.c	(working copy)
@@ -19564,6 +19564,81 @@  is_called_in_ARM_mode (tree func)
 #endif
 }
 
+/* Given the stack offsets and register mask in OFFSETS, decide how
+   many additional registers to push instead of subtracting a constant
+   from SP.  For epilogues the principle is the same except we use pop.
+   FOR_PROLOGUE indicates which we're generating.  */
+static int
+thumb1_extra_regs_pushed (arm_stack_offsets *offsets, bool for_prologue)
+{
+  HOST_WIDE_INT amount;
+  unsigned long live_regs_mask = offsets->saved_regs_mask;
+  /* Extract a mask of the ones we can give to the Thumb's push/pop
+     instruction.  */
+  unsigned long l_mask = live_regs_mask & (for_prologue ? 0x40ff : 0xff);
+  /* Then count how many other high registers will need to be pushed.  */
+  unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
+  int n_free, reg_base;
+
+  if (!for_prologue && frame_pointer_needed)
+    amount = offsets->locals_base - offsets->saved_regs;
+  else
+    amount = offsets->outgoing_args - offsets->saved_regs;
+
+  /* If the stack frame size is 512 exactly, we can save one load
+     instruction, which should make this a win even when optimizing
+     for speed.  */
+  if (!optimize_size && amount != 512)
+    return 0;
+
+  /* Can't do this if there are high registers to push.  */
+  if (high_regs_pushed != 0)
+    return 0;
+
+  /* Shouldn't do it in the prologue if no registers would normally
+     be pushed at all.  In the epilogue, also allow it if we'll have
+     a pop insn for the PC.  */
+  if  (l_mask == 0
+       && (for_prologue
+	   || TARGET_BACKTRACE
+	   || (live_regs_mask & 1 << LR_REGNUM) == 0
+	   || TARGET_INTERWORK
+	   || crtl->args.pretend_args_size != 0))
+    return 0;
+
+  /* Don't do this if thumb_expand_prologue wants to emit instructions
+     between the push and the stack frame allocation.  */
+  if (for_prologue
+      && ((flag_pic && arm_pic_register != INVALID_REGNUM)
+	  || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0)))
+    return 0;
+
+  reg_base = 0;
+  n_free = 0;
+  if (!for_prologue)
+    {
+      reg_base = arm_size_return_regs () / UNITS_PER_WORD;
+      live_regs_mask >>= reg_base;
+    }
+
+  while (reg_base + n_free < 8 && !(live_regs_mask & 1)
+	 && (for_prologue || call_used_regs[reg_base + n_free]))
+    {
+      live_regs_mask >>= 1;
+      n_free++;
+    }
+
+  if (n_free == 0)
+    return 0;
+  gcc_assert (amount / 4 * 4 == amount);
+
+  if (amount >= 512 && (amount - n_free * 4) < 512)
+    return (amount - 508) / 4;
+  if (amount <= n_free * 4)
+    return amount / 4;
+  return 0;
+}
+
 /* The bits which aren't usefully expanded as rtl.  */
 const char *
 thumb_unexpanded_epilogue (void)
@@ -19572,6 +19647,7 @@  thumb_unexpanded_epilogue (void)
   int regno;
   unsigned long live_regs_mask = 0;
   int high_regs_pushed = 0;
+  int extra_pop;
   int had_to_push_lr;
   int size;
 
@@ -19591,6 +19667,13 @@  thumb_unexpanded_epilogue (void)
      the register is used to hold a return value.  */
   size = arm_size_return_regs ();
 
+  extra_pop = thumb1_extra_regs_pushed (offsets, false);
+  if (extra_pop > 0)
+    {
+      unsigned long extra_mask = (1 << extra_pop) - 1;
+      live_regs_mask |= extra_mask << (size / UNITS_PER_WORD);
+    }
+
   /* The prolog may have pushed some high registers to use as
      work registers.  e.g. the testsuite file:
      gcc/testsuite/gcc/gcc.c-torture/execute/complex-2.c
@@ -19674,7 +19757,9 @@  thumb_unexpanded_epilogue (void)
 		       live_regs_mask);
 
       /* We have either just popped the return address into the
-	 PC or it is was kept in LR for the entire function.  */
+	 PC or it is was kept in LR for the entire function.
+	 Note that thumb_pushpop has already called thumb_exit if the
+	 PC was in the list.  */
       if (!had_to_push_lr)
 	thumb_exit (asm_out_file, LR_REGNUM);
     }
@@ -19820,51 +19905,6 @@  thumb_compute_initial_elimination_offset
     }
 }
 
-/* Given the stack offsets and register mask in OFFSETS, decide
-   how many additional registers to push instead of subtracting
-   a constant from SP.  */
-static int
-thumb1_extra_regs_pushed (arm_stack_offsets *offsets)
-{
-  HOST_WIDE_INT amount = offsets->outgoing_args - offsets->saved_regs;
-  unsigned long live_regs_mask = offsets->saved_regs_mask;
-  /* Extract a mask of the ones we can give to the Thumb's push instruction.  */
-  unsigned long l_mask = live_regs_mask & 0x40ff;
-  /* Then count how many other high registers will need to be pushed.  */
-  unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
-  int n_free;
-
-  /* If the stack frame size is 512 exactly, we can save one load
-     instruction, which should make this a win even when optimizing
-     for speed.  */
-  if (!optimize_size && amount != 512)
-    return 0;
-
-  /* Can't do this if there are high registers to push, or if we
-     are not going to do a push at all.  */
-  if (high_regs_pushed != 0 || l_mask == 0)
-    return 0;
-
-  /* Don't do this if thumb1_expand_prologue wants to emit instructions
-     between the push and the stack frame allocation.  */
-  if ((flag_pic && arm_pic_register != INVALID_REGNUM)
-      || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0))
-    return 0;
-
-  for (n_free = 0; n_free < 8 && !(live_regs_mask & 1); live_regs_mask >>= 1)
-    n_free++;
-
-  if (n_free == 0)
-    return 0;
-  gcc_assert (amount / 4 * 4 == amount);
-
-  if (amount >= 512 && (amount - n_free * 4) < 512)
-    return (amount - 508) / 4;
-  if (amount <= n_free * 4)
-    return amount / 4;
-  return 0;
-}
-
 /* Generate the rest of a function's prologue.  */
 void
 thumb1_expand_prologue (void)
@@ -19901,7 +19941,7 @@  thumb1_expand_prologue (void)
 		    stack_pointer_rtx);
 
   amount = offsets->outgoing_args - offsets->saved_regs;
-  amount -= 4 * thumb1_extra_regs_pushed (offsets);
+  amount -= 4 * thumb1_extra_regs_pushed (offsets, true);
   if (amount)
     {
       if (amount < 512)
@@ -19986,6 +20026,7 @@  thumb1_expand_epilogue (void)
       emit_insn (gen_movsi (stack_pointer_rtx, hard_frame_pointer_rtx));
       amount = offsets->locals_base - offsets->saved_regs;
     }
+  amount -= 4 * thumb1_extra_regs_pushed (offsets, false);
 
   gcc_assert (amount >= 0);
   if (amount)
@@ -20208,7 +20249,7 @@  thumb1_output_function_prologue (FILE *f
 	   || (high_regs_pushed == 0 && l_mask))
     {
       unsigned long mask = l_mask;
-      mask |= (1 << thumb1_extra_regs_pushed (offsets)) - 1;
+      mask |= (1 << thumb1_extra_regs_pushed (offsets, true)) - 1;
       thumb_pushpop (f, mask, 1, &cfa_offset, mask);
     }