diff mbox

[AArch64/GCC,17/N] Optimize prologue when there is no frame pointer

Message ID 53D0FF7E.1070309@arm.com
State New
Headers show

Commit Message

Jiong Wang July 24, 2014, 12:43 p.m. UTC
Under new pro/epi code, we could also utilize our store write-back to optimize
stack adjustment when there is no frame pointer.

* if there is a candidate reg pair and the adjustment amount is less than 512, then we
could use aarch64's paired store write-back.
* if there is only a single candidate reg and the adjustment amount is less than 256,
we could use aarch64's single store write-back.
* otherwise, use an explicit subtraction to finish the stack adjustment.
     
   Improved testcases:
     
     gcc.target/aarch64/test_frame_1.c
     gcc.target/aarch64/test_frame_10.c
     gcc.target/aarch64/test_frame_2.c
     gcc.target/aarch64/test_frame_4.c
     gcc.target/aarch64/test_frame_6.c
     gcc.target/aarch64/test_frame_7.c
     gcc.target/aarch64/test_frame_8.c
     gcc.target/aarch64/test_fp_attribute_1.c

ok for install?

gcc/
   * config/aarch64/aarch64.c (aarch64_pushwb_single_reg): New function.
   (aarch64_expand_prologue): Optimize prologue when !frame_pointer_needed.

gcc/testsuite/
   * gcc.target/aarch64/test_frame_1.c: Match optimized instruction sequences.
   * gcc.target/aarch64/test_frame_10.c: Likewise.
   * gcc.target/aarch64/test_frame_2.c: Likewise.
   * gcc.target/aarch64/test_frame_4.c: Likewise.
   * gcc.target/aarch64/test_frame_6.c: Likewise.
   * gcc.target/aarch64/test_frame_7.c: Likewise.
   * gcc.target/aarch64/test_frame_8.c: Likewise.
   * gcc.target/aarch64/test_fp_attribute_1.c: Likewise.

Comments

Marcus Shawcroft July 24, 2014, 2:45 p.m. UTC | #1
On 24 July 2014 13:43, Jiong Wang <jiong.wang@arm.com> wrote:
> Under new pro/epi code, we could also utilize our store write-back to
> optimize
> stack adjustment when there is no frame pointer.
>
> * if there is candidate reg pair and adjustment amount is less than 512 then
> we
> could use aarch64's paired store write-back.
> * if there is only a single candidate reg and adjustment amount is less than
> 256,
> we could use aarch64's single store write-back.
> * otherwise use explictly subtraction to finish stack adjustment.
>       Improved testcases:
>         gcc.target/aarch64/test_frame_1.c
>     gcc.target/aarch64/test_frame_10.c
>     gcc.target/aarch64/test_frame_2.c
>     gcc.target/aarch64/test_frame_4.c
>     gcc.target/aarch64/test_frame_6.c
>     gcc.target/aarch64/test_frame_7.c
>     gcc.target/aarch64/test_frame_8.c
>     gcc.target/aarch64/test_fp_attribute_1.c

OK and committed.
/Marcus
diff mbox

Patch

From e3ab087747c2f4ddeef0482983b2ebc3bbdc131f Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@arm.com>
Date: Tue, 17 Jun 2014 22:24:44 +0100
Subject: [PATCH 17/19] [AArch64/GCC][18/20] Optimize prologue when there is
 no frame pointer

Under new pro/epi code, we could also utilize our store write-back to optimize
stack adjustment when there is no frame pointer.

* if there is a candidate reg pair and the adjustment amount is less than 512, then
  we could use aarch64's paired store write-back.
* if there is only a single candidate reg and the adjustment amount is less than
  256, we could use aarch64's single store write-back.
* otherwise, use an explicit subtraction to finish the stack adjustment.

  Improved testcases:

  gcc.target/aarch64/test_frame_1.c
  gcc.target/aarch64/test_frame_10.c
  gcc.target/aarch64/test_frame_2.c
  gcc.target/aarch64/test_frame_4.c
  gcc.target/aarch64/test_frame_6.c
  gcc.target/aarch64/test_frame_7.c
  gcc.target/aarch64/test_frame_8.c
  gcc.target/aarch64/test_fp_attribute_1.c

2014-06-16  Jiong Wang <jiong.wang@arm.com>
	    Marcus Shawcroft  <marcus.shawcroft@arm.com>

gcc/
  * config/aarch64/aarch64.c (aarch64_pushwb_single_reg): New function.
  (aarch64_expand_prologue): Optimize prologue when !frame_pointer_needed.

gcc/testsuite/
  * gcc.target/aarch64/test_frame_1.c: Match optimized instruction sequences.
  * gcc.target/aarch64/test_frame_10.c: Likewise.
  * gcc.target/aarch64/test_frame_2.c: Likewise.
  * gcc.target/aarch64/test_frame_4.c: Likewise.
  * gcc.target/aarch64/test_frame_6.c: Likewise.
  * gcc.target/aarch64/test_frame_7.c: Likewise.
  * gcc.target/aarch64/test_frame_8.c: Likewise.
  * gcc.target/aarch64/test_fp_attribute_1.c: Likewise.
---
 gcc/config/aarch64/aarch64.c                       |   58 +++++++++++++++-----
 .../gcc.target/aarch64/test_fp_attribute_1.c       |    2 +-
 gcc/testsuite/gcc.target/aarch64/test_frame_1.c    |    5 +-
 gcc/testsuite/gcc.target/aarch64/test_frame_10.c   |    5 +-
 gcc/testsuite/gcc.target/aarch64/test_frame_2.c    |    6 +-
 gcc/testsuite/gcc.target/aarch64/test_frame_4.c    |    5 +-
 gcc/testsuite/gcc.target/aarch64/test_frame_6.c    |    5 +-
 gcc/testsuite/gcc.target/aarch64/test_frame_7.c    |    5 +-
 gcc/testsuite/gcc.target/aarch64/test_frame_8.c    |    5 +-
 9 files changed, 74 insertions(+), 22 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 26d5fba..365fdd4 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1927,6 +1927,22 @@  aarch64_next_callee_save (unsigned regno, unsigned limit)
   return regno;
 }
 
+static void
+aarch64_pushwb_single_reg (enum machine_mode mode, unsigned regno,
+			   HOST_WIDE_INT adjustment)
+ {
+  rtx base_rtx = stack_pointer_rtx;
+  rtx insn, reg, mem;
+
+  reg = gen_rtx_REG (mode, regno);
+  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
+			    plus_constant (Pmode, base_rtx, -adjustment));
+  mem = gen_rtx_MEM (mode, mem);
+
+  insn = emit_move_insn (mem, reg);
+  RTX_FRAME_RELATED_P (insn) = 1;
+}
+
 static rtx
 aarch64_gen_storewb_pair (enum machine_mode mode, rtx base, rtx reg, rtx reg2,
 			  HOST_WIDE_INT adjustment)
@@ -2276,11 +2292,10 @@  aarch64_expand_prologue (void)
     {
       bool skip_wb = false;
 
-      /* Save the frame pointer and lr if the frame pointer is needed
-	 first.  Make the frame pointer point to the location of the
-	 old frame pointer on the stack.  */
       if (frame_pointer_needed)
 	{
+	  skip_wb = true;
+
 	  if (fp_offset)
 	    {
 	      insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
@@ -2288,12 +2303,11 @@  aarch64_expand_prologue (void)
 	      RTX_FRAME_RELATED_P (insn) = 1;
 	      aarch64_set_frame_expr (gen_rtx_SET
 				      (Pmode, stack_pointer_rtx,
-				       gen_rtx_MINUS (Pmode,
-						      stack_pointer_rtx,
+				       gen_rtx_MINUS (Pmode, stack_pointer_rtx,
 						      GEN_INT (offset))));
 
 	      aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
-					 R30_REGNUM, skip_wb);
+					 R30_REGNUM, false);
 	    }
 	  else
 	    aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
@@ -2311,20 +2325,36 @@  aarch64_expand_prologue (void)
 	  RTX_FRAME_RELATED_P (insn) = 1;
 	  insn = emit_insn (gen_stack_tie (stack_pointer_rtx,
 					   hard_frame_pointer_rtx));
-
-	  aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R28_REGNUM,
-				     skip_wb);
 	}
       else
 	{
-	  insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
-					   GEN_INT (-offset)));
-	  RTX_FRAME_RELATED_P (insn) = 1;
+	  unsigned reg1 = cfun->machine->frame.wb_candidate1;
+	  unsigned reg2 = cfun->machine->frame.wb_candidate2;
 
-	  aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
-				     skip_wb);
+	  if (fp_offset
+	      || reg1 == FIRST_PSEUDO_REGISTER
+	      || (reg2 == FIRST_PSEUDO_REGISTER
+		  && offset >= 256))
+	    {
+	      insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
+					       GEN_INT (-offset)));
+	      RTX_FRAME_RELATED_P (insn) = 1;
+	    }
+	  else
+	    {
+	      enum machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
+
+	      skip_wb = true;
+
+	      if (reg2 == FIRST_PSEUDO_REGISTER)
+		aarch64_pushwb_single_reg (mode1, reg1, offset);
+	      else
+		aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
+	    }
 	}
 
+      aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
+				 skip_wb);
       aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
 				 skip_wb);
     }
diff --git a/gcc/testsuite/gcc.target/aarch64/test_fp_attribute_1.c b/gcc/testsuite/gcc.target/aarch64/test_fp_attribute_1.c
index 7538250..960174a 100644
--- a/gcc/testsuite/gcc.target/aarch64/test_fp_attribute_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/test_fp_attribute_1.c
@@ -21,6 +21,6 @@  non_leaf_2 (void)
   leaf ();
 }
 
-/* { dg-final { scan-assembler-times "str\tx30, \\\[sp\\\]" 2 } } */
+/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 2 } } */
 
 /* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_1.c b/gcc/testsuite/gcc.target/aarch64/test_frame_1.c
index feea7a2..e9d04aa 100644
--- a/gcc/testsuite/gcc.target/aarch64/test_frame_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/test_frame_1.c
@@ -6,9 +6,12 @@ 
      * optimized code should use "str !" for stack adjustment.  */
 
 /* { dg-do run } */
-/* { dg-options "-O2 -fomit-frame-pointer" } */
+/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
 
 #include "test_frame_common.h"
 
 t_frame_pattern (test1, 200, )
 t_frame_run (test1)
+
+/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 2 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_10.c b/gcc/testsuite/gcc.target/aarch64/test_frame_10.c
index 2892c5f..b646a71 100644
--- a/gcc/testsuite/gcc.target/aarch64/test_frame_10.c
+++ b/gcc/testsuite/gcc.target/aarch64/test_frame_10.c
@@ -8,9 +8,12 @@ 
        the first subtractions could be optimized into "stp !".  */
 
 /* { dg-do run } */
-/* { dg-options "-O2 -fomit-frame-pointer" } */
+/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
 
 #include "test_frame_common.h"
 
 t_frame_pattern_outgoing (test10, 480, "x19", 24, a[8], a[9], a[10])
 t_frame_run (test10)
+
+/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!" 1 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_2.c b/gcc/testsuite/gcc.target/aarch64/test_frame_2.c
index aa15dae..b972664 100644
--- a/gcc/testsuite/gcc.target/aarch64/test_frame_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/test_frame_2.c
@@ -6,9 +6,13 @@ 
      * optimized code should use "stp !" for stack adjustment.  */
 
 /* { dg-do run } */
-/* { dg-options "-O2 -fomit-frame-pointer" } */
+/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
 
 #include "test_frame_common.h"
 
 t_frame_pattern (test2, 200, "x19")
 t_frame_run (test2)
+
+
+/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!" 1 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_4.c b/gcc/testsuite/gcc.target/aarch64/test_frame_4.c
index c45e740..5a9a919 100644
--- a/gcc/testsuite/gcc.target/aarch64/test_frame_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/test_frame_4.c
@@ -6,9 +6,12 @@ 
      * we can use "stp !" to optimize stack adjustment.  */
 
 /* { dg-do run } */
-/* { dg-options "-O2 -fomit-frame-pointer" } */
+/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
 
 #include "test_frame_common.h"
 
 t_frame_pattern (test4, 400, "x19")
 t_frame_run (test4)
+
+/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!" 1 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_6.c b/gcc/testsuite/gcc.target/aarch64/test_frame_6.c
index 54f646b..6056f57 100644
--- a/gcc/testsuite/gcc.target/aarch64/test_frame_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/test_frame_6.c
@@ -7,9 +7,12 @@ 
        the second subtraction should use "str !".  */
 
 /* { dg-do run } */
-/* { dg-options "-O2 -fomit-frame-pointer" } */
+/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
 
 #include "test_frame_common.h"
 
 t_frame_pattern (test6, 700, )
 t_frame_run (test6)
+
+/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 2 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_7.c b/gcc/testsuite/gcc.target/aarch64/test_frame_7.c
index aa97bc0..991860c 100644
--- a/gcc/testsuite/gcc.target/aarch64/test_frame_7.c
+++ b/gcc/testsuite/gcc.target/aarch64/test_frame_7.c
@@ -7,9 +7,12 @@ 
        the second subtraction should use "stp !".  */
 
 /* { dg-do run } */
-/* { dg-options "-O2 -fomit-frame-pointer" } */
+/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
 
 #include "test_frame_common.h"
 
 t_frame_pattern (test7, 700, "x19")
 t_frame_run (test7)
+
+/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!" 1 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_8.c b/gcc/testsuite/gcc.target/aarch64/test_frame_8.c
index f75f080..4a4d93b 100644
--- a/gcc/testsuite/gcc.target/aarch64/test_frame_8.c
+++ b/gcc/testsuite/gcc.target/aarch64/test_frame_8.c
@@ -5,9 +5,12 @@ 
      * number of callee-saved reg == 1.  */
 
 /* { dg-do run } */
-/* { dg-options "-O2 -fomit-frame-pointer" } */
+/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
 
 #include "test_frame_common.h"
 
 t_frame_pattern_outgoing (test8, 700, , 8, a[8])
 t_frame_run (test8)
+
+/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 3 } } */
+/* { dg-final { cleanup-saved-temps } } */
-- 
1.7.9.5