diff mbox

[Updated] i386: Avoid stack realignment if possible

Message ID 20170813220203.GA32085@gmail.com
State New
Headers show

Commit Message

H.J. Lu Aug. 13, 2017, 10:02 p.m. UTC
On Mon, Aug 07, 2017 at 08:58:49AM -0700, H.J. Lu wrote:
> On Tue, Jul 25, 2017 at 7:54 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> > On Tue, Jul 25, 2017 at 3:52 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> >> On Fri, Jul 14, 2017 at 4:46 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> >>> On Fri, Jul 7, 2017 at 5:56 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> >>>> On Fri, Jul 07, 2017 at 09:58:42AM -0700, H.J. Lu wrote:
> >>>>> On Fri, Dec 20, 2013 at 8:06 AM, Jakub Jelinek <jakub@redhat.com> wrote:
> >>>>> > Hi!
> >>>>> >
> >>>>> > Honza recently changed the i?86 backend, so that it often doesn't
> >>>>> > do -maccumulate-outgoing-args by default on x86_64.
> >>>>> > Unfortunately, on some of the here included testcases this regressed
> >>>>> > quite a bit the generated code.  As AVX vectors are used, the dynamic
> >>>>> > realignment code needs to assume e.g. that some of them will need to be
> >>>>> > spilled, and for -mno-accumulate-outgoing-args the code needs to set
> >>>>> > need_drap early as well.  But in when emitting the prologue/epilogue,
> >>>>> > if need_drap is set, we don't perform the optimization for leaf functions
> >>>>> > which have zero size stack frame, thus we end up with uselessly doing
> >>>>> > dynamic stack realignment, setting up DRAP that nothing uses and later on
> >>>>> > restore everything back.
> >>>>> >
> >>>>> > This patch improves it, if the DRAP register isn't live at the start of
> >>>>> > entry bb successor and we aren't going to realign the stack, we don't
> >>>>> > need DRAP at all, and even if we need DRAP register, that can't be the sole
> >>>>> > reason for doing stack realignment, the prologue code is able to set up DRAP
> >>>>> > even without dynamic stack realignment.
> >>>>> >
> >>>>> > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> >>>>> >
> >>>>> > 2013-12-20  Jakub Jelinek  <jakub@redhat.com>
> >>>>> >
> >>>>> >         PR target/59501
> >>>>> >         * config/i386/i386.c (ix86_save_reg): Don't return true for drap_reg
> >>>>> >         if !crtl->stack_realign_needed.
> >>>>> >         (ix86_finalize_stack_realign_flags): If drap_reg isn't live on entry
> >>>>> >         and stack_realign_needed will be false, clear drap_reg and need_drap.
> >>>>> >         Optimize leaf functions that don't need stack frame even if
> >>>>> >         crtl->need_drap.
> >>>>> >
> >>>>> >         * gcc.target/i386/pr59501-1.c: New test.
> >>>>> >         * gcc.target/i386/pr59501-1a.c: New test.
> >>>>> >         * gcc.target/i386/pr59501-2.c: New test.
> >>>>> >         * gcc.target/i386/pr59501-2a.c: New test.
> >>>>> >         * gcc.target/i386/pr59501-3.c: New test.
> >>>>> >         * gcc.target/i386/pr59501-3a.c: New test.
> >>>>> >         * gcc.target/i386/pr59501-4.c: New test.
> >>>>> >         * gcc.target/i386/pr59501-4a.c: New test.
> >>>>> >         * gcc.target/i386/pr59501-5.c: New test.
> >>>>> >         * gcc.target/i386/pr59501-6.c: New test.
> >
> > LGTM, assuming Jakub is OK with the patch.
> >
> > Thanks,
> > Uros.
> 
> Jakub, can you take a look at this:
> 
> https://gcc.gnu.org/ml/gcc-patches/2017-07/msg00400.html
> 

Here is the updated patch to fix

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81769

OK for trunk?

Thanks.

H.J.
---
ix86_finalize_stack_frame_flags has been extended to eliminate frame
pointer when the new stack frame isn't needed with and without
-maccumulate-outgoing-args as well as -fomit-frame-pointer.  Since stack
access with larger alignment may be optimized out, to decide if stack
realignment is needed, we need to not only check for stack frame access,
but also verify the alignment of stack frame access.  Since alignment of
memory access via arg_pointer is set up by caller, not by callee, we
should find the maximum stack alignment from the stack frame access
instructions via stack pointer and frame pointrer to avoid stack
realignment when stack alignment needed is less than incoming stack
boundary.

gcc/

	PR target/59501
	PR target/81624
	PR target/81769
	* config/i386/i386.c (ix86_finalize_stack_frame_flags): Don't
	realign stack if stack alignment needed is less than incoming
	stack boundary.

gcc/testsuite/

	PR target/59501
	PR target/81624
	PR target/81769
	* gcc.target/i386/pr59501-4a.c: Remove xfail.
	* gcc.target/i386/pr81769-1a.c: New test.
	* gcc.target/i386/pr81769-1b.c: Likewise.
	* gcc.target/i386/pr81769-2.c: Likewise.
---
 gcc/config/i386/i386.c                     | 143 ++++++++++++++++++-----------
 gcc/testsuite/gcc.target/i386/pr59501-4a.c |   2 +-
 gcc/testsuite/gcc.target/i386/pr81769-1a.c |  21 +++++
 gcc/testsuite/gcc.target/i386/pr81769-1b.c |   7 ++
 gcc/testsuite/gcc.target/i386/pr81769-2.c  |  21 +++++
 5 files changed, 138 insertions(+), 56 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr81769-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr81769-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr81769-2.c
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 1d88e4f247a..2161f79b999 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -14237,6 +14237,11 @@  ix86_finalize_stack_frame_flags (void)
       add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
       add_to_hard_reg_set (&set_up_by_prologue, Pmode,
 			   HARD_FRAME_POINTER_REGNUM);
+
+      /* The preferred stack alignment is the minimum stack alignment.  */
+      unsigned int stack_alignment = crtl->preferred_stack_boundary;
+      bool require_stack_frame = false;
+
       FOR_EACH_BB_FN (bb, cfun)
         {
           rtx_insn *insn;
@@ -14245,79 +14250,107 @@  ix86_finalize_stack_frame_flags (void)
 		&& requires_stack_frame_p (insn, prologue_used,
 					   set_up_by_prologue))
 	      {
-		if (crtl->stack_realign_needed != stack_realign)
-		  recompute_frame_layout_p = true;
-		crtl->stack_realign_needed = stack_realign;
-		crtl->stack_realign_finalized = true;
-		if (recompute_frame_layout_p)
-		  ix86_compute_frame_layout ();
-		return;
+		require_stack_frame = true;
+
+		if (stack_realign)
+		  {
+		    /* Find the maximum stack alignment.  */
+		    subrtx_iterator::array_type array;
+		    FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
+		      if (MEM_P (*iter)
+			  && (reg_mentioned_p (stack_pointer_rtx,
+					       *iter)
+			      || reg_mentioned_p (frame_pointer_rtx,
+						  *iter)))
+			{
+			  unsigned int alignment = MEM_ALIGN (*iter);
+			  if (alignment > stack_alignment)
+			    stack_alignment = alignment;
+			}
+		  }
 	      }
 	}
 
-      /* If drap has been set, but it actually isn't live at the start
-	 of the function, there is no reason to set it up.  */
-      if (crtl->drap_reg)
+      if (require_stack_frame)
 	{
-	  basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
-	  if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
+	  /* Stack frame is required.  If stack alignment needed is less
+	     than incoming stack boundary, don't realign stack.  */
+	  stack_realign = incoming_stack_boundary < stack_alignment;
+	  if (!stack_realign)
 	    {
-	      crtl->drap_reg = NULL_RTX;
-	      crtl->need_drap = false;
+	      crtl->max_used_stack_slot_alignment
+		= incoming_stack_boundary;
+	      crtl->stack_alignment_needed
+		= incoming_stack_boundary;
 	    }
 	}
       else
-	cfun->machine->no_drap_save_restore = true;
-
-      frame_pointer_needed = false;
-      stack_realign = false;
-      crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
-      crtl->stack_alignment_needed = incoming_stack_boundary;
-      crtl->stack_alignment_estimated = incoming_stack_boundary;
-      if (crtl->preferred_stack_boundary > incoming_stack_boundary)
-	crtl->preferred_stack_boundary = incoming_stack_boundary;
-      df_finish_pass (true);
-      df_scan_alloc (NULL);
-      df_scan_blocks ();
-      df_compute_regs_ever_live (true);
-      df_analyze ();
-
-      if (flag_var_tracking)
 	{
-	  /* Since frame pointer is no longer available, replace it with
-	     stack pointer - UNITS_PER_WORD in debug insns.  */
-	  df_ref ref, next;
-	  for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
-	       ref; ref = next)
+	  /* If drap has been set, but it actually isn't live at the
+	     start of the function, there is no reason to set it up.  */
+	  if (crtl->drap_reg)
 	    {
-	      rtx_insn *insn = DF_REF_INSN (ref);
-	      /* Make sure the next ref is for a different instruction,
-		 so that we're not affected by the rescan.  */
-	      next = DF_REF_NEXT_REG (ref);
-	      while (next && DF_REF_INSN (next) == insn)
-		next = DF_REF_NEXT_REG (next);
-
-	      if (DEBUG_INSN_P (insn))
+	      basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
+	      if (! REGNO_REG_SET_P (DF_LR_IN (bb),
+				     REGNO (crtl->drap_reg)))
+		{
+		  crtl->drap_reg = NULL_RTX;
+		  crtl->need_drap = false;
+		}
+	    }
+	  else
+	    cfun->machine->no_drap_save_restore = true;
+
+	  frame_pointer_needed = false;
+	  stack_realign = false;
+	  crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
+	  crtl->stack_alignment_needed = incoming_stack_boundary;
+	  crtl->stack_alignment_estimated = incoming_stack_boundary;
+	  if (crtl->preferred_stack_boundary > incoming_stack_boundary)
+	    crtl->preferred_stack_boundary = incoming_stack_boundary;
+	  df_finish_pass (true);
+	  df_scan_alloc (NULL);
+	  df_scan_blocks ();
+	  df_compute_regs_ever_live (true);
+	  df_analyze ();
+
+	  if (flag_var_tracking)
+	    {
+	      /* Since frame pointer is no longer available, replace it with
+		 stack pointer - UNITS_PER_WORD in debug insns.  */
+	      df_ref ref, next;
+	      for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
+		   ref; ref = next)
 		{
-		  bool changed = false;
-		  for (; ref != next; ref = DF_REF_NEXT_REG (ref))
+		  rtx_insn *insn = DF_REF_INSN (ref);
+		  /* Make sure the next ref is for a different instruction,
+		     so that we're not affected by the rescan.  */
+		  next = DF_REF_NEXT_REG (ref);
+		  while (next && DF_REF_INSN (next) == insn)
+		    next = DF_REF_NEXT_REG (next);
+
+		  if (DEBUG_INSN_P (insn))
 		    {
-		      rtx *loc = DF_REF_LOC (ref);
-		      if (*loc == hard_frame_pointer_rtx)
+		      bool changed = false;
+		      for (; ref != next; ref = DF_REF_NEXT_REG (ref))
 			{
-			  *loc = plus_constant (Pmode,
-						stack_pointer_rtx,
-						-UNITS_PER_WORD);
-			  changed = true;
+			  rtx *loc = DF_REF_LOC (ref);
+			  if (*loc == hard_frame_pointer_rtx)
+			    {
+			      *loc = plus_constant (Pmode,
+						    stack_pointer_rtx,
+						    -UNITS_PER_WORD);
+			      changed = true;
+			    }
 			}
+		      if (changed)
+			df_insn_rescan (insn);
 		    }
-		  if (changed)
-		    df_insn_rescan (insn);
 		}
 	    }
-	}
 
-      recompute_frame_layout_p = true;
+	  recompute_frame_layout_p = true;
+	}
     }
 
   if (crtl->stack_realign_needed != stack_realign)
diff --git a/gcc/testsuite/gcc.target/i386/pr59501-4a.c b/gcc/testsuite/gcc.target/i386/pr59501-4a.c
index 5c3cb683a2e..908c7f457b6 100644
--- a/gcc/testsuite/gcc.target/i386/pr59501-4a.c
+++ b/gcc/testsuite/gcc.target/i386/pr59501-4a.c
@@ -5,4 +5,4 @@ 
 #include "pr59501-3a.c"
 
 /* Verify no dynamic realignment is performed.  */
-/* { dg-final { scan-assembler-not "and\[^\n\r]*sp" { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*sp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr81769-1a.c b/gcc/testsuite/gcc.target/i386/pr81769-1a.c
new file mode 100644
index 00000000000..8ebe7292184
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr81769-1a.c
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O -mavx" } */
+
+typedef int v8si __attribute__ ((vector_size (32)));
+typedef unsigned long long int u64 __attribute__ ((aligned(64)));
+
+
+void
+#ifndef __x86_64__
+__attribute__((regparm(3)))
+#endif
+foo (u64 *idx, v8si *out_start, v8si *regions)
+{
+  if (*idx < 20 ) {
+    v8si base = regions[*idx];
+    *out_start = base;
+  }
+}
+
+/* Verify no dynamic realignment is performed.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*sp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr81769-1b.c b/gcc/testsuite/gcc.target/i386/pr81769-1b.c
new file mode 100644
index 00000000000..6505a5f0074
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr81769-1b.c
@@ -0,0 +1,7 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O -mavx -fno-omit-frame-pointer" } */
+
+#include "pr81769-1a.c"
+
+/* Verify no dynamic realignment is performed.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*sp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr81769-2.c b/gcc/testsuite/gcc.target/i386/pr81769-2.c
new file mode 100644
index 00000000000..e020db20227
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr81769-2.c
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O -mavx -fno-omit-frame-pointer" } */
+
+typedef unsigned long long int u64 __attribute__ ((aligned(64)));
+
+void
+#ifndef __x86_64__
+__attribute__((regparm(3)))
+#endif
+foo (u64 *idx, unsigned int *out_start, unsigned int *out_end,
+     unsigned int *regions)
+{
+  if (*idx < 20 ) {
+    unsigned int base = regions[*idx];
+    *out_start = base;
+    *out_end = base;
+  }
+}
+
+/* Verify no dynamic realignment is performed.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*sp" } } */