diff mbox

[MIPS] MIPS specific optimization for o32 ABI

Message ID 1438361507.19674.252.camel@ubuntu-sellcey
State New
Headers show

Commit Message

Steve Ellcey July 31, 2015, 4:51 p.m. UTC
On Fri, 2015-07-31 at 00:32 +0000, Joseph Myers wrote:
> New command-line options need documenting in invoke.texi.

Good point, thanks for catching that.  Here is an updated patch with
invoke.texi.  There are no other changes to the patch.

Steve Ellcey
sellcey@imgtec.com



2015-07-31  Steve Ellcey  <sellcey@imgtec.com>
	    Zoran Jovanovic  <zoran.jovanovic@imgtec.com>
	    Catherine Moore  <clm@codesourcery.com>
	    Tom de Vries  <tom@codesourcery.com>

	* config/mips/mips.opt (mframe-header-opt): New option.
	* config/mips/mips.c (struct mips_frame_info): Add
	skip_stack_frame_allocation_p field.
	(struct machine_function): Add callees_use_frame_header_p,
	uses_frame_header_p, and initial_total_size fields.
	(mips_frame_header_usage): New hash.
	(mips_find_if_frame_header_is_used): New Function.
	(mips_callee_use_frame_header): New Function.
	(mips_callees_use_frame_header_p): New Function.
	(mips_cfun_use_frame_header_p): New Function.
	(mips_get_updated_offset): New Function.
	(mips_skip_stack_frame_alloc): New Function.
	(mips_frame_header_update_insn): New Function.
	(mips_rest_of_frame_header_opt): New function.
	(mips_compute_frame_info): Add recalculate and frame arguments.
	(mips_frame_pointer_required): Add new args to
	mips_compute_frame_info call.
	(mips_initial_elimination_offset): Ditto.
	(mips_gp_expand_needed_p): New function factored out of
	mips_expand_ghost_gp_insns.
	(mips_expand_ghost_gp_insns): Use mips_gp_expand_needed_p.
	(mips_reorg): Use mips_rest_of_frame_header_opt.
	* doc/invoke.texi (MIPS Options): Document -mframe-header-opt flag.


2015-07-31  Steve Ellcey  <sellcey@imgtec.com>
	    Tom de Vries  <tom@codesourcery.com>

	* gcc.target/mips/fho-1.c: New test.
	* gcc.target/mips/fho-2.c: New test.
	* gcc.target/mips/mips.exp: Add -mframe-header-opt to
	mips_option_groups.

Comments

Matthew Fortune Aug. 13, 2015, 9:14 a.m. UTC | #1
Hi Steve,

Overall, I don't think these optimizations are ready to include. In principle
the idea looks good but it is done at the wrong point in the compiler in my
opinion.

The biggest concern I have is that the analysis should be possible at (or
prior to) the point where the prologue/epilogue are expanded. I don't think
it is safe enough to post-process the code and delete the stack allocation.

There is at least one other optimization idea that competes with this one
which is to allow LRA to use the argument save area for arbitrary spills when
it is not used for spilling arguments or to prepare varargs. I think we need
to at least consider how the frame header removal will interact with such
an optimization.

I'd also like to see this kind of optimization be on by default and the
fact it is off by default in this patch suggests you/whoever originally
wrote it is not confident enough about its safety.

I've been through the code in detail anyway as there are a couple of things
that should be addressed if you use this in its current form elsewhere.

Thanks,
Matthew

>diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
>index c3cd52d..7cdef89 100644
>--- a/gcc/config/mips/mips.c
>+++ b/gcc/config/mips/mips.c
>@@ -77,6 +77,7 @@ along with GCC; see the file COPYING3.  If not see
> #include "cgraph.h"
> #include "builtins.h"
> #include "rtl-iter.h"
>+#include "dumpfile.h"
> 
> /* This file should be included last.  */
> #include "target-def.h"
>@@ -380,6 +381,9 @@ struct GTY(())  mips_frame_info {
> 
>   /* The offset of hard_frame_pointer_rtx from the bottom of the frame.  */
>   HOST_WIDE_INT hard_frame_pointer_offset;
>+
>+  /* Skip stack frame allocation if possible.  */
>+  bool skip_stack_frame_allocation_p;
> };
> 
> /* Enumeration for masked vectored (VI) and non-masked (EIC) interrupts.  */
>@@ -472,6 +476,15 @@ struct GTY(())  machine_function {
>   /* True if this is an interrupt handler that should use DERET
>      instead of ERET.  */
>   bool use_debug_exception_return_p;
>+
>+  /* True if some of the callees uses its frame header.  */

Is this supposed to say 'use their frame header'?

>+  bool callees_use_frame_header_p;
>+
>+  /* True if current function uses its frame header.  */
>+  bool uses_frame_header_p;
>+
>+  /* Frame size before updated by optimizations.  */
>+  HOST_WIDE_INT initial_total_size;
> };
> 
> /* Information about a single argument.  */
>@@ -574,6 +587,8 @@ struct mips_rtx_cost_data
> 
> /* Global variables for machine-dependent things.  */
> 
>+static hash_map<tree, bool> *mips_frame_header_usage;
>+
> /* The -G setting, or the configuration's default small-data limit if
>    no -G option is given.  */
> static unsigned int mips_small_data_threshold;
>@@ -1296,6 +1311,7 @@ static const struct mips_rtx_cost_data
>   }
> };
> 

>+static void mips_rest_of_frame_header_opt (void);
> static rtx mips_find_pic_call_symbol (rtx_insn *, rtx, bool);
> static int mips_register_move_cost (machine_mode, reg_class_t,
> 				    reg_class_t);
>@@ -10358,6 +10374,114 @@ mips_save_reg_p (unsigned int regno)
>   return false;
> }
> 
>+/* Try to find if function may use its incoming frame header.  */
>+
>+static bool
>+mips_find_if_frame_header_is_used (tree fndecl)

mips_frame_header_used_p

>+{
>+  bool *frame_header_unused;

What is the pointer doing? This should just be a bool.

>+
>+  if (mips_frame_header_usage)
>+    frame_header_unused = mips_frame_header_usage->get (fndecl);
>+  else
>+    frame_header_unused = false;
>+
>+  return !frame_header_unused;

Is this just supposed to be (or maybe inverted, the logic looks weird):

  return !(mips_frame_header_usage
	   && mips_frame_header_usage->get (fndecl));

This should be inlined in the function below it only has one caller.

>+}
>+
>+/* Return true if the instruction is a call and the called function may use its
>+   incoming frame header.  */
>+
>+static bool
>+mips_callee_use_frame_header (rtx_insn *insn)
>+{
>+  rtx call_insn;
>+  tree fndecl;
>+
>+  if (insn == NULL_RTX || !USEFUL_INSN_P (insn))
>+    return false;
>+
>+  /* Handle sequence of instructions.  */
>+  if (GET_CODE (PATTERN (insn)) == SEQUENCE)
>+    {
>+      rtx_insn *subinsn;
>+      FOR_EACH_SUBINSN (subinsn, insn)
>+	if (INSN_P (subinsn) && mips_callee_use_frame_header (subinsn))
>+	  return true;
>+    }
>+
>+  if (GET_CODE (insn) != CALL_INSN)
>+    return false;
>+
>+  if (GET_CODE (PATTERN (insn)) != PARALLEL
>+      || GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) != SET)
>+    return true;

I don't understand this. It says there is a callee frame header usage if the
instruction does not have this pattern

(parallel [(set...)...])

Why? (I'm probably just being dumb here)

>+
>+  call_insn = SET_SRC (XVECEXP (PATTERN (insn), 0, 0));
>+
>+  if (GET_CODE (call_insn) != CALL
>+      || GET_CODE (XEXP (call_insn, 0)) != MEM
>+      || GET_CODE (XEXP (XEXP (call_insn, 0), 0)) != SYMBOL_REF)
>+    return true;

MEM_P, SYMBOL_REF_P etc.

Weak symbols? We must assume the worst for preemptable symbols.

>+
>+  fndecl = SYMBOL_REF_DECL (XEXP (XEXP (call_insn, 0), 0));
>+
>+  if (fndecl == current_function_decl)
>+    return true;
>+
>+  return mips_find_if_frame_header_is_used (fndecl);
>+}
>+
>+/* Return true if any of the callee functions may use its incoming frame

their incoming frame...

>+   header.  */
>+
>+static bool
>+mips_callees_use_frame_header_p (void)
>+{
>+  rtx_insn *insn;
>+
>+  /* Iterate through all instructions in the current function and check whether
>+     only already seen functions may be called.  Assume that any unseen function
>+     may use its incoming frame header.  */
>+  for (insn = get_insns (); insn != NULL_RTX; insn = NEXT_INSN (insn))
>+    if (mips_callee_use_frame_header (insn))
>+      return true;

Let's at least pull all the basic instruction iterating code into here. I.e. this
code at least (it doesn't need to check for a sequence aas FOR_EACH_SUBINSN is
safe on a single insn). It may simply be worth inlining the whole thing as the
separation doesn't seem to make things any clearer.

      rtx_insn *subinsn;
      FOR_EACH_SUBINSN (subinsn, insn)
	if (INSN_P (subinsn) && mips_callee_use_frame_header (subinsn))
	  return true;

>+
>+  return false;
>+}
>+
>+/* Return true if the current function may use its incoming frame header.
>+   If destination of memory store in format sp + offset and offset is greater
>+   or equal than frame->total_size than this function returns true.
>+   */

So is this code really just looking for spills/reloads? The restrictions on
applying this optimisation seem like they are basically saying the function
must have no need for frame. I don't see any reason to have such a large
number of restrictions, the only thing that matters is the use of the
incoming argument area.

>+
>+static bool
>+mips_cfun_use_frame_header_p (void)
>+{
>+  rtx_insn *insn;
>+
>+  for (insn = get_insns (); insn != NULL_RTX; insn = NEXT_INSN (insn))
>+    {
>+      if (insn != NULL_RTX && INSN_P (insn)

remove insn != NULL_RTX it is redundant.

>+	  && GET_CODE (PATTERN (insn)) == SET
>+	  && MEM_P (XEXP (PATTERN (insn), 0)))
>+	{
>+	  rtx mem_dst = XEXP (XEXP (PATTERN (insn), 0), 0);
>+	  if (GET_CODE (mem_dst) == PLUS
>+	      && CONST_INT_P (XEXP (mem_dst, 1))
>+	      && REG_P (XEXP (mem_dst, 0))
>+	      && REGNO (XEXP (mem_dst, 0)) == STACK_POINTER_REGNUM)

I'm uneasy about how simplistic the test is for using the frame header.
It probably works today but other optimisations could create things
like anchors into the frame and offset from there.

>+	    {
>+	      int offset = INTVAL (XEXP (mem_dst, 1));

HOST_WIDE_INT offset

>+	      if (offset >= cfun->machine->initial_total_size)
>+		return true;
>+	    }
>+	}
>+    }
>+
>+  return false;
>+}
>+
> /* Populate the current function's mips_frame_info structure.
> 
>    MIPS stack frames look like:
>@@ -10429,9 +10553,8 @@ mips_save_reg_p (unsigned int regno)
>    hard_frame_pointer_rtx unchanged.  */
> 
> static void
>-mips_compute_frame_info (void)
>+mips_compute_frame_info (bool recalculate, struct mips_frame_info *frame)
> {
>-  struct mips_frame_info *frame;
>   HOST_WIDE_INT offset, size;
>   unsigned int regno, i;
> 
>@@ -10457,11 +10580,11 @@ mips_compute_frame_info (void)
> 	}
>     }
> 
>-  frame = &cfun->machine->frame;
>   memset (frame, 0, sizeof (*frame));
>   size = get_frame_size ();
> 
>   cfun->machine->global_pointer = mips_global_pointer ();
>+  frame->cprestore_size = 0;

This has just been zero'd 3 lines earlier.

> 
>   /* The first two blocks contain the outgoing argument area and the $gp save
>      slot.  This area isn't needed in leaf functions, but if the
>@@ -10477,12 +10600,18 @@ mips_compute_frame_info (void)
> 	frame->args_size = REG_PARM_STACK_SPACE (cfun->decl);
>       else
> 	frame->args_size = 0;
>-      frame->cprestore_size = 0;

If you are going to delete this then delete the 'else' and args_size = 0 as well.

>     }
>   else
>     {
>-      frame->args_size = crtl->outgoing_args_size;
>-      frame->cprestore_size = MIPS_GP_SAVE_AREA_SIZE;
>+      /* If recalculate do not take args_size into account.  */

This comment needs to say why this happens. I don't follow why from the code.

>+      if (recalculate)
>+	frame->args_size = 0;
>+      else

No need to zero args_size just leave it as it has been memzero'd.

if (!recalculate)

>+	frame->args_size = crtl->outgoing_args_size;
>+
>+      /* Check if space allocated on stack for gp will be used.  */
>+      if (!recalculate || mips_must_initialize_gp_p ())
>+	frame->cprestore_size = MIPS_GP_SAVE_AREA_SIZE;

Why is this dependent on !recalculate? Please split out the fix for
not allocating space for cprestore as that can be applied
independently, it looks like a simple bug fix.

>     }
>   offset = frame->args_size + frame->cprestore_size;
> 
>@@ -10606,6 +10735,9 @@ mips_compute_frame_info (void)
>      instructions for local variables and incoming arguments.  */
>   if (TARGET_MIPS16)
>     frame->hard_frame_pointer_offset = frame->args_size;
>+
>+  if (!recalculate)
>+    cfun->machine->initial_total_size = frame->total_size;
> }
> 
> /* Return the style of GP load sequence that is being used for the
>@@ -10642,7 +10774,7 @@ mips_frame_pointer_required (void)
>      without using a second temporary register.  */
>   if (TARGET_MIPS16)
>     {
>-      mips_compute_frame_info ();
>+      mips_compute_frame_info (false, &cfun->machine->frame);
>       if (!SMALL_OPERAND (cfun->machine->frame.total_size))
> 	return true;
>     }
>@@ -10668,7 +10800,7 @@ mips_initial_elimination_offset (int from, int to)
> {
>   HOST_WIDE_INT offset;
> 
>-  mips_compute_frame_info ();
>+  mips_compute_frame_info (false, &cfun->machine->frame);
> 
>   /* Set OFFSET to the offset from the end-of-prologue stack pointer.  */
>   switch (from)
>@@ -16838,12 +16970,8 @@ mips_has_long_branch_p (void)
>   return false;
> }
> 
>-/* If we are using a GOT, but have not decided to use a global pointer yet,
>-   see whether we need one to implement long branches.  Convert the ghost
>-   global-pointer instructions into real ones if so.  */
>-
> static bool
>-mips_expand_ghost_gp_insns (void)
>+mips_gp_expand_needed_p (void)
> {
>   /* Quick exit if we already know that we will or won't need a
>      global pointer.  */
>@@ -16857,12 +16985,28 @@ mips_expand_ghost_gp_insns (void)
>     return false;
> 
>   /* We've now established that we need $gp.  */
>-  cfun->machine->must_initialize_gp_p = true;
>-  split_all_insns_noflow ();
>-
>   return true;
> }
> 
>+
>+/* If we are using a GOT, but have not decided to use a global pointer yet,
>+   see whether we need one to implement long branches.  Convert the ghost
>+   global-pointer instructions into real ones if so.  */
>+
>+static bool
>+mips_expand_ghost_gp_insns (void)
>+{
>+
>+  if (mips_gp_expand_needed_p ())
>+    {
>+      /* We've now established that we need $gp.  */
>+      cfun->machine->must_initialize_gp_p = true;
>+      split_all_insns_noflow ();
>+      return true;
>+    }
>+  return false;
>+}
>+
> /* Subroutine of mips_reorg to manage passes that require DF.  */
> 
> static void
>@@ -17004,6 +17148,9 @@ mips_reorg (void)
>       mips_df_reorg ();
>       free_bb_for_insn ();
>     }
>+
>+  if (flag_frame_header_optimization)
>+    mips_rest_of_frame_header_opt ();
> }
> 
> /* We use a machine specific pass to do a second machine dependent reorg
>@@ -18802,6 +18949,164 @@ mips_prepare_pch_save (void)
>   mips_set_compression_mode (0);
>   mips16_globals = 0;
> }
>+
>+/* Return new offset for stack load/store operations.  */
>+
>+static int
>+mips_get_updated_offset (int old_offset)
>+{
>+  struct mips_frame_info *frame = &cfun->machine->frame;
>+  int res = old_offset;
>+  int initial_total_size = cfun->machine->initial_total_size;
>+
>+  if (old_offset > 0 && old_offset <= frame->gp_sp_offset)
>+    /* It should be only gp.  */
>+    res = old_offset - (initial_total_size
>+			- REG_PARM_STACK_SPACE (cfun->decl));
>+  else if (old_offset >= frame->gp_sp_offset
>+	   && old_offset <= initial_total_size)
>+    /* gp registers, accumulators.  */
>+    res = old_offset - (initial_total_size
>+			- REG_PARM_STACK_SPACE (cfun->decl));

This reduces to old_offset < 0 & old_offset <= initial_total_size.

I don't understand the reason for the original separation nor the comment
about 'it should be only gp' will it only ever be gp or is there some
chance that it can be something else?

>+  else if (old_offset > initial_total_size)
>+    /* Incoming args.  */
>+    res = old_offset - initial_total_size;
>+
>+  return res;
>+}
>+
>+/* Test whether to skip frame header allocation.  TODO: Try to do stack
>+   frame allocation removal even if local variables are used.  */
>+
>+static bool
>+mips_skip_stack_frame_alloc (void)
>+{
>+  struct mips_frame_info *frame = &cfun->machine->frame;
>+  struct mips_frame_info opt_frame;
>+
>+  if (!flag_frame_header_optimization)
>+    return false;
>+
>+  if (cfun->calls_setjmp != 0
>+      || cfun->calls_alloca != 0
>+      || cfun->stdarg != 0
>+      || crtl->shrink_wrapped
>+      || frame->var_size != 0
>+      || frame->args_size > REG_PARM_STACK_SPACE (cfun->decl)
>+      || mips_abi != ABI_32
>+      || TARGET_MIPS16
>+      || TARGET_MICROMIPS
>+      || frame_pointer_needed != 0
>+      || mips_gp_expand_needed_p ())
>+    return false;

Ugh! This is fairly tightly constrained. I'm not sure all of this is
necessary.

>+
>+  if (mips_callees_use_frame_header_p ())
>+    return false;
>+
>+  mips_compute_frame_info (true, &opt_frame);
>+
>+  if (opt_frame.total_size > REG_PARM_STACK_SPACE (cfun->decl)
>+      || cfun->machine->uses_frame_header_p)
>+    return false;

I'm probably being dumb again but I'm struggling to understand why
this is the key condition for detecting when the optimisation can be
done. Since it would need a comment anyway can you explain?

>+
>+  return true;
>+}
>+
>+/* Update stack related instructions.  */
>+
>+static void
>+mips_frame_header_update_insn (rtx_insn *insn)
>+{
>+  rtx set_insn, src, dst;
>+
>+  if (insn == NULL_RTX || !USEFUL_INSN_P (insn))
>+    return;
>+
>+  set_insn = single_set (insn);
>+  if (set_insn == NULL_RTX)
>+    return;
>+
>+  src = SET_SRC (set_insn);
>+  dst = SET_DEST (set_insn);
>+
>+  if (GET_CODE (src) == REG && GET_CODE (dst) == MEM
>+      && GET_CODE (XEXP (dst, 0)) == PLUS
>+      && GET_CODE (XEXP (XEXP (dst, 0), 0)) == REG
>+      && CONST_INT_P (XEXP (XEXP (dst, 0), 1))
>+      && (REGNO (XEXP (XEXP (dst, 0), 0))
>+          == STACK_POINTER_REGNUM))

Use MEM_P and REG_P.

>+    {
>+      /* It is a store through sp - update offset.  */
>+      XEXP (XEXP (dst, 0), 1)
>+        = GEN_INT (mips_get_updated_offset (INTVAL (XEXP (XEXP (dst, 0), 1))));
>+      return;
>+    }
>+
>+  if (GET_CODE (src) == MEM && GET_CODE (dst) == REG
>+      && GET_CODE (XEXP (src, 0)) == PLUS
>+      && GET_CODE (XEXP (XEXP (src, 0), 0)) == REG
>+      && CONST_INT_P (XEXP (XEXP (src, 0), 1))
>+      && (REGNO (XEXP (XEXP (src, 0), 0))
>+          == STACK_POINTER_REGNUM))
>+    {
>+      /* It is a load through sp - update offset.  */
>+      XEXP (XEXP (src, 0), 1)
>+        = GEN_INT (mips_get_updated_offset (INTVAL (XEXP (XEXP (src, 0), 1))));
>+      return;
>+    }
>+
>+  if (GET_CODE (src) == PLUS
>+      && GET_CODE (XEXP (src, 0)) == REG
>+      && CONST_INT_P (XEXP (src, 1))
>+      && REGNO (XEXP (src, 0)) == STACK_POINTER_REGNUM
>+      && REGNO (SET_DEST (set_insn)) == STACK_POINTER_REGNUM)
>+    delete_insn (insn);
>+}

As in my summary, I don't like this. It feels dangerous and unnecessary to apply
the optimization so late.

>+
>+/* Entry function for the frame header optimization.  */
>+
>+static void
>+mips_rest_of_frame_header_opt (void)
>+{
>+  rtx_insn *insn;
>+  bool skip_stack_frame_alloc;
>+  struct mips_frame_info *frame = &cfun->machine->frame;
>+
>+  cfun->machine->uses_frame_header_p = mips_cfun_use_frame_header_p ();
>+  skip_stack_frame_alloc = mips_skip_stack_frame_alloc ();
>+
>+  /* Check if it is needed to recalculate stack frame info.  */
>+  if (skip_stack_frame_alloc)
>+    mips_compute_frame_info (true, frame);
>+
>+  if ((skip_stack_frame_alloc && frame->total_size == 0)
>+      || (!skip_stack_frame_alloc && !cfun->machine->uses_frame_header_p
>+	  && !cfun->stdarg))
>+    {
>+      /* Function does not use its incoming frame header.  */
>+
>+      if (!mips_frame_header_usage)
>+	mips_frame_header_usage = new hash_map<tree, bool>;
>+
>+      tree fndecl = current_function_decl;
>+      bool existed;
>+      bool &frame_hdr_unused = mips_frame_header_usage->get_or_insert (fndecl, &existed);
>+      if (!existed)
>+	frame_hdr_unused = true;
>+    }
>+
>+  if (skip_stack_frame_alloc)
>+    {
>+      if (dump_file && cfun->machine->initial_total_size > frame->total_size)
>+	fprintf (dump_file, "Frame size reduced by frame header optimization"
>+		 " from %ld to %ld.\n", cfun->machine->initial_total_size,
>+		 frame->total_size);
>+
>+      /* Update instructions.  */
>+      for (insn = get_insns (); insn != NULL_RTX; insn = NEXT_INSN (insn))
>+	mips_frame_header_update_insn (insn);
>+    }
>+}
> 

> /* Generate or test for an insn that supports a constant permutation.  */
> 
>diff --git a/gcc/config/mips/mips.opt b/gcc/config/mips/mips.opt
>index 348c6e0..3e72936 100644
>--- a/gcc/config/mips/mips.opt
>+++ b/gcc/config/mips/mips.opt
>@@ -412,6 +412,10 @@ modd-spreg
> Target Report Mask(ODD_SPREG)
> Enable use of odd-numbered single-precision registers
> 
>+mframe-header-opt
>+Target Report Var(flag_frame_header_optimization) Optimization
>+Optimize frame header
>+
> noasmopt
> Driver
> 
>diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>index 413ac16..3b7b1b6 100644
>--- a/gcc/doc/invoke.texi
>+++ b/gcc/doc/invoke.texi
>@@ -813,7 +813,8 @@ Objective-C and Objective-C++ Dialects}.
> -mbranch-cost=@var{num}  -mbranch-likely  -mno-branch-likely @gol
> -mfp-exceptions -mno-fp-exceptions @gol
> -mvr4130-align -mno-vr4130-align -msynci -mno-synci @gol
>--mrelax-pic-calls -mno-relax-pic-calls -mmcount-ra-address}
>+-mrelax-pic-calls -mno-relax-pic-calls -mmcount-ra-address @gol
>+-mframe-header-opt}

add -mno-frame-header-opt.

> 
> @emph{MMIX Options}
> @gccoptlist{-mlibfuncs  -mno-libfuncs  -mepsilon  -mno-epsilon  -mabi=gnu @gol
>@@ -18013,6 +18014,18 @@ if @var{ra-address} is nonnull.
> 
> The default is @option{-mno-mcount-ra-address}.
> 
>+@item -mframe-header-opt
>+@itemx -mno-frame-header-opt
>+@opindex mframe-header-opt
>+Enable (disable) frame header optimization in the O32 ABI.  When using
>+the O32 ABI, calling functions allocate 16 bytes on the stack in case
>+the called function needs to write out register arguments to memory so
>+that their address can be taken.  When enabled, this optimization allows
>+the called function to use those 16 bytes for other purposes if the
>+arguments do not need to be written to memory.
>+
>+This optimization is off by default at all optimization levels.
>+
> @end table
> 
> @node MMIX Options
Steve Ellcey Aug. 18, 2015, 11:30 p.m. UTC | #2
On Thu, 2015-08-13 at 02:14 -0700, Matthew Fortune wrote:
> Hi Steve,
> 
> Overall, I don't think these optimizations are ready to include. In principle
> the idea looks good but it is done at the wrong point in the compiler in my
> opinion.
> 
> The biggest concern I have is that the analysis should be possible at (or
> prior to) the point where the prologue/epilogue are expanded. I don't think
> it is safe enough to post-process the code and delete the stack allocation.

I think that to do this, what we would have to do is introduce a new
pass at the tree level (just before expanding to rtl) where we could do
the analysis of whether or not the outgoing argument area is needed or
not.  Then we could use that info during expand_prologue to reset
frame->args_size if the space is not needed.

> There is at least one other optimization idea that competes with this one
> which is to allow LRA to use the argument save area for arbitrary spills when
> it is not used for spilling arguments or to prepare varargs. I think we need
> to at least consider how the frame header removal will interact with such
> an optimization.

I am not sure how this would work.  It seems better to just not allocate
the space if it is not needed and then LRA can separately allocate
whatever it needs for its own use (if any).  I'll add Robert to the cc
list on this in case he has any ideas since he did the LRA
implementation for MIPS.

Steve Ellcey
sellcey@imgtec.com
Matthew Fortune Aug. 19, 2015, 8:33 a.m. UTC | #3
Steve Ellcey <Steve.Ellcey@imgtec.com> writes:
> On Thu, 2015-08-13 at 02:14 -0700, Matthew Fortune wrote:
> > Hi Steve,
> >
> > Overall, I don't think these optimizations are ready to include. In principle
> > the idea looks good but it is done at the wrong point in the compiler in my
> > opinion.
> >
> > The biggest concern I have is that the analysis should be possible at (or
> > prior to) the point where the prologue/epilogue are expanded. I don't think
> > it is safe enough to post-process the code and delete the stack allocation.
> 
> I think that to do this, what we would have to do is introduce a new
> pass at the tree level (just before expanding to rtl) where we could do
> the analysis of whether or not the outgoing argument area is needed or
> not.  Then we could use that info during expand_prologue to reset
> frame->args_size if the space is not needed.

One thought was that this problem seems to fit into the category of ipa-ra and
while I don't know how that is implemented or if it is early enough... it may
be worth seeing if extra information can be calculated there.

> > There is at least one other optimization idea that competes with this one
> > which is to allow LRA to use the argument save area for arbitrary spills when
> > it is not used for spilling arguments or to prepare varargs. I think we need
> > to at least consider how the frame header removal will interact with such
> > an optimization.
> 
> I am not sure how this would work.  It seems better to just not allocate
> the space if it is not needed and then LRA can separately allocate
> whatever it needs for its own use (if any).  I'll add Robert to the cc
> list on this in case he has any ideas since he did the LRA
> implementation for MIPS.

I think between these two there will always be one optimization that has to
come first and win. If we decide prior to expansion whether an outgoing argument
area is needed (and therefore also decide if an incoming argument area is
available in any given function) then we will of course preclude any use of
this area for spilling/locals in the callee. The saving when re-using this
area is that the callee doesn't have to do stack allocation which could be
a performance win if called in a loop. Removing the stack allocation from the
caller is not as big of a win.

Perhaps balancing the two optimizations (if/when we do the LRA one) can be
fit in later without too much trouble.

Thanks,
Matthew
diff mbox

Patch

diff --git a/gcc/testsuite/gcc.target/mips/fho-1.c b/gcc/testsuite/gcc.target/mips/fho-1.c
new file mode 100644
index 0000000..e373da4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/fho-1.c
@@ -0,0 +1,36 @@ 
+/* { dg-do compile } */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
+/* { dg-options "-mabi=32 -mframe-header-opt -fdump-rtl-mach" } */
+/* Testing -mframe-header-opt optimization option.  */
+
+NOCOMPRESSION int __attribute__((noinline))
+B (int x)
+{
+  return x + 3;
+}
+
+/* We are sure that B is not using its incoming stack frame so we can skip
+   its allocation.  */
+NOCOMPRESSION int __attribute__((noinline))
+A (int x)
+{
+  return B (x) + 2;
+}
+
+NOCOMPRESSION int
+main (void)
+{
+  int a;
+  void *volatile sp1, *volatile sp2;
+  register void *sp asm ("$sp");
+  sp1 = sp;
+  a = A (5);
+  sp2 = sp;
+  return !(a == 10 && sp1 == sp2);
+}
+
+/* { dg-final { scan-rtl-dump "Frame size reduced by frame header optimization" "mach" } } */
+
+/* For enabled targets, test that only one stack allocation is present, the one
+   in main.  The one in A should have been removed by -mframe-header-opt.  */
+/* { dg-final { scan-assembler-times "addiu\t\\\$sp,\\\$sp,-" 1 } } */
diff --git a/gcc/testsuite/gcc.target/mips/fho-2.c b/gcc/testsuite/gcc.target/mips/fho-2.c
new file mode 100644
index 0000000..d3599b8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/fho-2.c
@@ -0,0 +1,29 @@ 
+/* { dg-do run } */
+/* { dg-options "-mabi=32 -mframe-header-opt" } */
+/* Testing -mframe-header-opt optimization option.  */
+
+NOCOMPRESSION int __attribute__((noinline))
+B (int x)
+{
+  return x + 3;
+}
+
+/* We are sure that B is not using its incoming stack frame so we can skip
+   its allocation.  */
+NOCOMPRESSION int __attribute__((noinline))
+A (int x)
+{
+  return B (x) + 2;
+}
+
+NOCOMPRESSION int
+main (void)
+{
+  int a;
+  void *volatile sp1, *volatile sp2;
+  register void *sp asm ("$sp");
+  sp1 = sp;
+  a = A (5);
+  sp2 = sp;
+  return !(a == 10 && sp1 == sp2);
+}
diff --git a/gcc/testsuite/gcc.target/mips/mips.exp b/gcc/testsuite/gcc.target/mips/mips.exp
index b3617e4..6e6450e 100644
--- a/gcc/testsuite/gcc.target/mips/mips.exp
+++ b/gcc/testsuite/gcc.target/mips/mips.exp
@@ -237,6 +237,7 @@  set mips_option_groups {
     fpu "-m(double|single)-float"
     forbid_cpu "forbid_cpu=.*"
     fp "-mfp(32|xx|64)"
+    frame_header_opt "-mframe-header-opt|-mno-frame-header-opt"
     gp "-mgp(32|64)"
     long "-mlong(32|64)"
     micromips "-mmicromips|-mno-micromips"