Updated patch for PR84101

Message ID alpine.LSU.2.20.1903281520570.27537@zhemvz.fhfr.qr
State New
Series Updated patch for PR84101

Commit Message

Richard Biener March 28, 2019, 2:23 p.m. UTC
This is an update from last year's attempt to tame down vectorization
when it runs into ABI inefficiencies at function return.  I've
updated the patch to look for multi-reg returns instead of
!vector ones (due to word_mode vectorization) and handle a bit
more returns, esp. the common pattern of

  ret = vector;
  D.1234 = ret;
  ret = {CLOBBER};
  return D.1234;

Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?

I know this isn't the ultimate solution but we keep getting
duplicates of the PR.

Richard.

2019-03-28  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/84101
	* tree-vect-stmts.c: Include explow.h for hard_function_value,
	regs.h for hard_regno_nregs.
	(cfun_returns): New helper.
	(vect_model_store_cost): When vectorizing a store to a decl
	we return and the function ABI returns in a multi-reg location
	account for the possible spilling that will happen.

	* gcc.target/i386/pr84101.c: New testcase.

Comments

Richard Sandiford April 1, 2019, 4:42 p.m. UTC | #1
Richard Biener <rguenther@suse.de> writes:
> This is an update from last years attempt to tame down vectorization
> when it runs in to ABI inefficiencies at function return.  I've
> updated the patch to look for multi-reg returns instead of
> !vector ones (due to word_mode vectorization) and handle a bit
> more returns, esp. the common pattern of
>
>   ret = vector;
>   D.1234 = ret;
>   ret = {CLOBBER};
>   return D.1234;
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?
>
> I know this isn't the ultimate solution but we keep getting
> duplicates of the PR.
>
> Richard.
>
> 2019-03-28  Richard Biener  <rguenther@suse.de>
>
> 	PR tree-optimization/84101
> 	* tree-vect-stmts.c: Include explow.h for hard_function_value,
> 	regs.h for hard_regno_nregs.
> 	(cfun_returns): New helper.
> 	(vect_model_store_cost): When vectorizing a store to a decl
> 	we return and the function ABI returns in a multi-reg location
> 	account for the possible spilling that will happen.
>
> 	* gcc.target/i386/pr84101.c: New testcase.
>
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c	(revision 269987)
> +++ gcc/tree-vect-stmts.c	(working copy)
> @@ -43,6 +43,7 @@ along with GCC; see the file COPYING3.
>  #include "tree-cfg.h"
>  #include "tree-ssa-loop-manip.h"
>  #include "cfgloop.h"
> +#include "explow.h"
>  #include "tree-ssa-loop.h"
>  #include "tree-scalar-evolution.h"
>  #include "tree-vectorizer.h"
> @@ -52,6 +53,7 @@ along with GCC; see the file COPYING3.
>  #include "vec-perm-indices.h"
>  #include "tree-ssa-loop-niter.h"
>  #include "gimple-fold.h"
> +#include "regs.h"
>  
>  /* For lang_hooks.types.type_for_mode.  */
>  #include "langhooks.h"
> @@ -948,6 +950,37 @@ vect_model_promotion_demotion_cost (stmt
>                       "prologue_cost = %d .\n", inside_cost, prologue_cost);
>  }
>  
> +/* Returns true if the current function returns DECL.  */
> +
> +static bool
> +cfun_returns (tree decl)
> +{
> +  edge_iterator ei;
> +  edge e;
> +  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
> +    {
> +      greturn *ret = safe_dyn_cast <greturn *> (last_stmt (e->src));
> +      if (!ret)
> +	continue;
> +      if (gimple_return_retval (ret) == decl)
> +	return true;
> +      /* We often end up with an aggregate copy to the result decl,
> +         handle that case as well.  First skip intermediate clobbers
> +	 though.  */
> +      gimple *def = ret;
> +      do
> +	{
> +	  def = SSA_NAME_DEF_STMT (gimple_vuse (def));
> +	}
> +      while (gimple_clobber_p (def));
> +      if (is_a <gassign *> (def)
> +	  && gimple_assign_lhs (def) == gimple_return_retval (ret)
> +	  && gimple_assign_rhs1 (def) == decl)
> +	return true;
> +    }
> +  return false;
> +}
> +
>  /* Function vect_model_store_cost
>  
>     Models cost for stores.  In the case of grouped accesses, one access
> @@ -1032,6 +1065,37 @@ vect_model_store_cost (stmt_vec_info stm
>  				       vec_to_scalar, stmt_info, 0, vect_body);
>      }
>  
> +  /* When vectorizing a store into the function result assign
> +     a penalty if the function returns in a multi-register location.
> +     In this case we assume we'll end up with having to spill the
> +     vector result and do piecewise loads.  */
> +  tree base = get_base_address (STMT_VINFO_DATA_REF (stmt_info)->ref);
> +  if (base
> +      && (TREE_CODE (base) == RESULT_DECL
> +	  || (DECL_P (base) && cfun_returns (base)))
> +      && !aggregate_value_p (base, cfun->decl))
> +    {
> +      rtx reg = hard_function_value (TREE_TYPE (base), cfun->decl, 0, 1);
> +      /* ???  Handle PARALLEL in some way.  */
> +      if (REG_P (reg))
> +	{
> +	  int nregs = hard_regno_nregs (REGNO (reg), GET_MODE (reg));
> +	  /* Assume that a reg-reg move is possible and cheap,
> +	     do not account for vector to gp register move cost.  */
> +	  if (nregs > 1)

Looks OK to me FWIW, but maybe it would be worth having a check like:

    targetm.secondary_memory_needed (TYPE_MODE (vectype), ALL_REGS,
				     REGNO_REG_CLASS (REGNO (reg)))

as well as the above, so that we don't accidentally penalise values
that are returned in multiple vector registers.  Looks like the i386.c
definition will return true in the cases that matter.

Thanks,
Richard
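
As a rough sketch (not part of the posted patch), the suggested check
might be combined with the patch's existing multi-register test roughly
like this, assuming vectype is taken from STMT_VINFO_VECTYPE (stmt_info):

      /* Hypothetical sketch only: additionally gate the penalty on the
	 target needing to go through memory for the move from the vector
	 register to the return registers.  */
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      int nregs = hard_regno_nregs (REGNO (reg), GET_MODE (reg));
      if (nregs > 1
	  && targetm.secondary_memory_needed (TYPE_MODE (vectype), ALL_REGS,
					      REGNO_REG_CLASS (REGNO (reg))))
	{
	  /* ... account for the spill and piecewise loads as in the patch.  */
	}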

> +	    {
> +	      /* Spill.  */
> +	      prologue_cost += record_stmt_cost (cost_vec, ncopies,
> +						 vector_store,
> +						 stmt_info, 0, vect_epilogue);
> +	      /* Loads.  */
> +	      prologue_cost += record_stmt_cost (cost_vec, ncopies * nregs,
> +						 scalar_load,
> +						 stmt_info, 0, vect_epilogue);
> +	    }
> +	}
> +    }
> +
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location,
>                       "vect_model_store_cost: inside_cost = %d, "
> Index: gcc/testsuite/gcc.target/i386/pr84101.c
> ===================================================================
> --- gcc/testsuite/gcc.target/i386/pr84101.c	(nonexistent)
> +++ gcc/testsuite/gcc.target/i386/pr84101.c	(working copy)
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-slp2-details" } */
> +
> +typedef struct uint64_pair uint64_pair_t ;
> +struct uint64_pair
> +{
> +  unsigned long w0 ;
> +  unsigned long w1 ;
> +} ;
> +
> +uint64_pair_t pair(int num)
> +{
> +  uint64_pair_t p ;
> +
> +  p.w0 = num << 1 ;
> +  p.w1 = num >> 1 ;
> +
> +  return p ;
> +}
> +
> +/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp2" } } */
Richard Biener April 3, 2019, 8:41 a.m. UTC | #2
On Mon, 1 Apr 2019, Richard Sandiford wrote:

> Richard Biener <rguenther@suse.de> writes:
> > This is an update from last years attempt to tame down vectorization
> > when it runs in to ABI inefficiencies at function return.  I've
> > updated the patch to look for multi-reg returns instead of
> > !vector ones (due to word_mode vectorization) and handle a bit
> > more returns, esp. the common pattern of
> >
> >   ret = vector;
> >   D.1234 = ret;
> >   ret = {CLOBBER};
> >   return D.1234;
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?
> >
> > I know this isn't the ultimate solution but we keep getting
> > duplicates of the PR.
> >
> > Richard.
> >
> > 2019-03-28  Richard Biener  <rguenther@suse.de>
> >
> > 	PR tree-optimization/84101
> > 	* tree-vect-stmts.c: Include explow.h for hard_function_value,
> > 	regs.h for hard_regno_nregs.
> > 	(cfun_returns): New helper.
> > 	(vect_model_store_cost): When vectorizing a store to a decl
> > 	we return and the function ABI returns in a multi-reg location
> > 	account for the possible spilling that will happen.
> >
> > 	* gcc.target/i386/pr84101.c: New testcase.
> >
> > Index: gcc/tree-vect-stmts.c
> > ===================================================================
> > --- gcc/tree-vect-stmts.c	(revision 269987)
> > +++ gcc/tree-vect-stmts.c	(working copy)
> > @@ -43,6 +43,7 @@ along with GCC; see the file COPYING3.
> >  #include "tree-cfg.h"
> >  #include "tree-ssa-loop-manip.h"
> >  #include "cfgloop.h"
> > +#include "explow.h"
> >  #include "tree-ssa-loop.h"
> >  #include "tree-scalar-evolution.h"
> >  #include "tree-vectorizer.h"
> > @@ -52,6 +53,7 @@ along with GCC; see the file COPYING3.
> >  #include "vec-perm-indices.h"
> >  #include "tree-ssa-loop-niter.h"
> >  #include "gimple-fold.h"
> > +#include "regs.h"
> >  
> >  /* For lang_hooks.types.type_for_mode.  */
> >  #include "langhooks.h"
> > @@ -948,6 +950,37 @@ vect_model_promotion_demotion_cost (stmt
> >                       "prologue_cost = %d .\n", inside_cost, prologue_cost);
> >  }
> >  
> > +/* Returns true if the current function returns DECL.  */
> > +
> > +static bool
> > +cfun_returns (tree decl)
> > +{
> > +  edge_iterator ei;
> > +  edge e;
> > +  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
> > +    {
> > +      greturn *ret = safe_dyn_cast <greturn *> (last_stmt (e->src));
> > +      if (!ret)
> > +	continue;
> > +      if (gimple_return_retval (ret) == decl)
> > +	return true;
> > +      /* We often end up with an aggregate copy to the result decl,
> > +         handle that case as well.  First skip intermediate clobbers
> > +	 though.  */
> > +      gimple *def = ret;
> > +      do
> > +	{
> > +	  def = SSA_NAME_DEF_STMT (gimple_vuse (def));
> > +	}
> > +      while (gimple_clobber_p (def));
> > +      if (is_a <gassign *> (def)
> > +	  && gimple_assign_lhs (def) == gimple_return_retval (ret)
> > +	  && gimple_assign_rhs1 (def) == decl)
> > +	return true;
> > +    }
> > +  return false;
> > +}
> > +
> >  /* Function vect_model_store_cost
> >  
> >     Models cost for stores.  In the case of grouped accesses, one access
> > @@ -1032,6 +1065,37 @@ vect_model_store_cost (stmt_vec_info stm
> >  				       vec_to_scalar, stmt_info, 0, vect_body);
> >      }
> >  
> > +  /* When vectorizing a store into the function result assign
> > +     a penalty if the function returns in a multi-register location.
> > +     In this case we assume we'll end up with having to spill the
> > +     vector result and do piecewise loads.  */
> > +  tree base = get_base_address (STMT_VINFO_DATA_REF (stmt_info)->ref);
> > +  if (base
> > +      && (TREE_CODE (base) == RESULT_DECL
> > +	  || (DECL_P (base) && cfun_returns (base)))
> > +      && !aggregate_value_p (base, cfun->decl))
> > +    {
> > +      rtx reg = hard_function_value (TREE_TYPE (base), cfun->decl, 0, 1);
> > +      /* ???  Handle PARALLEL in some way.  */
> > +      if (REG_P (reg))
> > +	{
> > +	  int nregs = hard_regno_nregs (REGNO (reg), GET_MODE (reg));
> > +	  /* Assume that a reg-reg move is possible and cheap,
> > +	     do not account for vector to gp register move cost.  */
> > +	  if (nregs > 1)
> 
> Looks OK to me FWIW, but maybe it would be worth having a check like:
> 
>     targetm.secondary_memory_needed (TYPE_MODE (vectype), ALL_REGS,
> 				     REGNO_REG_CLASS (REGNO (reg)))
> 
> as well as the above, so that we don't accidentally penalise values
> that are returned in multiple vector registers.  Looks like the i386.c
> definition will return true in the cases that matter.

I wonder if what this hook returns makes sense if you feed it modes
with different sizes?  Say for the testcase V2DImode and
the general regs class (DImode).  No "direct" moves exist because the
result doesn't fit anyway, so I wonder about the semantics of
secondary_memory_needed here.  Wouldn't it be more appropriate to
specify the correct register class instead of ALL_REGS here and
choose GET_MODE (reg) for the mode instead?  Because in the end
we'll do N GET_MODE (reg) moves from the vector register class to
the reg regclass?  OTOH I have no idea how to get at the register
class of 'vectype' ...?

But yes, something like that might be an improvement.  Still
x86 _can_ do direct moves between xmm and general regs
(some tunings prefer to go through memory) but the issue is
really that the RTL expander will emit code that later the
register allocator is not able to mangle into a form w/o
spills.  The issue there is really the upper half extracts
of the vector register.  So I'm not so sure that
targetm.secondary_memory_needed is the correct tool to use
for this heuristic - it is after all an RTL "optimization"
issue we are working around.  And we'd even like to penalize
the non-memory case because going from

  pair:
   lea    (%rdi,%rdi,1),%eax
   sar    %edi
   movslq %edi,%rdi
   cltq   
   mov    %rax,-0x18(%rsp)
   movq   -0x18(%rsp),%xmm0
   mov    %rdi,-0x18(%rsp)
   movhps -0x18(%rsp),%xmm0
   movaps %xmm0,-0x18(%rsp)
   mov    -0x18(%rsp),%rax
   mov    -0x10(%rsp),%rdx
   retq

to

  pair:
   lea    (%rdi,%rdi,1),%eax
   sar    %edi
   movslq %edi,%rdi
   cltq   
   mov    %rax,-0x18(%rsp)
   movq   -0x18(%rsp),%xmm0
   mov    %rdi,-0x18(%rsp)
   movhps -0x18(%rsp),%xmm0
   movlps %xmm0, %rdx
   movhps %xmm0, %rax
   retq

is still worse than not vectorizing that copy (just in
case targetm.secondary_memory_needed says false which
it might eventually do because we strictly do not need
secondary memory here).

Quoting the x86 implementation of the hook it seems we're
"saved" by the mode size check only:

  if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
    {
      /* SSE1 doesn't have any direct moves from other classes.  */
      if (!TARGET_SSE2)
        return true;

      /* If the target says that inter-unit moves are more expensive
         than moving through memory, then don't generate them.  */
      if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
          || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
        return true;

>       /* Between SSE and general, we have moves no larger than word size.  */
      if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
        return true;

Your case where a target returns in multiple vector registers is one
not well handled by the patch in general - there isn't any attempt
at looking at the shape of the RESULT_DECL, we just see if a
vector store goes somewhere into it.  And we do this when processing
each store.  As said this is mostly a hack that covers the cases
the issue was reported for and shouldn't penaltize most other
cases.  Yes, it might pessimize a target that returns large
aggregates in multiple vector registers, but is there any?

So, may I go with the original patch?

Thanks,
Richard.

> Thanks,
> Richard
> 
> > +	    {
> > +	      /* Spill.  */
> > +	      prologue_cost += record_stmt_cost (cost_vec, ncopies,
> > +						 vector_store,
> > +						 stmt_info, 0, vect_epilogue);
> > +	      /* Loads.  */
> > +	      prologue_cost += record_stmt_cost (cost_vec, ncopies * nregs,
> > +						 scalar_load,
> > +						 stmt_info, 0, vect_epilogue);
> > +	    }
> > +	}
> > +    }
> > +
> >    if (dump_enabled_p ())
> >      dump_printf_loc (MSG_NOTE, vect_location,
> >                       "vect_model_store_cost: inside_cost = %d, "
> > Index: gcc/testsuite/gcc.target/i386/pr84101.c
> > ===================================================================
> > --- gcc/testsuite/gcc.target/i386/pr84101.c	(nonexistent)
> > +++ gcc/testsuite/gcc.target/i386/pr84101.c	(working copy)
> > @@ -0,0 +1,21 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -fdump-tree-slp2-details" } */
> > +
> > +typedef struct uint64_pair uint64_pair_t ;
> > +struct uint64_pair
> > +{
> > +  unsigned long w0 ;
> > +  unsigned long w1 ;
> > +} ;
> > +
> > +uint64_pair_t pair(int num)
> > +{
> > +  uint64_pair_t p ;
> > +
> > +  p.w0 = num << 1 ;
> > +  p.w1 = num >> 1 ;
> > +
> > +  return p ;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp2" } } */
>
Richard Sandiford April 3, 2019, 10:45 a.m. UTC | #3
Richard Biener <rguenther@suse.de> writes:
> On Mon, 1 Apr 2019, Richard Sandiford wrote:
>
>> Richard Biener <rguenther@suse.de> writes:
>> > This is an update from last years attempt to tame down vectorization
>> > when it runs in to ABI inefficiencies at function return.  I've
>> > updated the patch to look for multi-reg returns instead of
>> > !vector ones (due to word_mode vectorization) and handle a bit
>> > more returns, esp. the common pattern of
>> >
>> >   ret = vector;
>> >   D.1234 = ret;
>> >   ret = {CLOBBER};
>> >   return D.1234;
>> >
>> > Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?
>> >
>> > I know this isn't the ultimate solution but we keep getting
>> > duplicates of the PR.
>> >
>> > Richard.
>> >
>> > 2019-03-28  Richard Biener  <rguenther@suse.de>
>> >
>> > 	PR tree-optimization/84101
>> > 	* tree-vect-stmts.c: Include explow.h for hard_function_value,
>> > 	regs.h for hard_regno_nregs.
>> > 	(cfun_returns): New helper.
>> > 	(vect_model_store_cost): When vectorizing a store to a decl
>> > 	we return and the function ABI returns in a multi-reg location
>> > 	account for the possible spilling that will happen.
>> >
>> > 	* gcc.target/i386/pr84101.c: New testcase.
>> >
>> > Index: gcc/tree-vect-stmts.c
>> > ===================================================================
>> > --- gcc/tree-vect-stmts.c	(revision 269987)
>> > +++ gcc/tree-vect-stmts.c	(working copy)
>> > @@ -43,6 +43,7 @@ along with GCC; see the file COPYING3.
>> >  #include "tree-cfg.h"
>> >  #include "tree-ssa-loop-manip.h"
>> >  #include "cfgloop.h"
>> > +#include "explow.h"
>> >  #include "tree-ssa-loop.h"
>> >  #include "tree-scalar-evolution.h"
>> >  #include "tree-vectorizer.h"
>> > @@ -52,6 +53,7 @@ along with GCC; see the file COPYING3.
>> >  #include "vec-perm-indices.h"
>> >  #include "tree-ssa-loop-niter.h"
>> >  #include "gimple-fold.h"
>> > +#include "regs.h"
>> >  
>> >  /* For lang_hooks.types.type_for_mode.  */
>> >  #include "langhooks.h"
>> > @@ -948,6 +950,37 @@ vect_model_promotion_demotion_cost (stmt
>> >                       "prologue_cost = %d .\n", inside_cost, prologue_cost);
>> >  }
>> >  
>> > +/* Returns true if the current function returns DECL.  */
>> > +
>> > +static bool
>> > +cfun_returns (tree decl)
>> > +{
>> > +  edge_iterator ei;
>> > +  edge e;
>> > +  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
>> > +    {
>> > +      greturn *ret = safe_dyn_cast <greturn *> (last_stmt (e->src));
>> > +      if (!ret)
>> > +	continue;
>> > +      if (gimple_return_retval (ret) == decl)
>> > +	return true;
>> > +      /* We often end up with an aggregate copy to the result decl,
>> > +         handle that case as well.  First skip intermediate clobbers
>> > +	 though.  */
>> > +      gimple *def = ret;
>> > +      do
>> > +	{
>> > +	  def = SSA_NAME_DEF_STMT (gimple_vuse (def));
>> > +	}
>> > +      while (gimple_clobber_p (def));
>> > +      if (is_a <gassign *> (def)
>> > +	  && gimple_assign_lhs (def) == gimple_return_retval (ret)
>> > +	  && gimple_assign_rhs1 (def) == decl)
>> > +	return true;
>> > +    }
>> > +  return false;
>> > +}
>> > +
>> >  /* Function vect_model_store_cost
>> >  
>> >     Models cost for stores.  In the case of grouped accesses, one access
>> > @@ -1032,6 +1065,37 @@ vect_model_store_cost (stmt_vec_info stm
>> >  				       vec_to_scalar, stmt_info, 0, vect_body);
>> >      }
>> >  
>> > +  /* When vectorizing a store into the function result assign
>> > +     a penalty if the function returns in a multi-register location.
>> > +     In this case we assume we'll end up with having to spill the
>> > +     vector result and do piecewise loads.  */
>> > +  tree base = get_base_address (STMT_VINFO_DATA_REF (stmt_info)->ref);
>> > +  if (base
>> > +      && (TREE_CODE (base) == RESULT_DECL
>> > +	  || (DECL_P (base) && cfun_returns (base)))
>> > +      && !aggregate_value_p (base, cfun->decl))
>> > +    {
>> > +      rtx reg = hard_function_value (TREE_TYPE (base), cfun->decl, 0, 1);
>> > +      /* ???  Handle PARALLEL in some way.  */
>> > +      if (REG_P (reg))
>> > +	{
>> > +	  int nregs = hard_regno_nregs (REGNO (reg), GET_MODE (reg));
>> > +	  /* Assume that a reg-reg move is possible and cheap,
>> > +	     do not account for vector to gp register move cost.  */
>> > +	  if (nregs > 1)
>> 
>> Looks OK to me FWIW, but maybe it would be worth having a check like:
>> 
>>     targetm.secondary_memory_needed (TYPE_MODE (vectype), ALL_REGS,
>> 				     REGNO_REG_CLASS (REGNO (reg)))
>> 
>> as well as the above, so that we don't accidentally penalise values
>> that are returned in multiple vector registers.  Looks like the i386.c
>> definition will return true in the cases that matter.
>
> I wonder if what this hook returns makes sense if you feed it modes
> with different sizes?  Say for the testcase V2DImode and
> the general regs class (DImode).

In this case I think it's between V2DImode and TImode.  I went with
the vector mode because ABIs are the one place where BLKmode registers
are a thing.  Guess that's not a problem here though, because nregs > 1
would be false for BLKmode.  So yeah, GET_MODE (reg) should be fine too.

> No "direct" moves exist because the
> result doesn't fit anyway so I wonder about the semantic of
> secondary_memory_needed here.  Wouldn't it be more appropriate to
> specify the correct register class instead of ALL_REGS here and
> choose GET_MODE (reg) for the mode instead?  Because in the end
> we'll do N GET_MODE (reg) moves from the vector register class to
> the reg regclass?  OTOH I have no idea how to get at the register
> class of 'vectype' ...?

I think we only end up asking this question if the target lets both
register classes hold as many bytes as are in the mode.  It might need
multiple registers, but that's OK.

And yeah, it would be nice to be more precise than ALL_REGS, but I don't
think there's a reliable way of getting the "preferred" register class
for a mode in this context.

The hook is supposed to be conservatively correct for ALL_REGS,
so the above should err on the side of including the cost.

> But yes, something like that might be an improvement.  Still
> x86 _can_ do direct moves between xmm and general regs
> (some tunings prefer to go through memory) but the issue is
> really that the RTL expander will emit code that later the
> register allocator is not able to mangle into a form w/o
> spills.  The issue there is really the upper half extracts
> of the vector register.  So I'm not so sure that
> targetm.secondary_memory_needed is the correct tool to use
> for this heuristic - it is after all an RTL "optimization"
> issue we are working around.

But the moves do seem to be "proper" 128-bit moves:

(insn 11 10 12 2 (set (reg:TI 87 [ D.1914 ])
        (subreg:TI (reg:V2DI 90) 0)) "/tmp/f2.c":18:10 -1
     (nil))
(insn 12 11 16 2 (set (reg:TI 88 [ <retval> ])
        (reg:TI 87 [ D.1914 ])) "/tmp/f2.c":18:10 -1
     (nil))
(insn 16 12 17 2 (set (reg/i:TI 0 ax)
        (reg:TI 88 [ <retval> ])) "/tmp/f2.c":19:1 -1
     (nil))

which eventually becomes:

(insn 16 10 17 2 (set (reg/i:TI 0 ax)
        (subreg:TI (reg:V2DI 90) 0)) "/tmp/f2.c":19:1 65 {*movti_internal}
     (expr_list:REG_DEAD (reg:V2DI 90)
        (nil)))

So I don't think it's an RTL optimisation problem as such.  Some targets
(like AArch64) can do the equivalent of this without spills in either
direction.  Similarly, MIPS32 can use mthc1/mtc1 and mfhc1/mfc1 for
moving 64-bit FPRs to and from 32-bit GPRs.

So I think it really is up to the target to decide whether it can/wants
to do this without spills or not.  It seems wrong to cost in a spill
that should never happen. :-)

If there is a case that needs spills, secondary_memory_needed has to
return true for that case.  So I think it's the right hook to test.

> And we'd even like to penalize the non-memory case because going from
>
>   pair:
>    lea    (%rdi,%rdi,1),%eax
>    sar    %edi
>    movslq %edi,%rdi
>    cltq   
>    mov    %rax,-0x18(%rsp)
>    movq   -0x18(%rsp),%xmm0
>    mov    %rdi,-0x18(%rsp)
>    movhps -0x18(%rsp),%xmm0
>    movaps %xmm0,-0x18(%rsp)
>    mov    -0x18(%rsp),%rax
>    mov    -0x10(%rsp),%rdx
>    retq
>
> to
>
>   pair:
>    lea    (%rdi,%rdi,1),%eax
>    sar    %edi
>    movslq %edi,%rdi
>    cltq   
>    mov    %rax,-0x18(%rsp)
>    movq   -0x18(%rsp),%xmm0
>    mov    %rdi,-0x18(%rsp)
>    movhps -0x18(%rsp),%xmm0
>    movlps %xmm0, %rdx
>    movhps %xmm0, %rax
>    retq
>
> is still worse than not vectorizing that copy (just in
> case targetm.secondary_memory_needed says false which
> it might eventually do because we strictly do not need
> secondary memory here).

But isn't the vector construction the expensive part of that?
I assume we wouldn't want it even if %xmm0 was stored directly
to memory rather than returned in GPRs.

> Quoting the x86 implementation of the hook it seems we're
> "saved" by the mode size check only:
>
>   if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
>     {
>       /* SSE1 doesn't have any direct moves from other classes.  */
>       if (!TARGET_SSE2)
>         return true;
>
>       /* If the target says that inter-unit moves are more expensive
>          than moving through memory, then don't generate them.  */
>       if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
>           || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
>         return true;
>
> >       /* Between SSE and general, we have moves no larger than word size.  */
>       if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
>         return true;

Yeah.

> Your case where a target returns in multiple vector registers is one
> not well handled by the patch in general - there isn't any attempt
> at looking at the shape of the RESULT_DECL, we just see if a
> vector store goes somewhere into it.  And we do this when processing
> each store.  As said this is mostly a hack that covers the cases
> the issue was reported for and shouldn't penaltize most other
> cases.  Yes, it might pessimize a target that returns large
> aggregates in multiple vector registers, but is there any?

SVE does this in some cases, but I guess none of them are relevant here.

> So, may I go with the original patch?

Still feels like we're counting spills on targets that shouldn't need them.
But going back to:

	  /* Assume that a reg-reg move is possible and cheap,
	     do not account for vector to gp register move cost.  */

I guess we could gloss over the "unnecessary" spill cost by saying that,
even if the spill isn't needed, this is a conservative estimate of the
vector to gp register move cost?

Thanks,
Richard
Richard Biener April 3, 2019, 10:58 a.m. UTC | #4
On Wed, 3 Apr 2019, Richard Sandiford wrote:

> Richard Biener <rguenther@suse.de> writes:
> > On Mon, 1 Apr 2019, Richard Sandiford wrote:
> >
> >> Richard Biener <rguenther@suse.de> writes:
> >> > This is an update from last years attempt to tame down vectorization
> >> > when it runs in to ABI inefficiencies at function return.  I've
> >> > updated the patch to look for multi-reg returns instead of
> >> > !vector ones (due to word_mode vectorization) and handle a bit
> >> > more returns, esp. the common pattern of
> >> >
> >> >   ret = vector;
> >> >   D.1234 = ret;
> >> >   ret = {CLOBBER};
> >> >   return D.1234;
> >> >
> >> > Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?
> >> >
> >> > I know this isn't the ultimate solution but we keep getting
> >> > duplicates of the PR.
> >> >
> >> > Richard.
> >> >
> >> > 2019-03-28  Richard Biener  <rguenther@suse.de>
> >> >
> >> > 	PR tree-optimization/84101
> >> > 	* tree-vect-stmts.c: Include explow.h for hard_function_value,
> >> > 	regs.h for hard_regno_nregs.
> >> > 	(cfun_returns): New helper.
> >> > 	(vect_model_store_cost): When vectorizing a store to a decl
> >> > 	we return and the function ABI returns in a multi-reg location
> >> > 	account for the possible spilling that will happen.
> >> >
> >> > 	* gcc.target/i386/pr84101.c: New testcase.
> >> >
> >> > Index: gcc/tree-vect-stmts.c
> >> > ===================================================================
> >> > --- gcc/tree-vect-stmts.c	(revision 269987)
> >> > +++ gcc/tree-vect-stmts.c	(working copy)
> >> > @@ -43,6 +43,7 @@ along with GCC; see the file COPYING3.
> >> >  #include "tree-cfg.h"
> >> >  #include "tree-ssa-loop-manip.h"
> >> >  #include "cfgloop.h"
> >> > +#include "explow.h"
> >> >  #include "tree-ssa-loop.h"
> >> >  #include "tree-scalar-evolution.h"
> >> >  #include "tree-vectorizer.h"
> >> > @@ -52,6 +53,7 @@ along with GCC; see the file COPYING3.
> >> >  #include "vec-perm-indices.h"
> >> >  #include "tree-ssa-loop-niter.h"
> >> >  #include "gimple-fold.h"
> >> > +#include "regs.h"
> >> >  
> >> >  /* For lang_hooks.types.type_for_mode.  */
> >> >  #include "langhooks.h"
> >> > @@ -948,6 +950,37 @@ vect_model_promotion_demotion_cost (stmt
> >> >                       "prologue_cost = %d .\n", inside_cost, prologue_cost);
> >> >  }
> >> >  
> >> > +/* Returns true if the current function returns DECL.  */
> >> > +
> >> > +static bool
> >> > +cfun_returns (tree decl)
> >> > +{
> >> > +  edge_iterator ei;
> >> > +  edge e;
> >> > +  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
> >> > +    {
> >> > +      greturn *ret = safe_dyn_cast <greturn *> (last_stmt (e->src));
> >> > +      if (!ret)
> >> > +	continue;
> >> > +      if (gimple_return_retval (ret) == decl)
> >> > +	return true;
> >> > +      /* We often end up with an aggregate copy to the result decl,
> >> > +         handle that case as well.  First skip intermediate clobbers
> >> > +	 though.  */
> >> > +      gimple *def = ret;
> >> > +      do
> >> > +	{
> >> > +	  def = SSA_NAME_DEF_STMT (gimple_vuse (def));
> >> > +	}
> >> > +      while (gimple_clobber_p (def));
> >> > +      if (is_a <gassign *> (def)
> >> > +	  && gimple_assign_lhs (def) == gimple_return_retval (ret)
> >> > +	  && gimple_assign_rhs1 (def) == decl)
> >> > +	return true;
> >> > +    }
> >> > +  return false;
> >> > +}
> >> > +
> >> >  /* Function vect_model_store_cost
> >> >  
> >> >     Models cost for stores.  In the case of grouped accesses, one access
> >> > @@ -1032,6 +1065,37 @@ vect_model_store_cost (stmt_vec_info stm
> >> >  				       vec_to_scalar, stmt_info, 0, vect_body);
> >> >      }
> >> >  
> >> > +  /* When vectorizing a store into the function result assign
> >> > +     a penalty if the function returns in a multi-register location.
> >> > +     In this case we assume we'll end up with having to spill the
> >> > +     vector result and do piecewise loads.  */
> >> > +  tree base = get_base_address (STMT_VINFO_DATA_REF (stmt_info)->ref);
> >> > +  if (base
> >> > +      && (TREE_CODE (base) == RESULT_DECL
> >> > +	  || (DECL_P (base) && cfun_returns (base)))
> >> > +      && !aggregate_value_p (base, cfun->decl))
> >> > +    {
> >> > +      rtx reg = hard_function_value (TREE_TYPE (base), cfun->decl, 0, 1);
> >> > +      /* ???  Handle PARALLEL in some way.  */
> >> > +      if (REG_P (reg))
> >> > +	{
> >> > +	  int nregs = hard_regno_nregs (REGNO (reg), GET_MODE (reg));
> >> > +	  /* Assume that a reg-reg move is possible and cheap,
> >> > +	     do not account for vector to gp register move cost.  */
> >> > +	  if (nregs > 1)
> >> 
> >> Looks OK to me FWIW, but maybe it would be worth having a check like:
> >> 
> >>     targetm.secondary_memory_needed (TYPE_MODE (vectype), ALL_REGS,
> >> 				     REGNO_REG_CLASS (REGNO (reg)))
> >> 
> >> as well as the above, so that we don't accidentally penalise values
> >> that are returned in multiple vector registers.  Looks like the i386.c
> >> definition will return true in the cases that matter.
> >
> > I wonder if what this hook returns makes sense if you feed it modes
> > with different sizes?  Say for the testcase V2DImode and
> > the general regs class (DImode).
> 
> In this case I think it's between V2DImode and TImode.  I went with
> the vector mode because ABIs are the one place where BLKmode registers
> are a thing.  Guess that's not a problem here though, because nregs > 1
> would be false for BLKmode.  So yeah, GET_MODE (reg) should be fine too.

Ah, OK.  Though then the target would see TImode, ALL_REGS, general-regs
which isn't what we want to ask...  Leaves us with V2DImode, ALL_REGS,
general-regs then.
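
For the pr84101.c testcase on x86_64 that concrete query would then look
roughly like the following (illustrative only, not code from the patch):

      /* Does moving the V2DImode return value into general registers
	 need to go through memory?  */
      targetm.secondary_memory_needed (V2DImode, ALL_REGS, GENERAL_REGS);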

> > No "direct" moves exist because the
> > result doesn't fit anyway so I wonder about the semantic of
> > secondary_memory_needed here.  Wouldn't it be more appropriate to
> > specify the correct register class instead of ALL_REGS here and
> > choose GET_MODE (reg) for the mode instead?  Because in the end
> > we'll do N GET_MODE (reg) moves from the vector register class to
> > the reg regclass?  OTOH I have no idea how to get at the register
> > class of 'vectype' ...?
> 
> I think we only end up asking this question if the target lets both
> register classes hold as many bytes as are in the mode.  It might need
> multiple registers, but that's OK.
> 
> And yeah, it would be nice to be more precise than ALL_REGS, but I don't
> think there's a reliable way of getting the "preferred" register class
> for a mode in this context.
> 
> The hook is supposed to be conservatively correct for ALL_REGS,
> so the above should err on the side of including the cost.

OK, I see.

> > But yes, something like that might be an improvement.  Still
> > x86 _can_ do direct moves between xmm and general regs
> > (some tunings prefer to go through memory) but the issue is
> > really that the RTL expander will emit code that later the
> > register allocator is not able to mangle into a form w/o
> > spills.  The issue there is really the upper half extracts
> > of the vector register.  So I'm not so sure that
> > targetm.secondary_memory_needed is the correct tool to use
> > for this heuristic - it is after all an RTL "optimization"
> > issue we are working around.
> 
> But the moves do seem to be "proper" 128-bit moves:
> 
> (insn 11 10 12 2 (set (reg:TI 87 [ D.1914 ])
>         (subreg:TI (reg:V2DI 90) 0)) "/tmp/f2.c":18:10 -1
>      (nil))
> (insn 12 11 16 2 (set (reg:TI 88 [ <retval> ])
>         (reg:TI 87 [ D.1914 ])) "/tmp/f2.c":18:10 -1
>      (nil))
> (insn 16 12 17 2 (set (reg/i:TI 0 ax)
>         (reg:TI 88 [ <retval> ])) "/tmp/f2.c":19:1 -1
>      (nil))
> 
> which eventually becomes:
> 
> (insn 16 10 17 2 (set (reg/i:TI 0 ax)
>         (subreg:TI (reg:V2DI 90) 0)) "/tmp/f2.c":19:1 65 {*movti_internal}
>      (expr_list:REG_DEAD (reg:V2DI 90)
>         (nil)))
> 
> So I don't think it's an RTL optimisation problem as such.  Some targets
> (like AArch64) can do the equivalent of this without spills in either
> direction.  Similarly, MIPS32 can use mthc1/mtc1 and mfhc1/mfc1 for
> moving 64-bit FPRs to and from 32-bit GPRs.
> 
> So I think it really is up to the target to decide whether it can/wants
> to do this without spills or not.  It seems wrong to cost in a spill
> that should never happen. :-)

True.  But as you say below it's only half of the story since
the vec_concat will still happen (and that is also costly, even
more so if going through memory due to STLF issues).

> If there is a case that needs spills, secondary_memory_reload has to
> return true for that case.  So I think it's the right hook to test.
> 
> > And we'd even like to penalize the non-memory case because going from
> >
> >   pair:
> >    lea    (%rdi,%rdi,1),%eax
> >    sar    %edi
> >    movslq %edi,%rdi
> >    cltq   
> >    mov    %rax,-0x18(%rsp)
> >    movq   -0x18(%rsp),%xmm0
> >    mov    %rdi,-0x18(%rsp)
> >    movhps -0x18(%rsp),%xmm0
> >    movaps %xmm0,-0x18(%rsp)
> >    mov    -0x18(%rsp),%rax
> >    mov    -0x10(%rsp),%rdx
> >    retq
> >
> > to
> >
> >   pair:
> >    lea    (%rdi,%rdi,1),%eax
> >    sar    %edi
> >    movslq %edi,%rdi
> >    cltq   
> >    mov    %rax,-0x18(%rsp)
> >    movq   -0x18(%rsp),%xmm0
> >    mov    %rdi,-0x18(%rsp)
> >    movhps -0x18(%rsp),%xmm0
> >    movlps %xmm0, %rdx
> >    movhps %xmm0, %rax
> >    retq
> >
> > is still worse than not vectorizing that copy (just in
> > case targetm.secondary_memory_needed says false which
> > it might eventually do because we strictly do not need
> > secondary memory here).
> 
> But isn't the vector construction the expensive part of that?
> I assume we wouldn't want it even if %xmm0 was stored directly
> to memory rather than returned in GPRs.
> 
> > Quoting the x86 implementation of the hook it seems we're
> > "saved" by the mode size check only:
> >
> >   if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
> >     {
> >       /* SSE1 doesn't have any direct moves from other classes.  */
> >       if (!TARGET_SSE2)
> >         return true;
> >
> >       /* If the target says that inter-unit moves are more expensive
> >          than moving through memory, then don't generate them.  */
> >       if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
> >           || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
> >         return true;
> >
> >       /* Between SSE and general, we have moves no larger than word size.  */
> >       if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
> >         return true;
> 
> Yeah.
> 
> > Your case where a target returns in multiple vector registers is one
> > not well handled by the patch in general - there isn't any attempt
> > at looking at the shape of the RESULT_DECL, we just see if a
> > vector store goes somewhere into it.  And we do this when processing
> > each store.  As said this is mostly a hack that covers the cases
> > the issue was reported for and shouldn't penaltize most other
> > cases.  Yes, it might pessimize a target that returns large
> > aggregates in multiple vector registers, but is there any?
> 
> SVE does this in some cases, but I guess none of them are relevant here.
> 
> > So, may I go with the original patch?
> 
> Still feels like we're counting spills on targets that shouldn't need them.
> But going back to:
> 
> 	  /* Assume that a reg-reg move is possible and cheap,
> 	     do not account for vector to gp register move cost.  */
> 
> I guess we could gloss over the "unnecessary" spill cost by saying that,
> even if the spill isn't needed, this is a conservative estimate of the
> vector to gp register move cost?

Yes.  What I really tried to ask is - am I going to need the
vectorized result piecewise in the end (but not being in control
of the chopping into pieces)?  I wanted to pessimize that with
an estimate of the "chopping cost".  I probably shouldn't have
talked about spilling but that's the usual straightforward
solution of extracting something from a larger register that works
everywhere.

So I guess I'll update the comment and install as-is?
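
One possible rewording of that comment, purely as a sketch of the
direction agreed on here (the wording actually committed may differ):

	  /* Assume that a reg-reg move is possible and cheap.  Even when
	     no spill is strictly needed, the costs below serve as a
	     conservative estimate of the cost of chopping the vector
	     result into the pieces the return ABI wants.  */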

I still hope for a better solution either on the target or the
RTL optimization side (undoing the vectorization).

Richard.
Richard Sandiford April 3, 2019, 11:04 a.m. UTC | #5
Richard Biener <rguenther@suse.de> writes:
>> > So, may I go with the original patch?
>> 
>> Still feels like we're counting spills on targets that shouldn't need them.
>> But going back to:
>> 
>> 	  /* Assume that a reg-reg move is possible and cheap,
>> 	     do not account for vector to gp register move cost.  */
>> 
>> I guess we could gloss over the "unnecessary" spill cost by saying that,
>> even if the spill isn't needed, this is a conservative estimate of the
>> vector to gp register move cost?
>
> Yes.  What I really tried to ask is - am I going to need the
> vectorized result piecewise in the end (but not being in control
> of the chopping into pieces)?  I wanted to pessimize that with
> an estimate of the "chopping cost".  I probably shouldn't have
> talked about spilling but that's the usual straight-forward
> solution of extracting sth from a larger register that works
> everywhere.
>
> So I guess I'll update the comment and install as-is?

Sounds good to me.

Richard

>
> I still hope for a better solution either on the target or the
> RTL optimization side (undoing the vectorization).
>
> Richard.

Patch

Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	(revision 269987)
+++ gcc/tree-vect-stmts.c	(working copy)
@@ -43,6 +43,7 @@  along with GCC; see the file COPYING3.
 #include "tree-cfg.h"
 #include "tree-ssa-loop-manip.h"
 #include "cfgloop.h"
+#include "explow.h"
 #include "tree-ssa-loop.h"
 #include "tree-scalar-evolution.h"
 #include "tree-vectorizer.h"
@@ -52,6 +53,7 @@  along with GCC; see the file COPYING3.
 #include "vec-perm-indices.h"
 #include "tree-ssa-loop-niter.h"
 #include "gimple-fold.h"
+#include "regs.h"
 
 /* For lang_hooks.types.type_for_mode.  */
 #include "langhooks.h"
@@ -948,6 +950,37 @@  vect_model_promotion_demotion_cost (stmt
                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
 }
 
+/* Returns true if the current function returns DECL.  */
+
+static bool
+cfun_returns (tree decl)
+{
+  edge_iterator ei;
+  edge e;
+  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+    {
+      greturn *ret = safe_dyn_cast <greturn *> (last_stmt (e->src));
+      if (!ret)
+	continue;
+      if (gimple_return_retval (ret) == decl)
+	return true;
+      /* We often end up with an aggregate copy to the result decl,
+         handle that case as well.  First skip intermediate clobbers
+	 though.  */
+      gimple *def = ret;
+      do
+	{
+	  def = SSA_NAME_DEF_STMT (gimple_vuse (def));
+	}
+      while (gimple_clobber_p (def));
+      if (is_a <gassign *> (def)
+	  && gimple_assign_lhs (def) == gimple_return_retval (ret)
+	  && gimple_assign_rhs1 (def) == decl)
+	return true;
+    }
+  return false;
+}
+
 /* Function vect_model_store_cost
 
    Models cost for stores.  In the case of grouped accesses, one access
@@ -1032,6 +1065,37 @@  vect_model_store_cost (stmt_vec_info stm
 				       vec_to_scalar, stmt_info, 0, vect_body);
     }
 
+  /* When vectorizing a store into the function result assign
+     a penalty if the function returns in a multi-register location.
+     In this case we assume we'll end up with having to spill the
+     vector result and do piecewise loads.  */
+  tree base = get_base_address (STMT_VINFO_DATA_REF (stmt_info)->ref);
+  if (base
+      && (TREE_CODE (base) == RESULT_DECL
+	  || (DECL_P (base) && cfun_returns (base)))
+      && !aggregate_value_p (base, cfun->decl))
+    {
+      rtx reg = hard_function_value (TREE_TYPE (base), cfun->decl, 0, 1);
+      /* ???  Handle PARALLEL in some way.  */
+      if (REG_P (reg))
+	{
+	  int nregs = hard_regno_nregs (REGNO (reg), GET_MODE (reg));
+	  /* Assume that a reg-reg move is possible and cheap,
+	     do not account for vector to gp register move cost.  */
+	  if (nregs > 1)
+	    {
+	      /* Spill.  */
+	      prologue_cost += record_stmt_cost (cost_vec, ncopies,
+						 vector_store,
+						 stmt_info, 0, vect_epilogue);
+	      /* Loads.  */
+	      prologue_cost += record_stmt_cost (cost_vec, ncopies * nregs,
+						 scalar_load,
+						 stmt_info, 0, vect_epilogue);
+	    }
+	}
+    }
+
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
                      "vect_model_store_cost: inside_cost = %d, "
Index: gcc/testsuite/gcc.target/i386/pr84101.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr84101.c	(nonexistent)
+++ gcc/testsuite/gcc.target/i386/pr84101.c	(working copy)
@@ -0,0 +1,21 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-slp2-details" } */
+
+typedef struct uint64_pair uint64_pair_t ;
+struct uint64_pair
+{
+  unsigned long w0 ;
+  unsigned long w1 ;
+} ;
+
+uint64_pair_t pair(int num)
+{
+  uint64_pair_t p ;
+
+  p.w0 = num << 1 ;
+  p.w1 = num >> 1 ;
+
+  return p ;
+}
+
+/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp2" } } */