Support multi-versioning on self-recursive function (ipa/92133)

Message ID BYAPR01MB4869F706EB198F95EACD7692F76D0@BYAPR01MB4869.prod.exchangelabs.com
State New
Series Support multi-versioning on self-recursive function (ipa/92133)

Commit Message

Feng Xue OS Oct. 17, 2019, 8:23 a.m. UTC
IPA does not allow constant propagation on a parameter that is used to control
function recursion.

recur_fn (i)
{
  if ( !terminate_recursion (i))
    {
      ...  
      recur_fn (i + 1);
      ...
    }
  ...
}

This patch is composed to enable multi-versioning for self-recursive functions;
the number of versioned copies is limited by a specified option.
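Conceptually (illustrative sketch only; the clone names and the copy limit below are
made up, not the actual names ipa-cp emits), versioning the example above with a copy
limit of 2 could produce clones along these lines:

recur_fn.clone_1 ()          /* specialized for i == 1 */
{
  if ( !terminate_recursion (1))
    {
      ...
      recur_fn.clone_2 ();   /* recursive call redirected to the next clone */
      ...
    }
  ...
}

recur_fn.clone_2 ()          /* specialized for i == 2 */
{
  if ( !terminate_recursion (2))
    {
      ...
      recur_fn (3);          /* copy limit reached: fall back to the original recur_fn */
      ...
    }
  ...
}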

Feng
---

Comments

Xionghu Luo Oct. 18, 2019, 2:03 a.m. UTC | #1
Hi Feng,

On 2019/10/17 16:23, Feng Xue OS wrote:
> IPA does not allow constant propagation on parameter that is used to control
> function recursion.
> 
> recur_fn (i)
> {
>    if ( !terminate_recursion (i))
>      {
>        ...
>        recur_fn (i + 1);
>        ...
>      }
>    ...
> }
> 
> This patch is composed to enable multi-versioning for self-recursive function,
> and versioning copies is limited by a specified option.

I noticed a similar issue when analyzing SPEC: self-recursive functions are not
versioned.  I posted my observations in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92074.

Generally, this could be implemented well by your patch, but I am wondering
whether it would be OK to convert the recursive function into a non-recursive one
in an independent pass after ipa-cp and ipa-sra, instead of reusing the ipa-cp
framework?
The reason is that sometimes the argument is passed by reference, and ipa-sra runs
after ipa-cp, so this kind of optimization may not be doable in WPA.  What's your
idea about this, please?   Thanks.
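A minimal sketch of the pass-by-reference case I have in mind (hypothetical example,
written in the style of the recur_fn pseudocode above; the real SPEC case is more
involved):

recur_fn (int *pi)
{
  if ( !terminate_recursion (*pi))
    {
      ...
      int next = *pi + 1;
      recur_fn (&next);   /* the control value is only visible through the pointer */
      ...
    }
  ...
}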


Thanks
Xiong Hu

> 
> Feng
> ---
> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
> index 045072e02ec..6255a766e4d 100644
> --- a/gcc/ipa-cp.c
> +++ b/gcc/ipa-cp.c
> @@ -229,7 +229,9 @@ public:
>     inline bool set_contains_variable ();
>     bool add_value (valtype newval, cgraph_edge *cs,
>   		  ipcp_value<valtype> *src_val = NULL,
> -		  int src_idx = 0, HOST_WIDE_INT offset = -1);
> +		  int src_idx = 0, HOST_WIDE_INT offset = -1,
> +		  ipcp_value<valtype> **val_pos_p = NULL,
> +		  bool unlimited = false);
>     void print (FILE * f, bool dump_sources, bool dump_benefits);
>   };
>   
> @@ -1579,22 +1581,37 @@ allocate_and_init_ipcp_value (ipa_polymorphic_call_context source)
>   /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  CS,
>      SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
>      meaning.  OFFSET -1 means the source is scalar and not a part of an
> -   aggregate.  */
> +   aggregate.  If non-NULL, VAL_POS_P specifies position in value list,
> +   after which newly created ipcp_value will be inserted, and it is also
> +   used to record address of the added ipcp_value before function returns.
> +   UNLIMITED means whether value count should not exceed the limit given
> +   by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
>   
>   template <typename valtype>
>   bool
>   ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>   				  ipcp_value<valtype> *src_val,
> -				  int src_idx, HOST_WIDE_INT offset)
> +				  int src_idx, HOST_WIDE_INT offset,
> +				  ipcp_value<valtype> **val_pos_p,
> +				  bool unlimited)
>   {
>     ipcp_value<valtype> *val;
>   
> +  if (val_pos_p)
> +    {
> +      for (val = values; val && val != *val_pos_p; val = val->next);
> +      gcc_checking_assert (val);
> +    }
> +
>     if (bottom)
>       return false;
>   
>     for (val = values; val; val = val->next)
>       if (values_equal_for_ipcp_p (val->value, newval))
>         {
> +	if (val_pos_p)
> +	  *val_pos_p = val;
> +
>   	if (ipa_edge_within_scc (cs))
>   	  {
>   	    ipcp_value_source<valtype> *s;
> @@ -1609,7 +1626,8 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>   	return false;
>         }
>   
> -  if (values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
> +  if (!unlimited
> +      && values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
>       {
>         /* We can only free sources, not the values themselves, because sources
>   	 of other values in this SCC might point to them.   */
> @@ -1623,6 +1641,9 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>   	    }
>   	}
>   
> +      if (val_pos_p)
> +	*val_pos_p = NULL;
> +
>         values = NULL;
>         return set_to_bottom ();
>       }
> @@ -1630,8 +1651,54 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>     values_count++;
>     val = allocate_and_init_ipcp_value (newval);
>     val->add_source (cs, src_val, src_idx, offset);
> -  val->next = values;
> -  values = val;
> +  if (val_pos_p)
> +    {
> +      val->next = (*val_pos_p)->next;
> +      (*val_pos_p)->next = val;
> +      *val_pos_p = val;
> +    }
> +  else
> +    {
> +      val->next = values;
> +      values = val;
> +    }
> +
> +  return true;
> +}
> +
> +/* Return true, if a ipcp_value VAL is orginated from parameter value of
> +   self-feeding recursive function by applying non-passthrough arithmetic
> +   transformation.  */
> +
> +static bool
> +self_recursively_generated_p (ipcp_value<tree> *val)
> +{
> +  class ipa_node_params *info = NULL;
> +
> +  for (ipcp_value_source<tree> *src = val->sources; src; src = src->next)
> +    {
> +      cgraph_edge *cs = src->cs;
> +
> +      if (!src->val || cs->caller != cs->callee->function_symbol ()
> +	  || src->val == val)
> +	return false;
> +
> +      if (!info)
> +	info = IPA_NODE_REF (cs->caller);
> +
> +      class ipcp_param_lattices *plats = ipa_get_parm_lattices (info,
> +								src->index);
> +      ipcp_lattice<tree> *src_lat = src->offset == -1 ? &plats->itself
> +						      : plats->aggs;
> +      ipcp_value<tree> *src_val;
> +
> +      for (src_val = src_lat->values; src_val && src_val != val;
> +	   src_val = src_val->next);
> +
> +      if (!src_val)
> +	return false;
> +    }
> +
>     return true;
>   }
>   
> @@ -1649,20 +1716,72 @@ propagate_vals_across_pass_through (cgraph_edge *cs, ipa_jump_func *jfunc,
>     ipcp_value<tree> *src_val;
>     bool ret = false;
>   
> -  /* Do not create new values when propagating within an SCC because if there
> -     are arithmetic functions with circular dependencies, there is infinite
> -     number of them and we would just make lattices bottom.  If this condition
> -     is ever relaxed we have to detect self-feeding recursive calls in
> -     cgraph_edge_brings_value_p in a smarter way.  */
> +  /* Due to circular dependencies, propagating within an SCC through arithmetic
> +     transformation would create infinite number of values.  But for
> +     self-feeding recursive function, we could allow propagation in a limited
> +     count, and this can enable a simple kind of recursive function versioning.
> +     For other scenario, we would just make lattices bottom.  */
>     if ((ipa_get_jf_pass_through_operation (jfunc) != NOP_EXPR)
>         && ipa_edge_within_scc (cs))
> -    ret = dest_lat->set_contains_variable ();
> +    {
> +      if (src_lat != dest_lat)
> +	return dest_lat->set_contains_variable ();
> +
> +      auto_vec<ipcp_value<tree> *, 8> val_seeds;
> +
> +      for (src_val = src_lat->values; src_val; src_val = src_val->next)
> +	{
> +	  /* Now we do not use self-recursively generated value as propagation
> +	     source, this is absolutely conservative, but could avoid explosion
> +	     of lattice's value space, especially when one recursive function
> +	     calls another recursive.  */
> +	  if (self_recursively_generated_p (src_val))
> +	    {
> +	      /* If the lattice has already been propagated for the call site,
> +		 no need to do that again.  */
> +	      for (ipcp_value_source<tree> *s = src_val->sources; s;
> +		   s = s->next)
> +		if (s->cs == cs)
> +		  return dest_lat->set_contains_variable ();
> +	    }
> +	  else
> +	    val_seeds.safe_push (src_val);
> +	}
> +
> +      int clone_copies = PARAM_VALUE (PARAM_IPA_CP_MAX_RECURSION_COPIES);
> +      int i;
> +
> +      /* Recursively generate lattice values with a limited count.  */
> +      FOR_EACH_VEC_ELT (val_seeds, i, src_val)
> +	{
> +	  for (int j = 1; j < clone_copies; j++)
> +	    {
> +	      tree cstval = ipa_get_jf_pass_through_result (jfunc,
> +	      						    src_val->value,
> +							    parm_type);
> +	      if (!cstval)
> +		break;
> +
> +	      /* Try to place the new lattice value after its source, which
> +		 can decrease iterations of propagate stage.  */
> +	      ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
> +					  -1, &src_val, true);
> +	      gcc_checking_assert (src_val);
> +	    }
> +	}
> +      ret |= dest_lat->set_contains_variable ();
> +    }
>     else
>       for (src_val = src_lat->values; src_val; src_val = src_val->next)
>         {
> +	  /* Now we do not use self-recursively generated value as propagation
> +	     source, otherwise it is easy to make value space of normal lattice
> +	     overflow.  */
> +	if (self_recursively_generated_p (src_val))
> +	  continue;
> +
>   	tree cstval = ipa_get_jf_pass_through_result (jfunc, src_val->value,
>   						      parm_type);
> -
>   	if (cstval)
>   	  ret |= dest_lat->add_value (cstval, cs, src_val, src_idx);
>   	else
> @@ -3635,6 +3754,9 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
>   	      hot |= cs->maybe_hot_p ();
>   	      if (cs->caller != dest)
>   		non_self_recursive = true;
> +	      else if (src->val)
> +		gcc_assert (values_equal_for_ipcp_p (src->val->value,
> +						     val->value));
>   	    }
>   	  cs = get_next_cgraph_edge_clone (cs);
>   	}
> diff --git a/gcc/params.def b/gcc/params.def
> index 322c37f8b96..3e242e09f01 100644
> --- a/gcc/params.def
> +++ b/gcc/params.def
> @@ -1135,6 +1135,11 @@ DEFPARAM (PARAM_IPA_CP_RECURSION_PENALTY,
>   	  "are evaluated for cloning.",
>   	  40, 0, 100)
>   
> +DEFPARAM (PARAM_IPA_CP_MAX_RECURSION_COPIES,
> +	  "ipa-cp-max-recursion-copies",
> +	  "Maximum function versioning copies for a self-recursive function.",
> +	  8, 0, 0)
> +
>   DEFPARAM (PARAM_IPA_CP_SINGLE_CALL_PENALTY,
>   	  "ipa-cp-single-call-penalty",
>   	  "Percentage penalty functions containing a single call to another "
> diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
> new file mode 100644
> index 00000000000..7c1c01047c6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
> @@ -0,0 +1,47 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-ipa-cp-details -fno-early-inlining" } */
> +
> +int fn();
> +
> +int data[100];
> +
> +int recur_fn (int i)
> +{
> +  int j;
> +
> +  if (i == 6)
> +    {
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      return 10;
> +    }
> +
> +  data[i] = i;
> +
> +  for (j = 0; j < 100; j++)
> +    recur_fn (i + 1);
> +
> +  return i;
> +}
> +
> +int main ()
> +{
> +  int i;
> +
> +  for (i = 0; i < 100; i++)
> +    recur_fn (1) + recur_fn (-5);
> +
> +  return 1;
> +}
> +
> +/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 12 "cp" } } */
>
Feng Xue OS Oct. 18, 2019, 3:50 a.m. UTC | #2
> I noticed similar issue when analyzing the SPEC, self-recursive function is
> not versioned and posted my observations in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92074.

> Generally, this could be implemented well by your patch, while I am
> wondering whether it is OK to convert the recursive function to
> non-recursive function in a independent pass after ipa-cp and ipa-sra instead
> of reuse the ipa-cp framework?
> The reason is sometimes the argument is passed-by-reference, and
> ipa-sra runs after ipa-cp, so this kind of optimization may not be done in
> WPA.  What's your idea about this, please?   Thanks.

Function versioning is done in ipa-cp, and there is nothing special about recursive
functions in that respect, so adding a dedicated pass for recursion seems redundant.

We might not need to resort to ipa-sra to resolve the concern you mentioned.  Original
ipa-cp already supports a simple kind of propagation on a by-ref argument, which must
be defined by a constant.  And for an extended form such as *arg = *param OP constant,
I've created a tracker, PR91682, and also composed a patch:
https://gcc.gnu.org/ml/gcc-patches/2019-09/msg01189.html.
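To illustrate (sketch only; the names are made up), the two by-ref forms mentioned
above look roughly like this:

caller ()
{
  int i = 0;
  recur_fn (&i);          /* by-ref argument defined by a constant:
                             already handled by original ipa-cp */
}

recur_fn (int *param)
{
  ...
  int arg = *param + 1;   /* the extended form: *arg = *param OP constant */
  recur_fn (&arg);
  ...
}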

Feng
Xionghu Luo Oct. 24, 2019, 5:44 a.m. UTC | #3
Hi,


On 2019/10/17 16:23, Feng Xue OS wrote:
> IPA does not allow constant propagation on parameter that is used to control
> function recursion.
> 
> recur_fn (i)
> {
>    if ( !terminate_recursion (i))
>      {
>        ...
>        recur_fn (i + 1);
>        ...
>      }
>    ...
> }
> 
> This patch is composed to enable multi-versioning for self-recursive function,
> and versioning copies is limited by a specified option.
> 
> Feng
> ---
> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
> index 045072e02ec..6255a766e4d 100644
> --- a/gcc/ipa-cp.c
> +++ b/gcc/ipa-cp.c
> @@ -229,7 +229,9 @@ public:
>     inline bool set_contains_variable ();
>     bool add_value (valtype newval, cgraph_edge *cs,
>   		  ipcp_value<valtype> *src_val = NULL,
> -		  int src_idx = 0, HOST_WIDE_INT offset = -1);
> +		  int src_idx = 0, HOST_WIDE_INT offset = -1,
> +		  ipcp_value<valtype> **val_pos_p = NULL,
> +		  bool unlimited = false);
>     void print (FILE * f, bool dump_sources, bool dump_benefits);
>   };
>   
> @@ -1579,22 +1581,37 @@ allocate_and_init_ipcp_value (ipa_polymorphic_call_context source)
>   /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  CS,
>      SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
>      meaning.  OFFSET -1 means the source is scalar and not a part of an
> -   aggregate.  */
> +   aggregate.  If non-NULL, VAL_POS_P specifies position in value list,
> +   after which newly created ipcp_value will be inserted, and it is also
> +   used to record address of the added ipcp_value before function returns.
> +   UNLIMITED means whether value count should not exceed the limit given
> +   by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
>   
>   template <typename valtype>
>   bool
>   ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>   				  ipcp_value<valtype> *src_val,
> -				  int src_idx, HOST_WIDE_INT offset)
> +				  int src_idx, HOST_WIDE_INT offset,
> +				  ipcp_value<valtype> **val_pos_p,
> +				  bool unlimited)
>   {
>     ipcp_value<valtype> *val;
>   
> +  if (val_pos_p)
> +    {
> +      for (val = values; val && val != *val_pos_p; val = val->next);
> +      gcc_checking_assert (val);
> +    }
> +
>     if (bottom)
>       return false;
>   
>     for (val = values; val; val = val->next)
>       if (values_equal_for_ipcp_p (val->value, newval))
>         {
> +	if (val_pos_p)
> +	  *val_pos_p = val;
> +
>   	if (ipa_edge_within_scc (cs))
>   	  {
>   	    ipcp_value_source<valtype> *s;
> @@ -1609,7 +1626,8 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>   	return false;
>         }
>   
> -  if (values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
> +  if (!unlimited
> +      && values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
>       {
>         /* We can only free sources, not the values themselves, because sources
>   	 of other values in this SCC might point to them.   */
> @@ -1623,6 +1641,9 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>   	    }
>   	}
>   
> +      if (val_pos_p)
> +	*val_pos_p = NULL;
> +
>         values = NULL;
>         return set_to_bottom ();
>       }
> @@ -1630,8 +1651,54 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>     values_count++;
>     val = allocate_and_init_ipcp_value (newval);
>     val->add_source (cs, src_val, src_idx, offset);
> -  val->next = values;
> -  values = val;
> +  if (val_pos_p)
> +    {
> +      val->next = (*val_pos_p)->next;
> +      (*val_pos_p)->next = val;
> +      *val_pos_p = val;
> +    }
> +  else
> +    {
> +      val->next = values;
> +      values = val;
> +    }
> +
> +  return true;
> +}
> +
> +/* Return true, if a ipcp_value VAL is orginated from parameter value of
> +   self-feeding recursive function by applying non-passthrough arithmetic
> +   transformation.  */
> +
> +static bool
> +self_recursively_generated_p (ipcp_value<tree> *val)
> +{
> +  class ipa_node_params *info = NULL;
> +
> +  for (ipcp_value_source<tree> *src = val->sources; src; src = src->next)
> +    {
> +      cgraph_edge *cs = src->cs;
> +
> +      if (!src->val || cs->caller != cs->callee->function_symbol ()
> +	  || src->val == val)
> +	return false;
> +
> +      if (!info)
> +	info = IPA_NODE_REF (cs->caller);
> +
> +      class ipcp_param_lattices *plats = ipa_get_parm_lattices (info,
> +								src->index);
> +      ipcp_lattice<tree> *src_lat = src->offset == -1 ? &plats->itself
> +						      : plats->aggs;

Thanks for the patch.
This function doesn't handle the by-ref case after rebasing this patch on top of your
previous ipa-cp by-ref arithmetic patch, as below (also, some conflicts need fixing):

foo (int * p) { ...  return foo(*(&p) + 1); }

It will cause value explosion.

Secondly, the self-recursive handling doesn't check or stop when the value goes beyond
the recursion boundary, which it seems would generate redundant nodes?


IMHO, it may be helpful to add more comments before the return, and to dump a log
under dump-ipa-cp-all-details when adding new values and new sources, like below,
so that debugging is easier and others can understand the code more easily. :)

newval, src_val, src_val->sources,  *src_val->sources,  caller,  callee
1, (ipcp_value<tree_node*> *) 0x12bc1828, (ipcp_value_source<tree_node*> *) 0x12bae800,  {offset = -1, cs = 0x3fffb5602768, val = 0x0, next = 0x0, index = 0}, "main", recur_fn"
2, (ipcp_value<tree_node*> *) 0x12bc1880, (ipcp_value_source<tree_node*> *) 0x12bae830,  {offset = -1, cs = 0x3fffb5602630, val = 0x12bc1828, next = 0x0, index = 0},"recur_fn","recur_fn"
...
1, (ipcp_value<tree_node*> *) 0x12bc1828, (ipcp_value_source<tree_node*> *) 0x12baea70, {offset = -1, cs = 0x3fffb5602630, val = 0x12bc1c48, next = 0x12bae800, index = 0}  "recur_fn","recur_fn"
2, (ipcp_value<tree_node*> *) 0x12bc1880, (ipcp_value_source<tree_node*> *) 0x12bae830, {offset = -1, cs = 0x3fffb5602630, val = 0x12bc1828, next = 0x0, index = 0}  "recur_fn","recur_fn"


Xiong Hu
Thanks

> +      ipcp_value<tree> *src_val;
> +
> +      for (src_val = src_lat->values; src_val && src_val != val;
> +	   src_val = src_val->next);
> +
> +      if (!src_val)
> +	return false;
> +    }
> +
>     return true;
>   }
>   
> @@ -1649,20 +1716,72 @@ propagate_vals_across_pass_through (cgraph_edge *cs, ipa_jump_func *jfunc,
>     ipcp_value<tree> *src_val;
>     bool ret = false;
>   
> -  /* Do not create new values when propagating within an SCC because if there
> -     are arithmetic functions with circular dependencies, there is infinite
> -     number of them and we would just make lattices bottom.  If this condition
> -     is ever relaxed we have to detect self-feeding recursive calls in
> -     cgraph_edge_brings_value_p in a smarter way.  */
> +  /* Due to circular dependencies, propagating within an SCC through arithmetic
> +     transformation would create infinite number of values.  But for
> +     self-feeding recursive function, we could allow propagation in a limited
> +     count, and this can enable a simple kind of recursive function versioning.
> +     For other scenario, we would just make lattices bottom.  */
>     if ((ipa_get_jf_pass_through_operation (jfunc) != NOP_EXPR)
>         && ipa_edge_within_scc (cs))
> -    ret = dest_lat->set_contains_variable ();
> +    {
> +      if (src_lat != dest_lat)
> +	return dest_lat->set_contains_variable ();
> +
> +      auto_vec<ipcp_value<tree> *, 8> val_seeds;
> +
> +      for (src_val = src_lat->values; src_val; src_val = src_val->next)
> +	{
> +	  /* Now we do not use self-recursively generated value as propagation
> +	     source, this is absolutely conservative, but could avoid explosion
> +	     of lattice's value space, especially when one recursive function
> +	     calls another recursive.  */
> +	  if (self_recursively_generated_p (src_val))
> +	    {
> +	      /* If the lattice has already been propagated for the call site,
> +		 no need to do that again.  */
> +	      for (ipcp_value_source<tree> *s = src_val->sources; s;
> +		   s = s->next)
> +		if (s->cs == cs)
> +		  return dest_lat->set_contains_variable ();
> +	    }
> +	  else
> +	    val_seeds.safe_push (src_val);
> +	}
> +
> +      int clone_copies = PARAM_VALUE (PARAM_IPA_CP_MAX_RECURSION_COPIES);
> +      int i;
> +
> +      /* Recursively generate lattice values with a limited count.  */
> +      FOR_EACH_VEC_ELT (val_seeds, i, src_val)
> +	{
> +	  for (int j = 1; j < clone_copies; j++)
> +	    {
> +	      tree cstval = ipa_get_jf_pass_through_result (jfunc,
> +	      						    src_val->value,
> +							    parm_type);
> +	      if (!cstval)
> +		break;
> +
> +	      /* Try to place the new lattice value after its source, which
> +		 can decrease iterations of propagate stage.  */
> +	      ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
> +					  -1, &src_val, true);
> +	      gcc_checking_assert (src_val);
> +	    }
> +	}
> +      ret |= dest_lat->set_contains_variable ();
> +    }
>     else
>       for (src_val = src_lat->values; src_val; src_val = src_val->next)
>         {
> +	  /* Now we do not use self-recursively generated value as propagation
> +	     source, otherwise it is easy to make value space of normal lattice
> +	     overflow.  */
> +	if (self_recursively_generated_p (src_val))
> +	  continue;
> +
>   	tree cstval = ipa_get_jf_pass_through_result (jfunc, src_val->value,
>   						      parm_type);
> -
>   	if (cstval)
>   	  ret |= dest_lat->add_value (cstval, cs, src_val, src_idx);
>   	else
> @@ -3635,6 +3754,9 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
>   	      hot |= cs->maybe_hot_p ();
>   	      if (cs->caller != dest)
>   		non_self_recursive = true;
> +	      else if (src->val)
> +		gcc_assert (values_equal_for_ipcp_p (src->val->value,
> +						     val->value));
>   	    }
>   	  cs = get_next_cgraph_edge_clone (cs);
>   	}
> diff --git a/gcc/params.def b/gcc/params.def
> index 322c37f8b96..3e242e09f01 100644
> --- a/gcc/params.def
> +++ b/gcc/params.def
> @@ -1135,6 +1135,11 @@ DEFPARAM (PARAM_IPA_CP_RECURSION_PENALTY,
>   	  "are evaluated for cloning.",
>   	  40, 0, 100)
>   
> +DEFPARAM (PARAM_IPA_CP_MAX_RECURSION_COPIES,
> +	  "ipa-cp-max-recursion-copies",
> +	  "Maximum function versioning copies for a self-recursive function.",
> +	  8, 0, 0)
> +
>   DEFPARAM (PARAM_IPA_CP_SINGLE_CALL_PENALTY,
>   	  "ipa-cp-single-call-penalty",
>   	  "Percentage penalty functions containing a single call to another "
> diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
> new file mode 100644
> index 00000000000..7c1c01047c6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
> @@ -0,0 +1,47 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-ipa-cp-details -fno-early-inlining" } */
> +
> +int fn();
> +
> +int data[100];
> +
> +int recur_fn (int i)
> +{
> +  int j;
> +
> +  if (i == 6)
> +    {
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      return 10;
> +    }
> +
> +  data[i] = i;
> +
> +  for (j = 0; j < 100; j++)
> +    recur_fn (i + 1);
> +
> +  return i;
> +}
> +
> +int main ()
> +{
> +  int i;
> +
> +  for (i = 0; i < 100; i++)
> +    recur_fn (1) + recur_fn (-5);
> +
> +  return 1;
> +}
> +
> +/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 12 "cp" } } */
>
Feng Xue OS Oct. 24, 2019, 6:25 a.m. UTC | #4
Hi,

  Actually, this patch is not the final one.  Since the previous cp-by-ref patch is still
on its way to the trunk, I tailored this one to handle only by-value recursion, so that it
is decoupled from the previous patch and can be reviewed independently.  I attached the
final patch; you can give it a try.

 The CP propagation stage might generate values outside of the recursion range, but the
cloning stage will not make a duplicate for an invalid recursion value, so we do not need
extra code for recursion range analysis.
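As a concrete illustration (using the ipa-clone-2.c test from this thread, and assuming the
default limit of 8 copies): propagation from the seeds i = 1 and i = -5 can generate lattice
values covering roughly -5 ... 8, even though the recursion terminates once i reaches 6; the
expectation stated above is that only the feasible values -5 ... 6 are materialized as clones,
which is consistent with the 12 specialized nodes the test scans for.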

Feng
Jan Hubicka Nov. 14, 2019, 1:23 p.m. UTC | #5
Hi,
I think the patch generally looks reasonable

+2019-11-13  Feng Xue <fxue@os.amperecomputing.com>
+
+	PR ipa/92133
+	* doc/invoke.texi (ipa-cp-max-recursion-depth): Document new option.
+	* params.opt (ipa-cp-max-recursion-depth): New.
+	* ipa-cp.c (ipcp_lattice<valtype>::add_value): Add two new parameters
+	val_pos_p and unlimited.
+	(self_recursively_generated_p): New function.
+	(get_val_across_arith_op): Likewise.
+	(propagate_vals_across_arith_jfunc): Add constant propagation for
+	self-recursive function.
+	(incorporate_penalties): Do not penalize pure self-recursive function.
+	(good_cloning_opportunity_p): Dump node_is_self_scc flag.
+	(propagate_constants_topo): Set node_is_self_scc flag for cgraph node.
+	(get_info_about_necessary_edges): Relax hotness check for edge to
+	self-recursive function.
+	* ipa-prop.h (ipa_node_params): Add new field node_is_self_scc.
+

In general the patch looks good to me, but I would like Martin Jambor to
comment on the ipa-prop/cp interfaces. However...
 
+@item ipa-cp-max-recursion-depth
+Maximum depth of recursive cloning for self-recursive function.
+

... I believe we will need a more careful cost model for this.  I think
we want to limit the overall growth for all the clones and also probably
enable this only when ipa-predicates thinks the individual clones will
actually be faster by some non-trivial percentage.  For the recursive inliner
we have:

--param max-inline-recursive-depth which has similar meaning to your parameter
  (so perhaps similar name would be good)
--param min-inline-recursive-probability
  which requires the inlining to happen only across edges which are
  known to be taken with reasonable chance
--param max-inline-insns-recursive
  which specifies overall size after all the recursive inlining

Those parameters are not particularly well thought out or tested, but
they may be a good start.

Do you have some data on code size/performance effects of this change?

Honza
Feng Xue OS Nov. 14, 2019, 3:14 p.m. UTC | #6
Thanks for your review.

> In general the patch looks good to me, but I would like Martin Jambor to
> comment on the ipa-prop/cp interfaces. However...

> +@item ipa-cp-max-recursion-depth
> +Maximum depth of recursive cloning for self-recursive function.
> +

> ... I believe we will need more careful cost model for this.  I think
> we want to limit the overall growth for all the clones and also probably
> enable this only when ipa-predicates things the individual clones will
> actualy be faster by some non-trivial percentage. For recursive inliner
> we have:

The cost model used by self-recursive cloning is mainly based on the existing machinery
of ipa-cp cloning; size growth and time benefit are considered.  But since recursive
cloning is a more aggressive form of cloning, we actually have another problem, which is
the opposite of your concern.  By default, the current parameter set used to control
ipa-cp and the recursive inliner gives priority to code size, not purely to performance.
This makes ipa-cp behave somewhat conservatively, and as a result it cannot trigger the
expected recursive cloning for the case in SPEC2017 exchange2 with the default settings;
it is blocked by both ipa-cp-eval-threshold and ipcp-unit-growth.  The former is due to
improper static profile estimation, and the latter conflicts with the aggressiveness of
recursive cloning.  Thus, we have to explicitly lower the threshold and raise the
unit-growth percentage.

We might not reach the destination in one leap.  This patch is just the first step towards
enabling recursive function versioning, and next we still need further, more elaborate
tuning on top of it.

> --param max-inline-recursive-depth which has similar meaning to your parameter
>  (so perhaps similar name would be good)
> --param min-inline-recursive-probability
>  which requires the inlining to happen only across edges which are
>  known to be taken with reasonable chance
> --param max-inline-insns-recursive
>  which specifies overall size after all the recursive inlining

> Those parameters are not parituclarly well tought out or tested, but
> they may be good start.

> Do you have some data on code size/performance effects of this change?
For SPEC2017, there is no obvious code size or performance change with the default settings.
Specifically, for exchange2, with ipa-cp-eval-threshold=1 and ipcp-unit-growth=80,
performance is +31% and size +7% on aarch64.
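For reference, that setting amounts to adding something like the following on top of the
usual optimization flags (parameter spellings as used in this thread; they may differ in
later GCC releases):

  --param ipa-cp-eval-threshold=1 --param ipcp-unit-growth=80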

Feng
Jan Hubicka Nov. 14, 2019, 3:22 p.m. UTC | #7
> Thanks for your review.
> 
> > In general the patch looks good to me, but I would like Martin Jambor to
> > comment on the ipa-prop/cp interfaces. However...
> 
> > +@item ipa-cp-max-recursion-depth
> > +Maximum depth of recursive cloning for self-recursive function.
> > +
> 
> > ... I believe we will need more careful cost model for this.  I think
> > we want to limit the overall growth for all the clones and also probably
> > enable this only when ipa-predicates things the individual clones will
> > actualy be faster by some non-trivial percentage. For recursive inliner
> > we have:
> 
> Cost model used by self-recursive cloning is mainly based on existing stuffs
> in ipa-cp cloning, size growth and time benefit are considered. But since
> recursive cloning is a more aggressive cloning, we will actually have another
> problem, which is opposite to your concern.  By default, current parameter
> set used to control ipa-cp and recursive-inliner gives priority to code size,
> not completely for performance. This makes ipa-cp behave somewhat

Yes, the cost model has been quite off for a while.  On Firefox it does just
a few clonings where code size increases, so it desperately needs retuning.

But since recursive cloning is quite a different case from the normal one,
perhaps having an independent set of limits would help, in particular ...
> conservatively, and as a result, it can not trigger expected recursive cloning
> for the case in SPEC2017.exchange2 with default setting, blocked by both
> ipa-cp-eval-threshold and ipcp-unit-growth. The former is due to improper
> static profile estimation, and the latter is conflicted to aggressiveness of
> recursive cloning. Thus, we have to explicitly lower the threshold and raise
> percentage of unit-growth.
> 
> We might not reach the destination in one leap. This patch is just first step
> to enable recursive function versioning. And next, we still need further
> elaborate tuning on this.
> 
> > --param max-inline-recursive-depth which has similar meaning to your parameter
> >  (so perhaps similar name would be good)
> > --param min-inline-recursive-probability
> >  which requires the inlining to happen only across edges which are
> >  known to be taken with reasonable chance
> > --param max-inline-insns-recursive
> >  which specifies overall size after all the recursive inlining
> 
> > Those parameters are not parituclarly well tought out or tested, but
> > they may be good start.
> 
> > Do you have some data on code size/performance effects of this change?
> For spec2017, no obvious code size and performance change with default setting.
> Specifically, for exchange2, with ipa-cp-eval-threshold=1 and ipcp-unit-growth=80,
> performance +31%, size +7%, on aarch64.

... it will help here, since the ipa-cp-eval-threshold value needed is quite far from
anything we could enable by default.

I wonder about the 80% unit growth, which is also more than we can enable by
default.  How come the overall size change is only 7%?

Honza
> 
> Feng
Feng Xue OS Nov. 14, 2019, 4:01 p.m. UTC | #8
>> Cost model used by self-recursive cloning is mainly based on existing stuffs
>> in ipa-cp cloning, size growth and time benefit are considered. But since
>> recursive cloning is a more aggressive cloning, we will actually have another
>> problem, which is opposite to your concern.  By default, current parameter
>> set used to control ipa-cp and recursive-inliner gives priority to code size,
>> not completely for performance. This makes ipa-cp behave somewhat

> Yes, for a while the cost model is quite off.  On Firefox it does just
> few clonings where code size increases so it desprately needs retuning.

> But since rescursive cloning is quite a different case from normal one,
> perhaps having independent set of limits would help in particular ...
I did consider this approach, but the limits seem to be contradictory between normal
and recursive cloning.

> > Do you have some data on code size/performance effects of this change?
> For spec2017, no obvious code size and performance change with default setting.
> Specifically, for exchange2, with ipa-cp-eval-threshold=1 and ipcp-unit-growth=80,
> performance +31%, size +7%, on aarch64.

> ... it will help here since ipa-cp-eval-threshold value needed are quite off of what we need to do.

> I wonder about the 80% of unit growth which is also more than we can
> enable by default.  How it comes the overal size change is only 7%?
343624 -> 365632 (this contains debug info, -g)    recursion-depth=8
273488 -> 273760 (no debug info)   recursion-depth=8
Jan Hubicka Nov. 14, 2019, 8:33 p.m. UTC | #9
> >> Cost model used by self-recursive cloning is mainly based on existing stuffs
> >> in ipa-cp cloning, size growth and time benefit are considered. But since
> >> recursive cloning is a more aggressive cloning, we will actually have another
> >> problem, which is opposite to your concern.  By default, current parameter
> >> set used to control ipa-cp and recursive-inliner gives priority to code size,
> >> not completely for performance. This makes ipa-cp behave somewhat
> 
> > Yes, for a while the cost model is quite off.  On Firefox it does just
> > few clonings where code size increases so it desprately needs retuning.
> 
> > But since rescursive cloning is quite a different case from normal one,
> > perhaps having independent set of limits would help in particular ...
> I did consider this way, but this seems to be contradictory for normal
> and recursive cloning.

We could definitely discuss the cost model incrementally.  It is safe to do
what you do currently (rely on ipa-cp's existing over-conservative
heuristics).

> 
> > > Do you have some data on code size/performance effects of this change?
> > For spec2017, no obvious code size and performance change with default setting.
> > Specifically, for exchange2, with ipa-cp-eval-threshold=1 and ipcp-unit-growth=80,
> > performance +31%, size +7%, on aarch64.
> 
> > ... it will help here since ipa-cp-eval-threshold value needed are quite off of what we need to do.
> 
> > I wonder about the 80% of unit growth which is also more than we can
> > enable by default.  How it comes the overal size change is only 7%?
> 343624 -> 365632 (this contains debug info, -g)    recursion-depth=8
> 273488 -> 273760 (no debug info)   recursion-depth=8

What seems a bit odd is that ipcp's metrics end up with 80% code growth.
I will try to look into it and see if I can think of something better to do
about the costs.

Honza
Feng Xue OS Nov. 15, 2019, 3:32 p.m. UTC | #10
Honza,

   I made some changes: do not penalize self-recursive functions, and add
--param ipa-cp-min-recursive-probability, similar to recursive inlining.  Please review
this new version.

Thanks,
Feng
Feng Xue OS Nov. 22, 2019, 3:24 a.m. UTC | #11
Honza, Martin,

  I hope to get more of your comments on this patch; I am not sure of your opinion on it.
I think this can be a starting point for recursive versioning, and later we definitely need
further cost-model tuning, not only for this but for the whole of ipa-cp, to enable more
aggressive cloning.

Thanks,
Feng
Martin Jambor Nov. 22, 2019, 11:18 a.m. UTC | #12
Hi,

On Fri, Nov 15 2019, Feng Xue OS wrote:
> Honza,
>
> I made some changes: do not penalize self-recursive function, and
> add --param ipa-cp-min-recursive-probability, similar to recursive
> inline. Please review this new one.

The patch and its effect on exchange are intriguing.  I only have a few
comments, see below; otherwise I am happy with it.

> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index fe8daf40888..c84f4d73ed6 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,23 @@
> +2019-11-15  Feng Xue <fxue@os.amperecomputing.com>
> +
> +	PR ipa/92133
> +	* doc/invoke.texi (ipa-cp-max-recursive-depth): Document new option.
> +	(ipa-cp-min-recursive-probability): Likewise.
> +	* params.opt (ipa-cp-max-recursive-depth): New.
> +	(ipa-cp-min-recursive-probability): Likewise.
> +	* ipa-cp.c (ipcp_lattice<valtype>::add_value): Add two new parameters
> +	val_pos_p and unlimited.
> +	(self_recursively_generated_p): New function.
> +	(get_val_across_arith_op): Likewise.
> +	(propagate_vals_across_arith_jfunc): Add constant propagation for
> +	self-recursive function.
> +	(incorporate_penalties): Do not penalize pure self-recursive function.
> +	(good_cloning_opportunity_p): Dump node_is_self_scc flag.
> +	(propagate_constants_topo): Set node_is_self_scc flag for cgraph node.
> +	(get_info_about_necessary_edges): Relax hotness check for edge to
> +	self-recursive function.
> +	* ipa-prop.h (ipa_node_params): Add new field node_is_self_scc.
> +

The least important comment: Thanks for providing the ChangeLog but
sending ChangeLog in the patch itself makes it difficult for people to
apply the patch because the ChangeLog hunks will never apply cleanly.
That's why people send them in the email body when they email
gcc-patches.  Hopefully we'll be putting ChangeLogs only into the commit
message soon and all of this will be a non-issue.

>  2019-11-15  Feng Xue  <fxue@os.amperecomputing.com>
>  
>  	PR ipa/92528
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index fe79ca2247a..c30adfd31c2 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -12045,6 +12045,13 @@ IPA-CP calculates its own score of cloning profitability heuristics
>  and performs those cloning opportunities with scores that exceed
>  @option{ipa-cp-eval-threshold}.
>  
> +@item ipa-cp-max-recursive-depth
> +Maximum depth of recursive cloning for self-recursive function.
> +
> +@item ipa-cp-min-recursive-probability
> +Recursive cloning only when the probability of call being executed exceeds
> +the parameter.
> +
>  @item ipa-cp-recursion-penalty
>  Percentage penalty the recursive functions will receive when they
>  are evaluated for cloning.
> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
> index 8372dfaa771..bbf508bca6c 100644
> --- a/gcc/ipa-cp.c
> +++ b/gcc/ipa-cp.c
> @@ -228,7 +228,9 @@ public:
>    inline bool set_contains_variable ();
>    bool add_value (valtype newval, cgraph_edge *cs,
>  		  ipcp_value<valtype> *src_val = NULL,
> -		  int src_idx = 0, HOST_WIDE_INT offset = -1);
> +		  int src_idx = 0, HOST_WIDE_INT offset = -1,
> +		  ipcp_value<valtype> **val_pos_p = NULL,
> +		  bool unlimited = false);
>    void print (FILE * f, bool dump_sources, bool dump_benefits);
>  };
>  
> @@ -1817,22 +1819,37 @@ allocate_and_init_ipcp_value (ipa_polymorphic_call_context source)
>  /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  CS,
>     SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
>     meaning.  OFFSET -1 means the source is scalar and not a part of an
> -   aggregate.  */
> +   aggregate.  If non-NULL, VAL_POS_P specifies position in value list,
> +   after which newly created ipcp_value will be inserted, and it is also
> +   used to record address of the added ipcp_value before function returns.
> +   UNLIMITED means whether value count should not exceed the limit given
> +   by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
>  
>  template <typename valtype>
>  bool
>  ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>  				  ipcp_value<valtype> *src_val,
> -				  int src_idx, HOST_WIDE_INT offset)
> +				  int src_idx, HOST_WIDE_INT offset,
> +				  ipcp_value<valtype> **val_pos_p,
> +				  bool unlimited)
>  {
>    ipcp_value<valtype> *val;
>  
> +  if (val_pos_p)
> +    {
> +      for (val = values; val && val != *val_pos_p; val = val->next);

Please put empty statements on a separate line (I think there is one
more in the patch IIRC).

> +      gcc_checking_assert (val);
> +    }
> +
>    if (bottom)
>      return false;
>  
>    for (val = values; val; val = val->next)
>      if (values_equal_for_ipcp_p (val->value, newval))
>        {
> +	if (val_pos_p)
> +	  *val_pos_p = val;
> +
>  	if (ipa_edge_within_scc (cs))
>  	  {
>  	    ipcp_value_source<valtype> *s;
> @@ -1847,7 +1864,7 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>  	return false;
>        }
>  
> -  if (values_count == param_ipa_cp_value_list_size)
> +  if (!unlimited && values_count == param_ipa_cp_value_list_size)
>      {
>        /* We can only free sources, not the values themselves, because sources
>  	 of other values in this SCC might point to them.   */
> @@ -1861,6 +1878,9 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>  	    }
>  	}
>  
> +      if (val_pos_p)
> +	*val_pos_p = NULL;
> +
>        values = NULL;
>        return set_to_bottom ();
>      }
> @@ -1868,11 +1888,74 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>    values_count++;
>    val = allocate_and_init_ipcp_value (newval);
>    val->add_source (cs, src_val, src_idx, offset);
> -  val->next = values;
> -  values = val;
> +  if (val_pos_p)
> +    {
> +      val->next = (*val_pos_p)->next;
> +      (*val_pos_p)->next = val;
> +      *val_pos_p = val;
> +    }

I must say I am not a big fan of the val_pos_p parameter.  Would simply
putting new values always at the end of the list work to reduce the
iterations?  At this point the function has traversed all the values
already when searching if it is present, so it can remember the last
one and just add stuff after it.

But if I'm missing something clever I'm not going to oppose it.


> +  else
> +    {
> +      val->next = values;
> +      values = val;
> +    }
> +
> +  return true;
> +}
> +
> +/* Return true, if a ipcp_value VAL is orginated from parameter value of
> +   self-feeding recursive function by applying non-passthrough arithmetic
> +   transformation.  */
> +
> +static bool
> +self_recursively_generated_p (ipcp_value<tree> *val)
> +{
> +  class ipa_node_params *info = NULL;
> +
> +  for (ipcp_value_source<tree> *src = val->sources; src; src = src->next)
> +    {
> +      cgraph_edge *cs = src->cs;
> +
> +      if (!src->val || cs->caller != cs->callee->function_symbol ()
> +	  || src->val == val)
> +	return false;

I'd suggest putting the condition calling function_symbol last.

> +
> +      if (!info)
> +	info = IPA_NODE_REF (cs->caller);
> +
> +      class ipcp_param_lattices *plats = ipa_get_parm_lattices (info,
> +								src->index);
> +      ipcp_lattice<tree> *src_lat = src->offset == -1 ? &plats->itself
> +						      : plats->aggs;
> +      ipcp_value<tree> *src_val;
> +
> +      for (src_val = src_lat->values; src_val && src_val != val;
> +	   src_val = src_val->next);

I think that GNU code style dictates that when a for does not fit on a
single line, the three bits have to be on a separate line each.  Plus
please put the empty statement on its own line too.

> +
> +      if (!src_val)
> +	return false;
> +    }
> +
>    return true;
>  }
>  
> +static tree
> +get_val_across_arith_op (enum tree_code opcode,
> +			 tree opnd1_type,
> +			 tree opnd2,
> +			 ipcp_value<tree> *src_val,
> +			 tree res_type)

The function should get at least a brief comment.

> +{
> +  tree opnd1 = src_val->value;
> +
> +  /* Skip source values that is incompatible with specified type.  */
> +  if (opnd1_type
> +      && !useless_type_conversion_p (opnd1_type, TREE_TYPE (opnd1)))
> +    return NULL_TREE;
> +
> +  return ipa_get_jf_arith_result (opcode, opnd1, opnd2, res_type);
> +}
> +
>  /* Propagate values through an arithmetic transformation described by a jump
>     function associated with edge CS, taking values from SRC_LAT and putting
>     them into DEST_LAT.  OPND1_TYPE is expected type for the values in SRC_LAT.
> @@ -1896,24 +1979,74 @@ propagate_vals_across_arith_jfunc (cgraph_edge *cs,
>    ipcp_value<tree> *src_val;
>    bool ret = false;
>  
> -  /* Do not create new values when propagating within an SCC because if there
> -     are arithmetic functions with circular dependencies, there is infinite
> -     number of them and we would just make lattices bottom.  If this condition
> -     is ever relaxed we have to detect self-feeding recursive calls in
> -     cgraph_edge_brings_value_p in a smarter way.  */
> +  /* Due to circular dependencies, propagating within an SCC through arithmetic
> +     transformation would create infinite number of values.  But for
> +     self-feeding recursive function, we could allow propagation in a limited
> +     count, and this can enable a simple kind of recursive function versioning.
> +     For other scenario, we would just make lattices bottom.  */
>    if (opcode != NOP_EXPR && ipa_edge_within_scc (cs))
> -    ret = dest_lat->set_contains_variable ();
> +    {
> +      int i;
> +
> +      if (src_lat != dest_lat || param_ipa_cp_max_recursive_depth < 1)
> +	return dest_lat->set_contains_variable ();
> +
> +      /* No benefit if recursive execution is in low probability.  */
> +      if (cs->sreal_frequency () * 100
> +	  <= ((sreal) 1) * param_ipa_cp_min_recursive_probability)
> +	return dest_lat->set_contains_variable ();
> +
> +      auto_vec<ipcp_value<tree> *, 8> val_seeds;
> +
> +      for (src_val = src_lat->values; src_val; src_val = src_val->next)
> +	{
> +	  /* Now we do not use self-recursively generated value as propagation
> +	     source, this is absolutely conservative, but could avoid explosion
> +	     of lattice's value space, especially when one recursive function
> +	     calls another recursive.  */
> +	  if (self_recursively_generated_p (src_val))
> +	    {
> +	      /* If the lattice has already been propagated for the call site,
> +		 no need to do that again.  */
> +	      for (ipcp_value_source<tree> *s = src_val->sources; s;
> +		   s = s->next)

I'm afraid you'll have to reformat this for loop too.

> +		if (s->cs == cs)
> +		  return dest_lat->set_contains_variable ();
> +	    }
> +	  else
> +	    val_seeds.safe_push (src_val);
> +	}
> +
> +      /* Recursively generate lattice values with a limited count.  */
> +      FOR_EACH_VEC_ELT (val_seeds, i, src_val)
> +	{
> +	  for (int j = 1; j < param_ipa_cp_max_recursive_depth; j++)
> +	    {
> +	      tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
> +						     src_val, res_type);
> +	      if (!cstval)
> +		break;
> +
> +	      /* Try to place the new lattice value after its source, which
> +		 can decrease iterations of propagate stage.  */
> +	      ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
> +					  src_offset, &src_val, true);
> +	      gcc_checking_assert (src_val);
> +	    }
> +	}
> +      ret |= dest_lat->set_contains_variable ();
> +    }
>    else
>      for (src_val = src_lat->values; src_val; src_val = src_val->next)
>        {
> -	tree opnd1 = src_val->value;
> -	tree cstval = NULL_TREE;
> -
> -	/* Skip source values that is incompatible with specified type.  */
> -	if (!opnd1_type
> -	    || useless_type_conversion_p (opnd1_type, TREE_TYPE (opnd1)))
> -	  cstval = ipa_get_jf_arith_result (opcode, opnd1, opnd2, res_type);
> +	/* Now we do not use self-recursively generated value as propagation
> +	   source, otherwise it is easy to make value space of normal lattice
> +	   overflow.  */
> +	if (self_recursively_generated_p (src_val))
> +	  continue;

I think you are missing a necessary call to
dest_lat->set_contains_variable() here.  Either call it or initialize
cstval to zero and call get_val_across_arith_op only when
self_recursively_generated_p returns false;

>  
> +	tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
> +					       src_val, res_type);
>  	if (cstval)
>  	  ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
>  				      src_offset);
> @@ -3032,7 +3165,7 @@ hint_time_bonus (ipa_hints hints)
>  static inline int64_t
>  incorporate_penalties (ipa_node_params *info, int64_t evaluation)
>  {
> -  if (info->node_within_scc)
> +  if (info->node_within_scc && !info->node_is_self_scc)
>      evaluation = (evaluation
>  		  * (100 - param_ipa_cp_recursion_penalty)) / 100;
>  

The rest looks good to me, thanks for the patch, I hope to see it in the
trunk soon.

I'll be looking into IPA-CP tuning shortly too.  If you already have any
ideas, I'd be interested in hearing them.

Thanks,

Martin
Feng Xue OS Nov. 25, 2019, 2:17 p.m. UTC | #13
Martin,

    Thanks for your review. I updated the patch with your comments.

Feng
---
2019-11-15  Feng Xue <fxue@os.amperecomputing.com>

	PR ipa/92133
	* doc/invoke.texi (ipa-cp-max-recursive-depth): Document new option.
	(ipa-cp-min-recursive-probability): Likewise.
	* params.opt (ipa-cp-max-recursive-depth): New.
	(ipa-cp-min-recursive-probability): Likewise.
	* ipa-cp.c (ipcp_lattice<valtype>::add_value): Add two new parameters
	val_p and unlimited.
	(self_recursively_generated_p): New function.
	(get_val_across_arith_op): Likewise.
	(propagate_vals_across_arith_jfunc): Add constant propagation for
	self-recursive function.
	(incorporate_penalties): Do not penalize pure self-recursive function.
	(good_cloning_opportunity_p): Dump node_is_self_scc flag.
	(propagate_constants_topo): Set node_is_self_scc flag for cgraph node.
	(get_info_about_necessary_edges): Relax hotness check for edge to
	self-recursive function.
	* ipa-prop.h (ipa_node_params): Add new field node_is_self_scc.
---

> The least important comment: Thanks for providing the ChangeLog but
> sending ChangeLog in the patch itself makes it difficult for people to
> apply the patch because the ChangeLog hunks will never apply cleanly.
> That's why people send them in the email body when they email
> gcc-patches.  Hopefully we'll be putting ChangeLogs only into the commit
> message soon and all of this will be a non-issue.
Ok.

>> +  if (val_pos_p)
>> +    {
>> +      for (val = values; val && val != *val_pos_p; val = val->next);

> Please put empty statements on a separate line (I think there is one
> more in the patch IIRC).
Done.

>> +  if (val_pos_p)
>> +    {
>> +      val->next = (*val_pos_p)->next;
>> +      (*val_pos_p)->next = val;
>> +      *val_pos_p = val;
>> +    }

> I must say I am not a big fan of the val_pos_p parameter.  Would simply
> putting new values always at the end of the list work to reduce the
> iterations?  At this point the function has traversed all the values
> already when searching if it is present, so it can remember the last
> one and just add stuff after it.
We need a parameter to record the address of the added value, which will be used when
constructing the next one.  Placing the new value at the end of the list is a good idea;
the meaning of val_pos_p thus becomes simpler, as it is now only an out parameter.
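A minimal sketch of that append-and-remember pattern (simplified stand-in types, not the
actual ipcp_value/ipcp_lattice code):

#include <stdlib.h>

struct value
{
  int data;
  struct value *next;
};

/* Append a new value to the singly-linked list rooted at *HEAD_P and record its
   address in *LAST_P, so the next insertion does not walk the list again.  */
static struct value *
append_value (struct value **head_p, struct value **last_p, int data)
{
  struct value *val = (struct value *) malloc (sizeof (struct value));
  val->data = data;
  val->next = NULL;

  if (*last_p)
    (*last_p)->next = val;   /* link after the remembered tail */
  else
    *head_p = val;           /* first element: becomes the list head */

  *last_p = val;             /* out parameter: tail position for the next append */
  return val;
}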

>> +      if (!src->val || cs->caller != cs->callee->function_symbol ()
>> +       || src->val == val)
>> +     return false;

> I'd suggest putting the condition calling function_symbol last.
The original condition order can avoid unnecessary evaluations of src->val == val, which
is false in most situations, while cs->caller != cs->callee->function_symbol () is true.

>> +      for (src_val = src_lat->values; src_val && src_val != val;
>> +        src_val = src_val->next);

> I think that GNU code style dictates that when a for does not fit on a
> single line, the three bits have to be on a separate line each.  Plus
> please put the empty statement on its own line too.
Done.

>> +static tree
>> +get_val_across_arith_op (enum tree_code opcode,
>> +                      tree opnd1_type,
>> +                      tree opnd2,
>> +                      ipcp_value<tree> *src_val,
>> +                      tree res_type)

> The function should get at least a brief comment.
Done.

>> +           for (ipcp_value_source<tree> *s = src_val->sources; s;
>> +                s = s->next)

> I'm afraid you'll have to reformat this for loop too.
Done.
>> +     if (self_recursively_generated_p (src_val))
>> +       continue;

> I think you are missing a necessary call to
> dest_lat->set_contains_variable() here.  Either call it or initialize
> cstval to zero and call get_val_across_arith_op only when
> self_recursively_generated_p returns false;
Yes, this is a bug. Fixed.
Feng Xue OS Nov. 27, 2019, 1:44 a.m. UTC | #14
Hi, Richard,

  This patch is not a bugfix, but it is small, and Martin and Honza are fine with it.
Now that we are in stage 3, is it OK to commit?

Thanks,
Feng
Jan Hubicka Nov. 27, 2019, 2:27 p.m. UTC | #15
> 2019-11-15  Feng Xue <fxue@os.amperecomputing.com>
> 
>         PR ipa/92133
>         * doc/invoke.texi (ipa-cp-max-recursive-depth): Document new option.
>         (ipa-cp-min-recursive-probability): Likewise.
>         * params.opt (ipa-cp-max-recursive-depth): New.
>         (ipa-cp-min-recursive-probability): Likewise.
>         * ipa-cp.c (ipcp_lattice<valtype>::add_value): Add two new parameters
>         val_p and unlimited.
>         (self_recursively_generated_p): New function.
>         (get_val_across_arith_op): Likewise.
>         (propagate_vals_across_arith_jfunc): Add constant propagation for
>         self-recursive function.
>         (incorporate_penalties): Do not penalize pure self-recursive function.
>         (good_cloning_opportunity_p): Dump node_is_self_scc flag.
>         (propagate_constants_topo): Set node_is_self_scc flag for cgraph node.
>         (get_info_about_necessary_edges): Relax hotness check for edge to
>         self-recursive function.
>         * ipa-prop.h (ipa_node_params): Add new field node_is_self_scc.

OK, thanks!
Do you have any plans for a better cost model for the recursive
cloning?  Also, it would be nice to have this info available in the recursive
inliner and give it a higher priority when inlining is going to turn a
previously recursive call into a non-recursive one.

Honza
Feng Xue OS Nov. 28, 2019, 2:51 a.m. UTC | #16
I don't have a complete idea for a new cost model yet, but there are some things we can try:
o.  adjust the estimated profile for recursive functions so that we can get a higher threshold.
o.  introduce a strength level for ipa-cp-clone; by default it is the same as the current
configuration, and with a larger number ipa-cp works more aggressively.
o.  integrate frequency information into the computation of prop_time_benefit.

Feng
Jeff Law Dec. 1, 2019, 8:33 p.m. UTC | #17
On 11/26/19 6:44 PM, Feng Xue OS wrote:
> Hi, Richard,
> 
>   This patch is a not bugfix, while it is small. Martin and Honza are fine with it.
> But now we are in stage 3, is it ok to commit?
Yes.  It was posted well in advance of the stage1->stage3 change.
Please commit it ASAP so the testers can pick it up.

Thanks,
jeff
Feng Xue OS Dec. 2, 2019, 7:07 a.m. UTC | #18
Done.   -Thanks

Feng
Patch

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 045072e02ec..6255a766e4d 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -229,7 +229,9 @@  public:
   inline bool set_contains_variable ();
   bool add_value (valtype newval, cgraph_edge *cs,
 		  ipcp_value<valtype> *src_val = NULL,
-		  int src_idx = 0, HOST_WIDE_INT offset = -1);
+		  int src_idx = 0, HOST_WIDE_INT offset = -1,
+		  ipcp_value<valtype> **val_pos_p = NULL,
+		  bool unlimited = false);
   void print (FILE * f, bool dump_sources, bool dump_benefits);
 };
 
@@ -1579,22 +1581,37 @@  allocate_and_init_ipcp_value (ipa_polymorphic_call_context source)
 /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  CS,
    SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
    meaning.  OFFSET -1 means the source is scalar and not a part of an
-   aggregate.  */
+   aggregate.  If non-NULL, VAL_POS_P specifies position in value list,
+   after which newly created ipcp_value will be inserted, and it is also
+   used to record address of the added ipcp_value before function returns.
+   UNLIMITED means whether value count should not exceed the limit given
+   by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
 
 template <typename valtype>
 bool
 ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 				  ipcp_value<valtype> *src_val,
-				  int src_idx, HOST_WIDE_INT offset)
+				  int src_idx, HOST_WIDE_INT offset,
+				  ipcp_value<valtype> **val_pos_p,
+				  bool unlimited)
 {
   ipcp_value<valtype> *val;
 
+  if (val_pos_p)
+    {
+      for (val = values; val && val != *val_pos_p; val = val->next);
+      gcc_checking_assert (val);
+    }
+
   if (bottom)
     return false;
 
   for (val = values; val; val = val->next)
     if (values_equal_for_ipcp_p (val->value, newval))
       {
+	if (val_pos_p)
+	  *val_pos_p = val;
+
 	if (ipa_edge_within_scc (cs))
 	  {
 	    ipcp_value_source<valtype> *s;
@@ -1609,7 +1626,8 @@  ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 	return false;
       }
 
-  if (values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
+  if (!unlimited
+      && values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
     {
       /* We can only free sources, not the values themselves, because sources
 	 of other values in this SCC might point to them.   */
@@ -1623,6 +1641,9 @@  ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 	    }
 	}
 
+      if (val_pos_p)
+	*val_pos_p = NULL;
+
       values = NULL;
       return set_to_bottom ();
     }
@@ -1630,8 +1651,54 @@  ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
   values_count++;
   val = allocate_and_init_ipcp_value (newval);
   val->add_source (cs, src_val, src_idx, offset);
-  val->next = values;
-  values = val;
+  if (val_pos_p)
+    {
+      val->next = (*val_pos_p)->next;
+      (*val_pos_p)->next = val;
+      *val_pos_p = val;
+    }
+  else
+    {
+      val->next = values;
+      values = val;
+    }
+
+  return true;
+}
+
+/* Return true if an ipcp_value VAL originated from a parameter value of a
+   self-feeding recursive function by applying a non-passthrough arithmetic
+   transformation.  */
+
+static bool
+self_recursively_generated_p (ipcp_value<tree> *val)
+{
+  class ipa_node_params *info = NULL;
+
+  for (ipcp_value_source<tree> *src = val->sources; src; src = src->next)
+    {
+      cgraph_edge *cs = src->cs;
+
+      if (!src->val || cs->caller != cs->callee->function_symbol ()
+	  || src->val == val)
+	return false;
+
+      if (!info)
+	info = IPA_NODE_REF (cs->caller);
+
+      class ipcp_param_lattices *plats = ipa_get_parm_lattices (info,
+								src->index);
+      ipcp_lattice<tree> *src_lat = src->offset == -1 ? &plats->itself
+						      : plats->aggs;
+      ipcp_value<tree> *src_val;
+
+      for (src_val = src_lat->values; src_val && src_val != val;
+	   src_val = src_val->next);
+
+      if (!src_val)
+	return false;
+    }
+
   return true;
 }
 
@@ -1649,20 +1716,72 @@  propagate_vals_across_pass_through (cgraph_edge *cs, ipa_jump_func *jfunc,
   ipcp_value<tree> *src_val;
   bool ret = false;
 
-  /* Do not create new values when propagating within an SCC because if there
-     are arithmetic functions with circular dependencies, there is infinite
-     number of them and we would just make lattices bottom.  If this condition
-     is ever relaxed we have to detect self-feeding recursive calls in
-     cgraph_edge_brings_value_p in a smarter way.  */
+  /* Due to circular dependencies, propagating within an SCC through an
+     arithmetic transformation would create an infinite number of values.
+     But for a self-feeding recursive function, we can allow propagation up
+     to a limited count, which enables a simple kind of recursive function
+     versioning.  For other scenarios, we just make the lattices bottom.  */
   if ((ipa_get_jf_pass_through_operation (jfunc) != NOP_EXPR)
       && ipa_edge_within_scc (cs))
-    ret = dest_lat->set_contains_variable ();
+    {
+      if (src_lat != dest_lat)
+	return dest_lat->set_contains_variable ();
+
+      auto_vec<ipcp_value<tree> *, 8> val_seeds;
+
+      for (src_val = src_lat->values; src_val; src_val = src_val->next)
+	{
+	  /* Now we do not use a self-recursively generated value as a
+	     propagation source.  This is absolutely conservative, but it
+	     avoids an explosion of the lattice's value space, especially
+	     when one recursive function calls another recursive one.  */
+	  if (self_recursively_generated_p (src_val))
+	    {
+	      /* If the lattice has already been propagated for the call site,
+		 no need to do that again.  */
+	      for (ipcp_value_source<tree> *s = src_val->sources; s;
+		   s = s->next)
+		if (s->cs == cs)
+		  return dest_lat->set_contains_variable ();
+	    }
+	  else
+	    val_seeds.safe_push (src_val);
+	}
+
+      int clone_copies = PARAM_VALUE (PARAM_IPA_CP_MAX_RECURSION_COPIES);
+      int i;
+
+      /* Recursively generate lattice values with a limited count.  */
+      FOR_EACH_VEC_ELT (val_seeds, i, src_val)
+	{
+	  for (int j = 1; j < clone_copies; j++)
+	    {
+	      tree cstval = ipa_get_jf_pass_through_result (jfunc,
+							    src_val->value,
+							    parm_type);
+	      if (!cstval)
+		break;
+
+	      /* Try to place the new lattice value after its source, which
+		 can reduce the iteration count of the propagation stage.  */
+	      ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
+					  -1, &src_val, true);
+	      gcc_checking_assert (src_val);
+	    }
+	}
+      ret |= dest_lat->set_contains_variable ();
+    }
   else
     for (src_val = src_lat->values; src_val; src_val = src_val->next)
       {
+	/* Now we do not use a self-recursively generated value as a
+	   propagation source; otherwise it would be easy to overflow the
+	   value space of a normal lattice.  */
+	if (self_recursively_generated_p (src_val))
+	  continue;
+
 	tree cstval = ipa_get_jf_pass_through_result (jfunc, src_val->value,
 						      parm_type);
-
 	if (cstval)
 	  ret |= dest_lat->add_value (cstval, cs, src_val, src_idx);
 	else
@@ -3635,6 +3754,9 @@  get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
 	      hot |= cs->maybe_hot_p ();
 	      if (cs->caller != dest)
 		non_self_recursive = true;
+	      else if (src->val)
+		gcc_assert (values_equal_for_ipcp_p (src->val->value,
+						     val->value));
 	    }
 	  cs = get_next_cgraph_edge_clone (cs);
 	}
diff --git a/gcc/params.def b/gcc/params.def
index 322c37f8b96..3e242e09f01 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1135,6 +1135,11 @@  DEFPARAM (PARAM_IPA_CP_RECURSION_PENALTY,
 	  "are evaluated for cloning.",
 	  40, 0, 100)
 
+DEFPARAM (PARAM_IPA_CP_MAX_RECURSION_COPIES,
+	  "ipa-cp-max-recursion-copies",
+	  "Maximum number of versioning copies for a self-recursive function.",
+	  8, 0, 0)
+
 DEFPARAM (PARAM_IPA_CP_SINGLE_CALL_PENALTY,
 	  "ipa-cp-single-call-penalty",
 	  "Percentage penalty functions containing a single call to another "
diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
new file mode 100644
index 00000000000..7c1c01047c6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
@@ -0,0 +1,47 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-ipa-cp-details -fno-early-inlining" } */
+
+int fn();
+
+int data[100];
+
+int recur_fn (int i)
+{
+  int j;
+   
+  if (i == 6)
+    {
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      return 10;
+    }
+
+  data[i] = i; 
+
+  for (j = 0; j < 100; j++)
+    recur_fn (i + 1);
+
+  return i; 
+}
+
+int main ()
+{
+  int i;
+
+  for (i = 0; i < 100; i++)
+    recur_fn (1) + recur_fn (-5);
+
+  return 1;
+}
+
+/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 12 "cp" } } */