diff mbox series

[v2,3/9] Introduce can_vector_compare_p function

Message ID 20190822134551.18924-4-iii@linux.ibm.com
State New
Headers show
Series S/390: Use signaling FP comparison instructions | expand

Commit Message

Ilya Leoshkevich Aug. 22, 2019, 1:45 p.m. UTC
z13 supports only non-signaling vector comparisons.  This means we
cannot vectorize LT, LE, GT, GE and LTGT when compiling for z13.
However, we cannot express this restriction today: the code only checks
whether vcond$a$b optab exists, which does not contain information about
the operation.

Introduce a function that checks whether back-end supports vector
comparisons with individual rtx codes by matching vcond expander's third
argument with a fake comparison with the corresponding rtx code.

gcc/ChangeLog:

2019-08-21  Ilya Leoshkevich  <iii@linux.ibm.com>

	* Makefile.in (GTFILES): Add optabs.c.
	* optabs-tree.c (expand_vec_cond_expr_p): Use
	can_vector_compare_p.
	* optabs.c (binop_key): Binary operation cache key.
	(binop_hasher): Binary operation cache hasher.
	(cached_binops): Binary operation cache.
	(get_cached_binop): New function that returns a cached binary
	operation or creates a new one.
	(can_vector_compare_p): New function.
	* optabs.h (enum can_vector_compare_purpose): New enum. Not
	really needed today, but can be used to extend the support to
	e.g. vec_cmp if need arises.
	(can_vector_compare_p): New function.
---
 gcc/Makefile.in   |  2 +-
 gcc/optabs-tree.c | 11 +++++--
 gcc/optabs.c      | 79 +++++++++++++++++++++++++++++++++++++++++++++++
 gcc/optabs.h      | 15 +++++++++
 4 files changed, 104 insertions(+), 3 deletions(-)

Comments

Richard Sandiford Aug. 23, 2019, 10:43 a.m. UTC | #1
Ilya Leoshkevich <iii@linux.ibm.com> writes:
> @@ -3819,6 +3820,82 @@ can_compare_p (enum rtx_code code, machine_mode mode,
>    return 0;
>  }
>  
> +/* can_vector_compare_p presents fake rtx binary operations to the the back-end
> +   in order to determine its capabilities.  In order to avoid creating fake
> +   operations on each call, values from previous calls are cached in a global
> +   cached_binops hash_table.  It contains rtxes, which can be looked up using
> +   binop_keys.  */
> +
> +struct binop_key {
> +  enum rtx_code code;        /* Operation code.  */
> +  machine_mode value_mode;   /* Result mode.     */
> +  machine_mode cmp_op_mode;  /* Operand mode.    */
> +};
> +
> +struct binop_hasher : pointer_hash_mark<rtx>, ggc_cache_remove<rtx> {
> +  typedef rtx value_type;
> +  typedef binop_key compare_type;
> +
> +  static hashval_t
> +  hash (enum rtx_code code, machine_mode value_mode, machine_mode cmp_op_mode)
> +  {
> +    inchash::hash hstate (0);
> +    hstate.add_int (code);
> +    hstate.add_int (value_mode);
> +    hstate.add_int (cmp_op_mode);
> +    return hstate.end ();
> +  }
> +
> +  static hashval_t
> +  hash (const rtx &ref)
> +  {
> +    return hash (GET_CODE (ref), GET_MODE (ref), GET_MODE (XEXP (ref, 0)));
> +  }
> +
> +  static bool
> +  equal (const rtx &ref1, const binop_key &ref2)
> +  {
> +    return (GET_CODE (ref1) == ref2.code)
> +	   && (GET_MODE (ref1) == ref2.value_mode)
> +	   && (GET_MODE (XEXP (ref1, 0)) == ref2.cmp_op_mode);
> +  }
> +};
> +
> +static GTY ((cache)) hash_table<binop_hasher> *cached_binops;
> +
> +static rtx
> +get_cached_binop (enum rtx_code code, machine_mode value_mode,
> +		  machine_mode cmp_op_mode)
> +{
> +  if (!cached_binops)
> +    cached_binops = hash_table<binop_hasher>::create_ggc (1024);
> +  binop_key key = { code, value_mode, cmp_op_mode };
> +  hashval_t hash = binop_hasher::hash (code, value_mode, cmp_op_mode);
> +  rtx *slot = cached_binops->find_slot_with_hash (key, hash, INSERT);
> +  if (!*slot)
> +    *slot = gen_rtx_fmt_ee (code, value_mode, gen_reg_rtx (cmp_op_mode),
> +			    gen_reg_rtx (cmp_op_mode));
> +  return *slot;
> +}

Sorry, I didn't mean anything this complicated.  I just meant that
we should have a single cached rtx that we can change via PUT_CODE and
PUT_MODE_RAW for each new query, rather than allocating a new rtx each
time.

Something like:

static GTY ((cache)) rtx cached_binop;

rtx
get_cached_binop (machine_mode mode, rtx_code code, machine_mode op_mode)
{
  if (cached_binop)
    {
      PUT_CODE (cached_binop, code);
      PUT_MODE_RAW (cached_binop, mode);
      PUT_MODE_RAW (XEXP (cached_binop, 0), op_mode);
      PUT_MODE_RAW (XEXP (cached_binop, 1), op_mode);
    }
  else
    {
      rtx reg1 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 1);
      rtx reg2 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 2);
      cached_binop = gen_rtx_fmt_ee (code, mode, reg1, reg2);
    }
  return cached_binop;
}

Thanks,
Richard
Richard Biener Aug. 23, 2019, 11:24 a.m. UTC | #2
On Fri, Aug 23, 2019 at 12:43 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Ilya Leoshkevich <iii@linux.ibm.com> writes:
> > @@ -3819,6 +3820,82 @@ can_compare_p (enum rtx_code code, machine_mode mode,
> >    return 0;
> >  }
> >
> > +/* can_vector_compare_p presents fake rtx binary operations to the the back-end
> > +   in order to determine its capabilities.  In order to avoid creating fake
> > +   operations on each call, values from previous calls are cached in a global
> > +   cached_binops hash_table.  It contains rtxes, which can be looked up using
> > +   binop_keys.  */
> > +
> > +struct binop_key {
> > +  enum rtx_code code;        /* Operation code.  */
> > +  machine_mode value_mode;   /* Result mode.     */
> > +  machine_mode cmp_op_mode;  /* Operand mode.    */
> > +};
> > +
> > +struct binop_hasher : pointer_hash_mark<rtx>, ggc_cache_remove<rtx> {
> > +  typedef rtx value_type;
> > +  typedef binop_key compare_type;
> > +
> > +  static hashval_t
> > +  hash (enum rtx_code code, machine_mode value_mode, machine_mode cmp_op_mode)
> > +  {
> > +    inchash::hash hstate (0);
> > +    hstate.add_int (code);
> > +    hstate.add_int (value_mode);
> > +    hstate.add_int (cmp_op_mode);
> > +    return hstate.end ();
> > +  }
> > +
> > +  static hashval_t
> > +  hash (const rtx &ref)
> > +  {
> > +    return hash (GET_CODE (ref), GET_MODE (ref), GET_MODE (XEXP (ref, 0)));
> > +  }
> > +
> > +  static bool
> > +  equal (const rtx &ref1, const binop_key &ref2)
> > +  {
> > +    return (GET_CODE (ref1) == ref2.code)
> > +        && (GET_MODE (ref1) == ref2.value_mode)
> > +        && (GET_MODE (XEXP (ref1, 0)) == ref2.cmp_op_mode);
> > +  }
> > +};
> > +
> > +static GTY ((cache)) hash_table<binop_hasher> *cached_binops;
> > +
> > +static rtx
> > +get_cached_binop (enum rtx_code code, machine_mode value_mode,
> > +               machine_mode cmp_op_mode)
> > +{
> > +  if (!cached_binops)
> > +    cached_binops = hash_table<binop_hasher>::create_ggc (1024);
> > +  binop_key key = { code, value_mode, cmp_op_mode };
> > +  hashval_t hash = binop_hasher::hash (code, value_mode, cmp_op_mode);
> > +  rtx *slot = cached_binops->find_slot_with_hash (key, hash, INSERT);
> > +  if (!*slot)
> > +    *slot = gen_rtx_fmt_ee (code, value_mode, gen_reg_rtx (cmp_op_mode),
> > +                         gen_reg_rtx (cmp_op_mode));
> > +  return *slot;
> > +}
>
> Sorry, I didn't mean anything this complicated.  I just meant that
> we should have a single cached rtx that we can change via PUT_CODE and
> PUT_MODE_RAW for each new query, rather than allocating a new rtx each
> time.
>
> Something like:
>
> static GTY ((cache)) rtx cached_binop;
>
> rtx
> get_cached_binop (machine_mode mode, rtx_code code, machine_mode op_mode)
> {
>   if (cached_binop)
>     {
>       PUT_CODE (cached_binop, code);
>       PUT_MODE_RAW (cached_binop, mode);
>       PUT_MODE_RAW (XEXP (cached_binop, 0), op_mode);
>       PUT_MODE_RAW (XEXP (cached_binop, 1), op_mode);
>     }
>   else
>     {
>       rtx reg1 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 1);
>       rtx reg2 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 2);
>       cached_binop = gen_rtx_fmt_ee (code, mode, reg1, reg2);
>     }
>   return cached_binop;
> }

Hmm, maybe we need  auto_rtx (code) that constructs such
RTX on the stack instead of wasting a GC root (and causing
issues for future threading of GCC ;)).

Richard.

>
> Thanks,
> Richard
Ilya Leoshkevich Aug. 23, 2019, 11:26 a.m. UTC | #3
> Am 23.08.2019 um 12:43 schrieb Richard Sandiford <richard.sandiford@arm.com>:
> 
> Ilya Leoshkevich <iii@linux.ibm.com> writes:
>> @@ -3819,6 +3820,82 @@ can_compare_p (enum rtx_code code, machine_mode mode,
>>   return 0;
>> }
>> 
>> +/* can_vector_compare_p presents fake rtx binary operations to the the back-end
>> +   in order to determine its capabilities.  In order to avoid creating fake
>> +   operations on each call, values from previous calls are cached in a global
>> +   cached_binops hash_table.  It contains rtxes, which can be looked up using
>> +   binop_keys.  */
>> +
>> +struct binop_key {
>> +  enum rtx_code code;        /* Operation code.  */
>> +  machine_mode value_mode;   /* Result mode.     */
>> +  machine_mode cmp_op_mode;  /* Operand mode.    */
>> +};
>> +
>> +struct binop_hasher : pointer_hash_mark<rtx>, ggc_cache_remove<rtx> {
>> +  typedef rtx value_type;
>> +  typedef binop_key compare_type;
>> +
>> +  static hashval_t
>> +  hash (enum rtx_code code, machine_mode value_mode, machine_mode cmp_op_mode)
>> +  {
>> +    inchash::hash hstate (0);
>> +    hstate.add_int (code);
>> +    hstate.add_int (value_mode);
>> +    hstate.add_int (cmp_op_mode);
>> +    return hstate.end ();
>> +  }
>> +
>> +  static hashval_t
>> +  hash (const rtx &ref)
>> +  {
>> +    return hash (GET_CODE (ref), GET_MODE (ref), GET_MODE (XEXP (ref, 0)));
>> +  }
>> +
>> +  static bool
>> +  equal (const rtx &ref1, const binop_key &ref2)
>> +  {
>> +    return (GET_CODE (ref1) == ref2.code)
>> +	   && (GET_MODE (ref1) == ref2.value_mode)
>> +	   && (GET_MODE (XEXP (ref1, 0)) == ref2.cmp_op_mode);
>> +  }
>> +};
>> +
>> +static GTY ((cache)) hash_table<binop_hasher> *cached_binops;
>> +
>> +static rtx
>> +get_cached_binop (enum rtx_code code, machine_mode value_mode,
>> +		  machine_mode cmp_op_mode)
>> +{
>> +  if (!cached_binops)
>> +    cached_binops = hash_table<binop_hasher>::create_ggc (1024);
>> +  binop_key key = { code, value_mode, cmp_op_mode };
>> +  hashval_t hash = binop_hasher::hash (code, value_mode, cmp_op_mode);
>> +  rtx *slot = cached_binops->find_slot_with_hash (key, hash, INSERT);
>> +  if (!*slot)
>> +    *slot = gen_rtx_fmt_ee (code, value_mode, gen_reg_rtx (cmp_op_mode),
>> +			    gen_reg_rtx (cmp_op_mode));
>> +  return *slot;
>> +}
> 
> Sorry, I didn't mean anything this complicated.  I just meant that
> we should have a single cached rtx that we can change via PUT_CODE and
> PUT_MODE_RAW for each new query, rather than allocating a new rtx each
> time.
> 
> Something like:
> 
> static GTY ((cache)) rtx cached_binop;
> 
> rtx
> get_cached_binop (machine_mode mode, rtx_code code, machine_mode op_mode)
> {
>  if (cached_binop)
>    {
>      PUT_CODE (cached_binop, code);
>      PUT_MODE_RAW (cached_binop, mode);
>      PUT_MODE_RAW (XEXP (cached_binop, 0), op_mode);
>      PUT_MODE_RAW (XEXP (cached_binop, 1), op_mode);
>    }
>  else
>    {
>      rtx reg1 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 1);
>      rtx reg2 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 2);
>      cached_binop = gen_rtx_fmt_ee (code, mode, reg1, reg2);
>    }
>  return cached_binop;
> }
> 
> Thanks,
> Richard

Oh, I must have completely missed the point: the cache is only for
storage, and stored values themselves don't really matter.

To make rtx usable with GTY ((cache)) I had to do:

--- a/gcc/rtl.h
+++ b/gcc/rtl.h
@@ -4427,4 +4427,9 @@ extern void gt_ggc_mx (rtx &);
 extern void gt_pch_nx (rtx &);
 extern void gt_pch_nx (rtx &, gt_pointer_operator, void *);

+inline void
+gt_cleare_cache (rtx)
+{
+}
+
 #endif /* ! GCC_RTL_H */

Does that look ok?

Another think that might turn out being important: in your first
suggestion you use

 if (insn_operand_predicate_fn pred = insn_data[icode].operand[3].predicate)
   {
     machine_mode cmp_mode = insn_data[icode].operand[3].mode;

instead of simply insn_operand_matches - is there any difference?
Ilya Leoshkevich Aug. 23, 2019, 11:35 a.m. UTC | #4
> Am 23.08.2019 um 13:24 schrieb Richard Biener <richard.guenther@gmail.com>:
> 
> On Fri, Aug 23, 2019 at 12:43 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>> 
>> Ilya Leoshkevich <iii@linux.ibm.com> writes:
>>> @@ -3819,6 +3820,82 @@ can_compare_p (enum rtx_code code, machine_mode mode,
>>>   return 0;
>>> }
>>> 
>>> +/* can_vector_compare_p presents fake rtx binary operations to the the back-end
>>> +   in order to determine its capabilities.  In order to avoid creating fake
>>> +   operations on each call, values from previous calls are cached in a global
>>> +   cached_binops hash_table.  It contains rtxes, which can be looked up using
>>> +   binop_keys.  */
>>> +
>>> +struct binop_key {
>>> +  enum rtx_code code;        /* Operation code.  */
>>> +  machine_mode value_mode;   /* Result mode.     */
>>> +  machine_mode cmp_op_mode;  /* Operand mode.    */
>>> +};
>>> +
>>> +struct binop_hasher : pointer_hash_mark<rtx>, ggc_cache_remove<rtx> {
>>> +  typedef rtx value_type;
>>> +  typedef binop_key compare_type;
>>> +
>>> +  static hashval_t
>>> +  hash (enum rtx_code code, machine_mode value_mode, machine_mode cmp_op_mode)
>>> +  {
>>> +    inchash::hash hstate (0);
>>> +    hstate.add_int (code);
>>> +    hstate.add_int (value_mode);
>>> +    hstate.add_int (cmp_op_mode);
>>> +    return hstate.end ();
>>> +  }
>>> +
>>> +  static hashval_t
>>> +  hash (const rtx &ref)
>>> +  {
>>> +    return hash (GET_CODE (ref), GET_MODE (ref), GET_MODE (XEXP (ref, 0)));
>>> +  }
>>> +
>>> +  static bool
>>> +  equal (const rtx &ref1, const binop_key &ref2)
>>> +  {
>>> +    return (GET_CODE (ref1) == ref2.code)
>>> +        && (GET_MODE (ref1) == ref2.value_mode)
>>> +        && (GET_MODE (XEXP (ref1, 0)) == ref2.cmp_op_mode);
>>> +  }
>>> +};
>>> +
>>> +static GTY ((cache)) hash_table<binop_hasher> *cached_binops;
>>> +
>>> +static rtx
>>> +get_cached_binop (enum rtx_code code, machine_mode value_mode,
>>> +               machine_mode cmp_op_mode)
>>> +{
>>> +  if (!cached_binops)
>>> +    cached_binops = hash_table<binop_hasher>::create_ggc (1024);
>>> +  binop_key key = { code, value_mode, cmp_op_mode };
>>> +  hashval_t hash = binop_hasher::hash (code, value_mode, cmp_op_mode);
>>> +  rtx *slot = cached_binops->find_slot_with_hash (key, hash, INSERT);
>>> +  if (!*slot)
>>> +    *slot = gen_rtx_fmt_ee (code, value_mode, gen_reg_rtx (cmp_op_mode),
>>> +                         gen_reg_rtx (cmp_op_mode));
>>> +  return *slot;
>>> +}
>> 
>> Sorry, I didn't mean anything this complicated.  I just meant that
>> we should have a single cached rtx that we can change via PUT_CODE and
>> PUT_MODE_RAW for each new query, rather than allocating a new rtx each
>> time.
>> 
>> Something like:
>> 
>> static GTY ((cache)) rtx cached_binop;
>> 
>> rtx
>> get_cached_binop (machine_mode mode, rtx_code code, machine_mode op_mode)
>> {
>>  if (cached_binop)
>>    {
>>      PUT_CODE (cached_binop, code);
>>      PUT_MODE_RAW (cached_binop, mode);
>>      PUT_MODE_RAW (XEXP (cached_binop, 0), op_mode);
>>      PUT_MODE_RAW (XEXP (cached_binop, 1), op_mode);
>>    }
>>  else
>>    {
>>      rtx reg1 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 1);
>>      rtx reg2 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 2);
>>      cached_binop = gen_rtx_fmt_ee (code, mode, reg1, reg2);
>>    }
>>  return cached_binop;
>> }
> 
> Hmm, maybe we need  auto_rtx (code) that constructs such
> RTX on the stack instead of wasting a GC root (and causing
> issues for future threading of GCC ;)).

Do you mean something like this?

union {
  char raw[rtx_code_size[code]];
  rtx rtx;
} binop;

Does this exist already (git grep auto.*rtx / rtx.*auto doesn't show
anything useful), or should I implement this?
Richard Sandiford Aug. 23, 2019, 1:47 p.m. UTC | #5
Ilya Leoshkevich <iii@linux.ibm.com> writes:
>> Am 23.08.2019 um 12:43 schrieb Richard Sandiford <richard.sandiford@arm.com>:
>> 
>> Ilya Leoshkevich <iii@linux.ibm.com> writes:
>>> @@ -3819,6 +3820,82 @@ can_compare_p (enum rtx_code code, machine_mode mode,
>>>   return 0;
>>> }
>>> 
>>> +/* can_vector_compare_p presents fake rtx binary operations to the the back-end
>>> +   in order to determine its capabilities.  In order to avoid creating fake
>>> +   operations on each call, values from previous calls are cached in a global
>>> +   cached_binops hash_table.  It contains rtxes, which can be looked up using
>>> +   binop_keys.  */
>>> +
>>> +struct binop_key {
>>> +  enum rtx_code code;        /* Operation code.  */
>>> +  machine_mode value_mode;   /* Result mode.     */
>>> +  machine_mode cmp_op_mode;  /* Operand mode.    */
>>> +};
>>> +
>>> +struct binop_hasher : pointer_hash_mark<rtx>, ggc_cache_remove<rtx> {
>>> +  typedef rtx value_type;
>>> +  typedef binop_key compare_type;
>>> +
>>> +  static hashval_t
>>> +  hash (enum rtx_code code, machine_mode value_mode, machine_mode cmp_op_mode)
>>> +  {
>>> +    inchash::hash hstate (0);
>>> +    hstate.add_int (code);
>>> +    hstate.add_int (value_mode);
>>> +    hstate.add_int (cmp_op_mode);
>>> +    return hstate.end ();
>>> +  }
>>> +
>>> +  static hashval_t
>>> +  hash (const rtx &ref)
>>> +  {
>>> +    return hash (GET_CODE (ref), GET_MODE (ref), GET_MODE (XEXP (ref, 0)));
>>> +  }
>>> +
>>> +  static bool
>>> +  equal (const rtx &ref1, const binop_key &ref2)
>>> +  {
>>> +    return (GET_CODE (ref1) == ref2.code)
>>> +	   && (GET_MODE (ref1) == ref2.value_mode)
>>> +	   && (GET_MODE (XEXP (ref1, 0)) == ref2.cmp_op_mode);
>>> +  }
>>> +};
>>> +
>>> +static GTY ((cache)) hash_table<binop_hasher> *cached_binops;
>>> +
>>> +static rtx
>>> +get_cached_binop (enum rtx_code code, machine_mode value_mode,
>>> +		  machine_mode cmp_op_mode)
>>> +{
>>> +  if (!cached_binops)
>>> +    cached_binops = hash_table<binop_hasher>::create_ggc (1024);
>>> +  binop_key key = { code, value_mode, cmp_op_mode };
>>> +  hashval_t hash = binop_hasher::hash (code, value_mode, cmp_op_mode);
>>> +  rtx *slot = cached_binops->find_slot_with_hash (key, hash, INSERT);
>>> +  if (!*slot)
>>> +    *slot = gen_rtx_fmt_ee (code, value_mode, gen_reg_rtx (cmp_op_mode),
>>> +			    gen_reg_rtx (cmp_op_mode));
>>> +  return *slot;
>>> +}
>> 
>> Sorry, I didn't mean anything this complicated.  I just meant that
>> we should have a single cached rtx that we can change via PUT_CODE and
>> PUT_MODE_RAW for each new query, rather than allocating a new rtx each
>> time.
>> 
>> Something like:
>> 
>> static GTY ((cache)) rtx cached_binop;
>> 
>> rtx
>> get_cached_binop (machine_mode mode, rtx_code code, machine_mode op_mode)
>> {
>>  if (cached_binop)
>>    {
>>      PUT_CODE (cached_binop, code);
>>      PUT_MODE_RAW (cached_binop, mode);
>>      PUT_MODE_RAW (XEXP (cached_binop, 0), op_mode);
>>      PUT_MODE_RAW (XEXP (cached_binop, 1), op_mode);
>>    }
>>  else
>>    {
>>      rtx reg1 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 1);
>>      rtx reg2 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 2);
>>      cached_binop = gen_rtx_fmt_ee (code, mode, reg1, reg2);
>>    }
>>  return cached_binop;
>> }
>> 
>> Thanks,
>> Richard
>
> Oh, I must have completely missed the point: the cache is only for
> storage, and stored values themselves don't really matter.
>
> To make rtx usable with GTY ((cache)) I had to do:
>
> --- a/gcc/rtl.h
> +++ b/gcc/rtl.h
> @@ -4427,4 +4427,9 @@ extern void gt_ggc_mx (rtx &);
>  extern void gt_pch_nx (rtx &);
>  extern void gt_pch_nx (rtx &, gt_pointer_operator, void *);
>
> +inline void
> +gt_cleare_cache (rtx)
> +{
> +}
> +
>  #endif /* ! GCC_RTL_H */
>
> Does that look ok?

Ah, turns out I was thinking of "deletable" rather than "cache", sorry.

> Another think that might turn out being important: in your first
> suggestion you use
>
>  if (insn_operand_predicate_fn pred = insn_data[icode].operand[3].predicate)
>    {
>      machine_mode cmp_mode = insn_data[icode].operand[3].mode;
>
> instead of simply insn_operand_matches - is there any difference?

I guess it was premature optimisation: if the .md file doesn't specify a
predicate, we don't even need to create the rtx.  But since most targets
probably do specify a predicate, using insn_matches is fine too.

Thanks,
Richard
Richard Biener Aug. 26, 2019, 8:49 a.m. UTC | #6
On Fri, Aug 23, 2019 at 1:35 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>
> > Am 23.08.2019 um 13:24 schrieb Richard Biener <richard.guenther@gmail.com>:
> >
> > On Fri, Aug 23, 2019 at 12:43 PM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Ilya Leoshkevich <iii@linux.ibm.com> writes:
> >>> @@ -3819,6 +3820,82 @@ can_compare_p (enum rtx_code code, machine_mode mode,
> >>>   return 0;
> >>> }
> >>>
> >>> +/* can_vector_compare_p presents fake rtx binary operations to the the back-end
> >>> +   in order to determine its capabilities.  In order to avoid creating fake
> >>> +   operations on each call, values from previous calls are cached in a global
> >>> +   cached_binops hash_table.  It contains rtxes, which can be looked up using
> >>> +   binop_keys.  */
> >>> +
> >>> +struct binop_key {
> >>> +  enum rtx_code code;        /* Operation code.  */
> >>> +  machine_mode value_mode;   /* Result mode.     */
> >>> +  machine_mode cmp_op_mode;  /* Operand mode.    */
> >>> +};
> >>> +
> >>> +struct binop_hasher : pointer_hash_mark<rtx>, ggc_cache_remove<rtx> {
> >>> +  typedef rtx value_type;
> >>> +  typedef binop_key compare_type;
> >>> +
> >>> +  static hashval_t
> >>> +  hash (enum rtx_code code, machine_mode value_mode, machine_mode cmp_op_mode)
> >>> +  {
> >>> +    inchash::hash hstate (0);
> >>> +    hstate.add_int (code);
> >>> +    hstate.add_int (value_mode);
> >>> +    hstate.add_int (cmp_op_mode);
> >>> +    return hstate.end ();
> >>> +  }
> >>> +
> >>> +  static hashval_t
> >>> +  hash (const rtx &ref)
> >>> +  {
> >>> +    return hash (GET_CODE (ref), GET_MODE (ref), GET_MODE (XEXP (ref, 0)));
> >>> +  }
> >>> +
> >>> +  static bool
> >>> +  equal (const rtx &ref1, const binop_key &ref2)
> >>> +  {
> >>> +    return (GET_CODE (ref1) == ref2.code)
> >>> +        && (GET_MODE (ref1) == ref2.value_mode)
> >>> +        && (GET_MODE (XEXP (ref1, 0)) == ref2.cmp_op_mode);
> >>> +  }
> >>> +};
> >>> +
> >>> +static GTY ((cache)) hash_table<binop_hasher> *cached_binops;
> >>> +
> >>> +static rtx
> >>> +get_cached_binop (enum rtx_code code, machine_mode value_mode,
> >>> +               machine_mode cmp_op_mode)
> >>> +{
> >>> +  if (!cached_binops)
> >>> +    cached_binops = hash_table<binop_hasher>::create_ggc (1024);
> >>> +  binop_key key = { code, value_mode, cmp_op_mode };
> >>> +  hashval_t hash = binop_hasher::hash (code, value_mode, cmp_op_mode);
> >>> +  rtx *slot = cached_binops->find_slot_with_hash (key, hash, INSERT);
> >>> +  if (!*slot)
> >>> +    *slot = gen_rtx_fmt_ee (code, value_mode, gen_reg_rtx (cmp_op_mode),
> >>> +                         gen_reg_rtx (cmp_op_mode));
> >>> +  return *slot;
> >>> +}
> >>
> >> Sorry, I didn't mean anything this complicated.  I just meant that
> >> we should have a single cached rtx that we can change via PUT_CODE and
> >> PUT_MODE_RAW for each new query, rather than allocating a new rtx each
> >> time.
> >>
> >> Something like:
> >>
> >> static GTY ((cache)) rtx cached_binop;
> >>
> >> rtx
> >> get_cached_binop (machine_mode mode, rtx_code code, machine_mode op_mode)
> >> {
> >>  if (cached_binop)
> >>    {
> >>      PUT_CODE (cached_binop, code);
> >>      PUT_MODE_RAW (cached_binop, mode);
> >>      PUT_MODE_RAW (XEXP (cached_binop, 0), op_mode);
> >>      PUT_MODE_RAW (XEXP (cached_binop, 1), op_mode);
> >>    }
> >>  else
> >>    {
> >>      rtx reg1 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 1);
> >>      rtx reg2 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 2);
> >>      cached_binop = gen_rtx_fmt_ee (code, mode, reg1, reg2);
> >>    }
> >>  return cached_binop;
> >> }
> >
> > Hmm, maybe we need  auto_rtx (code) that constructs such
> > RTX on the stack instead of wasting a GC root (and causing
> > issues for future threading of GCC ;)).
>
> Do you mean something like this?
>
> union {
>   char raw[rtx_code_size[code]];
>   rtx rtx;
> } binop;
>
> Does this exist already (git grep auto.*rtx / rtx.*auto doesn't show
> anything useful), or should I implement this?

It doesn't exist AFAIK, I thought about using alloca like

 rtx tem;
 rtx_alloca (tem, PLUS);

and due to using alloca rtx_alloca has to be a macro like

#define rtx_alloca(r, code) r = (rtx)alloca (RTX_CODE_SIZE(code));
memset (r, 0, RTX_HDR_SIZE); PUT_CODE (r, code);

maybe C++ can help making this prettier but of course
since we use alloca we have to avoid opening new scopes.

I guess templates like with auto_vec doesn't work unless
we can make RTX_CODE_SIZE constant-evaluated.

Richard.
Ilya Leoshkevich Aug. 26, 2019, 11:54 a.m. UTC | #7
> Am 26.08.2019 um 10:49 schrieb Richard Biener <richard.guenther@gmail.com>:
> 
> On Fri, Aug 23, 2019 at 1:35 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>> 
>>> Am 23.08.2019 um 13:24 schrieb Richard Biener <richard.guenther@gmail.com>:
>>> 
>>> On Fri, Aug 23, 2019 at 12:43 PM Richard Sandiford
>>> <richard.sandiford@arm.com> wrote:
>>>> 
>>>> Ilya Leoshkevich <iii@linux.ibm.com> writes:
>>>>> @@ -3819,6 +3820,82 @@ can_compare_p (enum rtx_code code, machine_mode mode,
>>>>>  return 0;
>>>>> }
>>>>> 
>>>>> +/* can_vector_compare_p presents fake rtx binary operations to the the back-end
>>>>> +   in order to determine its capabilities.  In order to avoid creating fake
>>>>> +   operations on each call, values from previous calls are cached in a global
>>>>> +   cached_binops hash_table.  It contains rtxes, which can be looked up using
>>>>> +   binop_keys.  */
>>>>> +
>>>>> +struct binop_key {
>>>>> +  enum rtx_code code;        /* Operation code.  */
>>>>> +  machine_mode value_mode;   /* Result mode.     */
>>>>> +  machine_mode cmp_op_mode;  /* Operand mode.    */
>>>>> +};
>>>>> +
>>>>> +struct binop_hasher : pointer_hash_mark<rtx>, ggc_cache_remove<rtx> {
>>>>> +  typedef rtx value_type;
>>>>> +  typedef binop_key compare_type;
>>>>> +
>>>>> +  static hashval_t
>>>>> +  hash (enum rtx_code code, machine_mode value_mode, machine_mode cmp_op_mode)
>>>>> +  {
>>>>> +    inchash::hash hstate (0);
>>>>> +    hstate.add_int (code);
>>>>> +    hstate.add_int (value_mode);
>>>>> +    hstate.add_int (cmp_op_mode);
>>>>> +    return hstate.end ();
>>>>> +  }
>>>>> +
>>>>> +  static hashval_t
>>>>> +  hash (const rtx &ref)
>>>>> +  {
>>>>> +    return hash (GET_CODE (ref), GET_MODE (ref), GET_MODE (XEXP (ref, 0)));
>>>>> +  }
>>>>> +
>>>>> +  static bool
>>>>> +  equal (const rtx &ref1, const binop_key &ref2)
>>>>> +  {
>>>>> +    return (GET_CODE (ref1) == ref2.code)
>>>>> +        && (GET_MODE (ref1) == ref2.value_mode)
>>>>> +        && (GET_MODE (XEXP (ref1, 0)) == ref2.cmp_op_mode);
>>>>> +  }
>>>>> +};
>>>>> +
>>>>> +static GTY ((cache)) hash_table<binop_hasher> *cached_binops;
>>>>> +
>>>>> +static rtx
>>>>> +get_cached_binop (enum rtx_code code, machine_mode value_mode,
>>>>> +               machine_mode cmp_op_mode)
>>>>> +{
>>>>> +  if (!cached_binops)
>>>>> +    cached_binops = hash_table<binop_hasher>::create_ggc (1024);
>>>>> +  binop_key key = { code, value_mode, cmp_op_mode };
>>>>> +  hashval_t hash = binop_hasher::hash (code, value_mode, cmp_op_mode);
>>>>> +  rtx *slot = cached_binops->find_slot_with_hash (key, hash, INSERT);
>>>>> +  if (!*slot)
>>>>> +    *slot = gen_rtx_fmt_ee (code, value_mode, gen_reg_rtx (cmp_op_mode),
>>>>> +                         gen_reg_rtx (cmp_op_mode));
>>>>> +  return *slot;
>>>>> +}
>>>> 
>>>> Sorry, I didn't mean anything this complicated.  I just meant that
>>>> we should have a single cached rtx that we can change via PUT_CODE and
>>>> PUT_MODE_RAW for each new query, rather than allocating a new rtx each
>>>> time.
>>>> 
>>>> Something like:
>>>> 
>>>> static GTY ((cache)) rtx cached_binop;
>>>> 
>>>> rtx
>>>> get_cached_binop (machine_mode mode, rtx_code code, machine_mode op_mode)
>>>> {
>>>> if (cached_binop)
>>>>   {
>>>>     PUT_CODE (cached_binop, code);
>>>>     PUT_MODE_RAW (cached_binop, mode);
>>>>     PUT_MODE_RAW (XEXP (cached_binop, 0), op_mode);
>>>>     PUT_MODE_RAW (XEXP (cached_binop, 1), op_mode);
>>>>   }
>>>> else
>>>>   {
>>>>     rtx reg1 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 1);
>>>>     rtx reg2 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 2);
>>>>     cached_binop = gen_rtx_fmt_ee (code, mode, reg1, reg2);
>>>>   }
>>>> return cached_binop;
>>>> }
>>> 
>>> Hmm, maybe we need  auto_rtx (code) that constructs such
>>> RTX on the stack instead of wasting a GC root (and causing
>>> issues for future threading of GCC ;)).
>> 
>> Do you mean something like this?
>> 
>> union {
>>  char raw[rtx_code_size[code]];
>>  rtx rtx;
>> } binop;
>> 
>> Does this exist already (git grep auto.*rtx / rtx.*auto doesn't show
>> anything useful), or should I implement this?
> 
> It doesn't exist AFAIK, I thought about using alloca like
> 
> rtx tem;
> rtx_alloca (tem, PLUS);
> 
> and due to using alloca rtx_alloca has to be a macro like
> 
> #define rtx_alloca(r, code) r = (rtx)alloca (RTX_CODE_SIZE(code));
> memset (r, 0, RTX_HDR_SIZE); PUT_CODE (r, code);
> 
> maybe C++ can help making this prettier but of course
> since we use alloca we have to avoid opening new scopes.
> 
> I guess templates like with auto_vec doesn't work unless
> we can make RTX_CODE_SIZE constant-evaluated.
> 
> Richard.

I ended up with the following change:

diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
index a667cdab94e..97aa2144e95 100644
--- a/gcc/emit-rtl.c
+++ b/gcc/emit-rtl.c
@@ -466,17 +466,25 @@ set_mode_and_regno (rtx x, machine_mode mode, unsigned int regno)
   set_regno_raw (x, regno, nregs);
 }
 
-/* Generate a new REG rtx.  Make sure ORIGINAL_REGNO is set properly, and
+/* Initialize a REG rtx.  Make sure ORIGINAL_REGNO is set properly, and
    don't attempt to share with the various global pieces of rtl (such as
    frame_pointer_rtx).  */
 
-rtx
-gen_raw_REG (machine_mode mode, unsigned int regno)
+void
+init_raw_REG (rtx x, machine_mode mode, unsigned int regno)
 {
-  rtx x = rtx_alloc (REG MEM_STAT_INFO);
   set_mode_and_regno (x, mode, regno);
   REG_ATTRS (x) = NULL;
   ORIGINAL_REGNO (x) = regno;
+}
+
+/* Generate a new REG rtx.  */
+
+rtx
+gen_raw_REG (machine_mode mode, unsigned int regno)
+{
+  rtx x = rtx_alloc (REG MEM_STAT_INFO);
+  init_raw_REG (x, mode, regno);
   return x;
 }
 
diff --git a/gcc/gengenrtl.c b/gcc/gengenrtl.c
index 5c78fabfb50..bb2087da258 100644
--- a/gcc/gengenrtl.c
+++ b/gcc/gengenrtl.c
@@ -231,8 +231,7 @@ genmacro (int idx)
   puts (")");
 }
 
-/* Generate the code for the function to generate RTL whose
-   format is FORMAT.  */
+/* Generate the code for functions to generate RTL whose format is FORMAT.  */
 
 static void
 gendef (const char *format)
@@ -240,22 +239,18 @@ gendef (const char *format)
   const char *p;
   int i, j;
 
-  /* Start by writing the definition of the function name and the types
+  /* Write the definition of the init function name and the types
      of the arguments.  */
 
-  printf ("static inline rtx\ngen_rtx_fmt_%s_stat (RTX_CODE code, machine_mode mode", format);
+  puts ("static inline void");
+  printf ("init_rtx_fmt_%s (rtx rt, machine_mode mode", format);
   for (p = format, i = 0; *p != 0; p++)
     if (*p != '0')
       printf (",\n\t%sarg%d", type_from_format (*p), i++);
+  puts (")");
 
-  puts (" MEM_STAT_DECL)");
-
-  /* Now write out the body of the function itself, which allocates
-     the memory and initializes it.  */
+  /* Now write out the body of the init function itself.  */
   puts ("{");
-  puts ("  rtx rt;");
-  puts ("  rt = rtx_alloc (code PASS_MEM_STAT);\n");
-
   puts ("  PUT_MODE_RAW (rt, mode);");
 
   for (p = format, i = j = 0; *p ; ++p, ++i)
@@ -266,16 +261,56 @@ gendef (const char *format)
     else
       printf ("  %s (rt, %d) = arg%d;\n", accessor_from_format (*p), i, j++);
 
-  puts ("\n  return rt;\n}\n");
+  puts ("}\n");
+
+  /* Write the definition of the gen function name and the types
+     of the arguments.  */
+
+  puts ("static inline rtx");
+  printf ("gen_rtx_fmt_%s_stat (RTX_CODE code, machine_mode mode", format);
+  for (p = format, i = 0; *p != 0; p++)
+    if (*p != '0')
+      printf (",\n\t%sarg%d", type_from_format (*p), i++);
+  puts (" MEM_STAT_DECL)");
+
+  /* Now write out the body of the function itself, which allocates
+     the memory and initializes it.  */
+  puts ("{");
+  puts ("  rtx rt;\n");
+
+  puts ("  rt = rtx_alloc (code PASS_MEM_STAT);");
+  printf ("  init_rtx_fmt_%s (rt, mode", format);
+  for (p = format, i = 0; *p != 0; p++)
+    if (*p != '0')
+      printf (", arg%d", i++);
+  puts (");\n");
+
+  puts ("  return rt;\n}\n");
+
+  /* Write the definition of gen macro.  */
+
   printf ("#define gen_rtx_fmt_%s(c, m", format);
   for (p = format, i = 0; *p != 0; p++)
     if (*p != '0')
-      printf (", p%i",i++);
-  printf (")\\\n        gen_rtx_fmt_%s_stat (c, m", format);
+      printf (", arg%d", i++);
+  printf (") \\\n  gen_rtx_fmt_%s_stat ((c), (m)", format);
   for (p = format, i = 0; *p != 0; p++)
     if (*p != '0')
-      printf (", p%i",i++);
+      printf (", (arg%d)", i++);
   printf (" MEM_STAT_INFO)\n\n");
+
+  /* Write the definition of alloca macro.  */
+
+  printf ("#define alloca_rtx_fmt_%s(rt, c, m", format);
+  for (p = format, i = 0; *p != 0; p++)
+    if (*p != '0')
+      printf (", arg%d", i++);
+  printf (") \\\n  rtx_alloca ((rt), (c)); \\\n");
+  printf ("  init_rtx_fmt_%s ((rt), (m)", format);
+  for (p = format, i = 0; *p != 0; p++)
+    if (*p != '0')
+      printf (", (arg%d)", i++);
+  printf (")\n\n");
 }
 
 /* Generate the documentation header for files we write.  */
diff --git a/gcc/rtl.h b/gcc/rtl.h
index efb9b3ce40d..44733d8a39e 100644
--- a/gcc/rtl.h
+++ b/gcc/rtl.h
@@ -2933,6 +2933,10 @@ extern HOST_WIDE_INT get_stack_check_protect (void);
 
 /* In rtl.c */
 extern rtx rtx_alloc (RTX_CODE CXX_MEM_STAT_INFO);
+#define rtx_alloca(rt, code) \
+  (rt) = (rtx) alloca (RTX_CODE_SIZE ((code))); \
+  memset ((rt), 0, RTX_HDR_SIZE); \
+  PUT_CODE ((rt), (code));
 extern rtx rtx_alloc_stat_v (RTX_CODE MEM_STAT_DECL, int);
 #define rtx_alloc_v(c, SZ) rtx_alloc_stat_v (c MEM_STAT_INFO, SZ)
 #define const_wide_int_alloc(NWORDS)				\
@@ -3797,7 +3801,11 @@ gen_rtx_INSN (machine_mode mode, rtx_insn *prev_insn, rtx_insn *next_insn,
 extern rtx gen_rtx_CONST_INT (machine_mode, HOST_WIDE_INT);
 extern rtx gen_rtx_CONST_VECTOR (machine_mode, rtvec);
 extern void set_mode_and_regno (rtx, machine_mode, unsigned int);
+extern void init_raw_REG (rtx, machine_mode, unsigned int);
 extern rtx gen_raw_REG (machine_mode, unsigned int);
+#define alloca_raw_REG(rt, mode, regno) \
+  rtx_alloca ((rt), REG); \
+  init_raw_REG ((rt), (mode), (regno))
 extern rtx gen_rtx_REG (machine_mode, unsigned int);
 extern rtx gen_rtx_SUBREG (machine_mode, rtx, poly_uint64);
 extern rtx gen_rtx_MEM (machine_mode, rtx);

which now allows me to write:

rtx reg1, reg2, test;
alloca_raw_REG (reg1, cmp_op_mode, LAST_VIRTUAL_REGISTER + 1);
alloca_raw_REG (reg2, cmp_op_mode, LAST_VIRTUAL_REGISTER + 2);
alloca_rtx_fmt_ee (test, code, value_mode, reg1, reg2);

If that looks ok, I'll resend the series.
Richard Biener Aug. 26, 2019, 1:06 p.m. UTC | #8
On Mon, Aug 26, 2019 at 1:54 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>
> > Am 26.08.2019 um 10:49 schrieb Richard Biener <richard.guenther@gmail.com>:
> >
> > On Fri, Aug 23, 2019 at 1:35 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
> >>
> >>> Am 23.08.2019 um 13:24 schrieb Richard Biener <richard.guenther@gmail.com>:
> >>>
> >>> On Fri, Aug 23, 2019 at 12:43 PM Richard Sandiford
> >>> <richard.sandiford@arm.com> wrote:
> >>>>
> >>>> Ilya Leoshkevich <iii@linux.ibm.com> writes:
> >>>>> @@ -3819,6 +3820,82 @@ can_compare_p (enum rtx_code code, machine_mode mode,
> >>>>>  return 0;
> >>>>> }
> >>>>>
> >>>>> +/* can_vector_compare_p presents fake rtx binary operations to the the back-end
> >>>>> +   in order to determine its capabilities.  In order to avoid creating fake
> >>>>> +   operations on each call, values from previous calls are cached in a global
> >>>>> +   cached_binops hash_table.  It contains rtxes, which can be looked up using
> >>>>> +   binop_keys.  */
> >>>>> +
> >>>>> +struct binop_key {
> >>>>> +  enum rtx_code code;        /* Operation code.  */
> >>>>> +  machine_mode value_mode;   /* Result mode.     */
> >>>>> +  machine_mode cmp_op_mode;  /* Operand mode.    */
> >>>>> +};
> >>>>> +
> >>>>> +struct binop_hasher : pointer_hash_mark<rtx>, ggc_cache_remove<rtx> {
> >>>>> +  typedef rtx value_type;
> >>>>> +  typedef binop_key compare_type;
> >>>>> +
> >>>>> +  static hashval_t
> >>>>> +  hash (enum rtx_code code, machine_mode value_mode, machine_mode cmp_op_mode)
> >>>>> +  {
> >>>>> +    inchash::hash hstate (0);
> >>>>> +    hstate.add_int (code);
> >>>>> +    hstate.add_int (value_mode);
> >>>>> +    hstate.add_int (cmp_op_mode);
> >>>>> +    return hstate.end ();
> >>>>> +  }
> >>>>> +
> >>>>> +  static hashval_t
> >>>>> +  hash (const rtx &ref)
> >>>>> +  {
> >>>>> +    return hash (GET_CODE (ref), GET_MODE (ref), GET_MODE (XEXP (ref, 0)));
> >>>>> +  }
> >>>>> +
> >>>>> +  static bool
> >>>>> +  equal (const rtx &ref1, const binop_key &ref2)
> >>>>> +  {
> >>>>> +    return (GET_CODE (ref1) == ref2.code)
> >>>>> +        && (GET_MODE (ref1) == ref2.value_mode)
> >>>>> +        && (GET_MODE (XEXP (ref1, 0)) == ref2.cmp_op_mode);
> >>>>> +  }
> >>>>> +};
> >>>>> +
> >>>>> +static GTY ((cache)) hash_table<binop_hasher> *cached_binops;
> >>>>> +
> >>>>> +static rtx
> >>>>> +get_cached_binop (enum rtx_code code, machine_mode value_mode,
> >>>>> +               machine_mode cmp_op_mode)
> >>>>> +{
> >>>>> +  if (!cached_binops)
> >>>>> +    cached_binops = hash_table<binop_hasher>::create_ggc (1024);
> >>>>> +  binop_key key = { code, value_mode, cmp_op_mode };
> >>>>> +  hashval_t hash = binop_hasher::hash (code, value_mode, cmp_op_mode);
> >>>>> +  rtx *slot = cached_binops->find_slot_with_hash (key, hash, INSERT);
> >>>>> +  if (!*slot)
> >>>>> +    *slot = gen_rtx_fmt_ee (code, value_mode, gen_reg_rtx (cmp_op_mode),
> >>>>> +                         gen_reg_rtx (cmp_op_mode));
> >>>>> +  return *slot;
> >>>>> +}
> >>>>
> >>>> Sorry, I didn't mean anything this complicated.  I just meant that
> >>>> we should have a single cached rtx that we can change via PUT_CODE and
> >>>> PUT_MODE_RAW for each new query, rather than allocating a new rtx each
> >>>> time.
> >>>>
> >>>> Something like:
> >>>>
> >>>> static GTY ((cache)) rtx cached_binop;
> >>>>
> >>>> rtx
> >>>> get_cached_binop (machine_mode mode, rtx_code code, machine_mode op_mode)
> >>>> {
> >>>> if (cached_binop)
> >>>>   {
> >>>>     PUT_CODE (cached_binop, code);
> >>>>     PUT_MODE_RAW (cached_binop, mode);
> >>>>     PUT_MODE_RAW (XEXP (cached_binop, 0), op_mode);
> >>>>     PUT_MODE_RAW (XEXP (cached_binop, 1), op_mode);
> >>>>   }
> >>>> else
> >>>>   {
> >>>>     rtx reg1 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 1);
> >>>>     rtx reg2 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 2);
> >>>>     cached_binop = gen_rtx_fmt_ee (code, mode, reg1, reg2);
> >>>>   }
> >>>> return cached_binop;
> >>>> }
> >>>
> >>> Hmm, maybe we need  auto_rtx (code) that constructs such
> >>> RTX on the stack instead of wasting a GC root (and causing
> >>> issues for future threading of GCC ;)).
> >>
> >> Do you mean something like this?
> >>
> >> union {
> >>  char raw[rtx_code_size[code]];
> >>  rtx rtx;
> >> } binop;
> >>
> >> Does this exist already (git grep auto.*rtx / rtx.*auto doesn't show
> >> anything useful), or should I implement this?
> >
> > It doesn't exist AFAIK, I thought about using alloca like
> >
> > rtx tem;
> > rtx_alloca (tem, PLUS);
> >
> > and due to using alloca rtx_alloca has to be a macro like
> >
> > #define rtx_alloca(r, code) r = (rtx)alloca (RTX_CODE_SIZE(code));
> > memset (r, 0, RTX_HDR_SIZE); PUT_CODE (r, code);
> >
> > maybe C++ can help making this prettier but of course
> > since we use alloca we have to avoid opening new scopes.
> >
> > I guess templates like with auto_vec doesn't work unless
> > we can make RTX_CODE_SIZE constant-evaluated.
> >
> > Richard.
>
> I ended up with the following change:
>
> diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
> index a667cdab94e..97aa2144e95 100644
> --- a/gcc/emit-rtl.c
> +++ b/gcc/emit-rtl.c
> @@ -466,17 +466,25 @@ set_mode_and_regno (rtx x, machine_mode mode, unsigned int regno)
>    set_regno_raw (x, regno, nregs);
>  }
>
> -/* Generate a new REG rtx.  Make sure ORIGINAL_REGNO is set properly, and
> +/* Initialize a REG rtx.  Make sure ORIGINAL_REGNO is set properly, and
>     don't attempt to share with the various global pieces of rtl (such as
>     frame_pointer_rtx).  */
>
> -rtx
> -gen_raw_REG (machine_mode mode, unsigned int regno)
> +void
> +init_raw_REG (rtx x, machine_mode mode, unsigned int regno)
>  {
> -  rtx x = rtx_alloc (REG MEM_STAT_INFO);
>    set_mode_and_regno (x, mode, regno);
>    REG_ATTRS (x) = NULL;
>    ORIGINAL_REGNO (x) = regno;
> +}
> +
> +/* Generate a new REG rtx.  */
> +
> +rtx
> +gen_raw_REG (machine_mode mode, unsigned int regno)
> +{
> +  rtx x = rtx_alloc (REG MEM_STAT_INFO);
> +  init_raw_REG (x, mode, regno);
>    return x;
>  }
>
> diff --git a/gcc/gengenrtl.c b/gcc/gengenrtl.c
> index 5c78fabfb50..bb2087da258 100644
> --- a/gcc/gengenrtl.c
> +++ b/gcc/gengenrtl.c
> @@ -231,8 +231,7 @@ genmacro (int idx)
>    puts (")");
>  }
>
> -/* Generate the code for the function to generate RTL whose
> -   format is FORMAT.  */
> +/* Generate the code for functions to generate RTL whose format is FORMAT.  */
>
>  static void
>  gendef (const char *format)
> @@ -240,22 +239,18 @@ gendef (const char *format)
>    const char *p;
>    int i, j;
>
> -  /* Start by writing the definition of the function name and the types
> +  /* Write the definition of the init function name and the types
>       of the arguments.  */
>
> -  printf ("static inline rtx\ngen_rtx_fmt_%s_stat (RTX_CODE code, machine_mode mode", format);
> +  puts ("static inline void");
> +  printf ("init_rtx_fmt_%s (rtx rt, machine_mode mode", format);
>    for (p = format, i = 0; *p != 0; p++)
>      if (*p != '0')
>        printf (",\n\t%sarg%d", type_from_format (*p), i++);
> +  puts (")");
>
> -  puts (" MEM_STAT_DECL)");
> -
> -  /* Now write out the body of the function itself, which allocates
> -     the memory and initializes it.  */
> +  /* Now write out the body of the init function itself.  */
>    puts ("{");
> -  puts ("  rtx rt;");
> -  puts ("  rt = rtx_alloc (code PASS_MEM_STAT);\n");
> -
>    puts ("  PUT_MODE_RAW (rt, mode);");
>
>    for (p = format, i = j = 0; *p ; ++p, ++i)
> @@ -266,16 +261,56 @@ gendef (const char *format)
>      else
>        printf ("  %s (rt, %d) = arg%d;\n", accessor_from_format (*p), i, j++);
>
> -  puts ("\n  return rt;\n}\n");
> +  puts ("}\n");
> +
> +  /* Write the definition of the gen function name and the types
> +     of the arguments.  */
> +
> +  puts ("static inline rtx");
> +  printf ("gen_rtx_fmt_%s_stat (RTX_CODE code, machine_mode mode", format);
> +  for (p = format, i = 0; *p != 0; p++)
> +    if (*p != '0')
> +      printf (",\n\t%sarg%d", type_from_format (*p), i++);
> +  puts (" MEM_STAT_DECL)");
> +
> +  /* Now write out the body of the function itself, which allocates
> +     the memory and initializes it.  */
> +  puts ("{");
> +  puts ("  rtx rt;\n");
> +
> +  puts ("  rt = rtx_alloc (code PASS_MEM_STAT);");
> +  printf ("  init_rtx_fmt_%s (rt, mode", format);
> +  for (p = format, i = 0; *p != 0; p++)
> +    if (*p != '0')
> +      printf (", arg%d", i++);
> +  puts (");\n");
> +
> +  puts ("  return rt;\n}\n");
> +
> +  /* Write the definition of gen macro.  */
> +
>    printf ("#define gen_rtx_fmt_%s(c, m", format);
>    for (p = format, i = 0; *p != 0; p++)
>      if (*p != '0')
> -      printf (", p%i",i++);
> -  printf (")\\\n        gen_rtx_fmt_%s_stat (c, m", format);
> +      printf (", arg%d", i++);
> +  printf (") \\\n  gen_rtx_fmt_%s_stat ((c), (m)", format);
>    for (p = format, i = 0; *p != 0; p++)
>      if (*p != '0')
> -      printf (", p%i",i++);
> +      printf (", (arg%d)", i++);
>    printf (" MEM_STAT_INFO)\n\n");
> +
> +  /* Write the definition of alloca macro.  */
> +
> +  printf ("#define alloca_rtx_fmt_%s(rt, c, m", format);
> +  for (p = format, i = 0; *p != 0; p++)
> +    if (*p != '0')
> +      printf (", arg%d", i++);
> +  printf (") \\\n  rtx_alloca ((rt), (c)); \\\n");
> +  printf ("  init_rtx_fmt_%s ((rt), (m)", format);
> +  for (p = format, i = 0; *p != 0; p++)
> +    if (*p != '0')
> +      printf (", (arg%d)", i++);
> +  printf (")\n\n");
>  }
>
>  /* Generate the documentation header for files we write.  */
> diff --git a/gcc/rtl.h b/gcc/rtl.h
> index efb9b3ce40d..44733d8a39e 100644
> --- a/gcc/rtl.h
> +++ b/gcc/rtl.h
> @@ -2933,6 +2933,10 @@ extern HOST_WIDE_INT get_stack_check_protect (void);
>
>  /* In rtl.c */
>  extern rtx rtx_alloc (RTX_CODE CXX_MEM_STAT_INFO);
> +#define rtx_alloca(rt, code) \
> +  (rt) = (rtx) alloca (RTX_CODE_SIZE ((code))); \
> +  memset ((rt), 0, RTX_HDR_SIZE); \
> +  PUT_CODE ((rt), (code));
>  extern rtx rtx_alloc_stat_v (RTX_CODE MEM_STAT_DECL, int);
>  #define rtx_alloc_v(c, SZ) rtx_alloc_stat_v (c MEM_STAT_INFO, SZ)
>  #define const_wide_int_alloc(NWORDS)                           \
> @@ -3797,7 +3801,11 @@ gen_rtx_INSN (machine_mode mode, rtx_insn *prev_insn, rtx_insn *next_insn,
>  extern rtx gen_rtx_CONST_INT (machine_mode, HOST_WIDE_INT);
>  extern rtx gen_rtx_CONST_VECTOR (machine_mode, rtvec);
>  extern void set_mode_and_regno (rtx, machine_mode, unsigned int);
> +extern void init_raw_REG (rtx, machine_mode, unsigned int);
>  extern rtx gen_raw_REG (machine_mode, unsigned int);
> +#define alloca_raw_REG(rt, mode, regno) \
> +  rtx_alloca ((rt), REG); \
> +  init_raw_REG ((rt), (mode), (regno))
>  extern rtx gen_rtx_REG (machine_mode, unsigned int);
>  extern rtx gen_rtx_SUBREG (machine_mode, rtx, poly_uint64);
>  extern rtx gen_rtx_MEM (machine_mode, rtx);
>
> which now allows me to write:
>
> rtx reg1, reg2, test;
> alloca_raw_REG (reg1, cmp_op_mode, LAST_VIRTUAL_REGISTER + 1);
> alloca_raw_REG (reg2, cmp_op_mode, LAST_VIRTUAL_REGISTER + 2);
> alloca_rtx_fmt_ee (test, code, value_mode, reg1, reg2);
>
> If that looks ok, I'll resend the series.

that looks OK to me - please leave Richard S. time to comment.  Also while
I'd like to see

rtx reg1 = alloca_raw_REG (cmp_op_mode, LAST_VIRTUAL_REGISTER + 1);

I don't really see a way to write that portably (or at all), do you all agree?
GCC doesn't seem to convert alloca() calls to __builtin_stack_save/restore
nor place CLOBBERs to end their lifetime.  But is it guaranteed that the
alloca result is valid until frame termination?

Thanks,
Richard.
Ilya Leoshkevich Aug. 26, 2019, 1:17 p.m. UTC | #9
> Am 26.08.2019 um 15:06 schrieb Richard Biener <richard.guenther@gmail.com>:
> 
> On Mon, Aug 26, 2019 at 1:54 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>> 
>>> Am 26.08.2019 um 10:49 schrieb Richard Biener <richard.guenther@gmail.com>:
>>> 
>>> On Fri, Aug 23, 2019 at 1:35 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>>>> 
>>>>> Am 23.08.2019 um 13:24 schrieb Richard Biener <richard.guenther@gmail.com>:
>>>>> 
>>>>> On Fri, Aug 23, 2019 at 12:43 PM Richard Sandiford
>>>>> <richard.sandiford@arm.com> wrote:
>>>>>> 
>>>>>> Ilya Leoshkevich <iii@linux.ibm.com> writes:
>>>>>>> @@ -3819,6 +3820,82 @@ can_compare_p (enum rtx_code code, machine_mode mode,
>>>>>>> return 0;
>>>>>>> }
>>>>>>> 
>>>>>>> +/* can_vector_compare_p presents fake rtx binary operations to the the back-end
>>>>>>> +   in order to determine its capabilities.  In order to avoid creating fake
>>>>>>> +   operations on each call, values from previous calls are cached in a global
>>>>>>> +   cached_binops hash_table.  It contains rtxes, which can be looked up using
>>>>>>> +   binop_keys.  */
>>>>>>> +
>>>>>>> +struct binop_key {
>>>>>>> +  enum rtx_code code;        /* Operation code.  */
>>>>>>> +  machine_mode value_mode;   /* Result mode.     */
>>>>>>> +  machine_mode cmp_op_mode;  /* Operand mode.    */
>>>>>>> +};
>>>>>>> +
>>>>>>> +struct binop_hasher : pointer_hash_mark<rtx>, ggc_cache_remove<rtx> {
>>>>>>> +  typedef rtx value_type;
>>>>>>> +  typedef binop_key compare_type;
>>>>>>> +
>>>>>>> +  static hashval_t
>>>>>>> +  hash (enum rtx_code code, machine_mode value_mode, machine_mode cmp_op_mode)
>>>>>>> +  {
>>>>>>> +    inchash::hash hstate (0);
>>>>>>> +    hstate.add_int (code);
>>>>>>> +    hstate.add_int (value_mode);
>>>>>>> +    hstate.add_int (cmp_op_mode);
>>>>>>> +    return hstate.end ();
>>>>>>> +  }
>>>>>>> +
>>>>>>> +  static hashval_t
>>>>>>> +  hash (const rtx &ref)
>>>>>>> +  {
>>>>>>> +    return hash (GET_CODE (ref), GET_MODE (ref), GET_MODE (XEXP (ref, 0)));
>>>>>>> +  }
>>>>>>> +
>>>>>>> +  static bool
>>>>>>> +  equal (const rtx &ref1, const binop_key &ref2)
>>>>>>> +  {
>>>>>>> +    return (GET_CODE (ref1) == ref2.code)
>>>>>>> +        && (GET_MODE (ref1) == ref2.value_mode)
>>>>>>> +        && (GET_MODE (XEXP (ref1, 0)) == ref2.cmp_op_mode);
>>>>>>> +  }
>>>>>>> +};
>>>>>>> +
>>>>>>> +static GTY ((cache)) hash_table<binop_hasher> *cached_binops;
>>>>>>> +
>>>>>>> +static rtx
>>>>>>> +get_cached_binop (enum rtx_code code, machine_mode value_mode,
>>>>>>> +               machine_mode cmp_op_mode)
>>>>>>> +{
>>>>>>> +  if (!cached_binops)
>>>>>>> +    cached_binops = hash_table<binop_hasher>::create_ggc (1024);
>>>>>>> +  binop_key key = { code, value_mode, cmp_op_mode };
>>>>>>> +  hashval_t hash = binop_hasher::hash (code, value_mode, cmp_op_mode);
>>>>>>> +  rtx *slot = cached_binops->find_slot_with_hash (key, hash, INSERT);
>>>>>>> +  if (!*slot)
>>>>>>> +    *slot = gen_rtx_fmt_ee (code, value_mode, gen_reg_rtx (cmp_op_mode),
>>>>>>> +                         gen_reg_rtx (cmp_op_mode));
>>>>>>> +  return *slot;
>>>>>>> +}
>>>>>> 
>>>>>> Sorry, I didn't mean anything this complicated.  I just meant that
>>>>>> we should have a single cached rtx that we can change via PUT_CODE and
>>>>>> PUT_MODE_RAW for each new query, rather than allocating a new rtx each
>>>>>> time.
>>>>>> 
>>>>>> Something like:
>>>>>> 
>>>>>> static GTY ((cache)) rtx cached_binop;
>>>>>> 
>>>>>> rtx
>>>>>> get_cached_binop (machine_mode mode, rtx_code code, machine_mode op_mode)
>>>>>> {
>>>>>> if (cached_binop)
>>>>>>  {
>>>>>>    PUT_CODE (cached_binop, code);
>>>>>>    PUT_MODE_RAW (cached_binop, mode);
>>>>>>    PUT_MODE_RAW (XEXP (cached_binop, 0), op_mode);
>>>>>>    PUT_MODE_RAW (XEXP (cached_binop, 1), op_mode);
>>>>>>  }
>>>>>> else
>>>>>>  {
>>>>>>    rtx reg1 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 1);
>>>>>>    rtx reg2 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 2);
>>>>>>    cached_binop = gen_rtx_fmt_ee (code, mode, reg1, reg2);
>>>>>>  }
>>>>>> return cached_binop;
>>>>>> }
>>>>> 
>>>>> Hmm, maybe we need  auto_rtx (code) that constructs such
>>>>> RTX on the stack instead of wasting a GC root (and causing
>>>>> issues for future threading of GCC ;)).
>>>> 
>>>> Do you mean something like this?
>>>> 
>>>> union {
>>>> char raw[rtx_code_size[code]];
>>>> rtx rtx;
>>>> } binop;
>>>> 
>>>> Does this exist already (git grep auto.*rtx / rtx.*auto doesn't show
>>>> anything useful), or should I implement this?
>>> 
>>> It doesn't exist AFAIK, I thought about using alloca like
>>> 
>>> rtx tem;
>>> rtx_alloca (tem, PLUS);
>>> 
>>> and due to using alloca rtx_alloca has to be a macro like
>>> 
>>> #define rtx_alloca(r, code) r = (rtx)alloca (RTX_CODE_SIZE(code));
>>> memset (r, 0, RTX_HDR_SIZE); PUT_CODE (r, code);
>>> 
>>> maybe C++ can help making this prettier but of course
>>> since we use alloca we have to avoid opening new scopes.
>>> 
>>> I guess templates like with auto_vec doesn't work unless
>>> we can make RTX_CODE_SIZE constant-evaluated.
>>> 
>>> Richard.
>> 
>> I ended up with the following change:
>> 
>> diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
>> index a667cdab94e..97aa2144e95 100644
>> --- a/gcc/emit-rtl.c
>> +++ b/gcc/emit-rtl.c
>> @@ -466,17 +466,25 @@ set_mode_and_regno (rtx x, machine_mode mode, unsigned int regno)
>>   set_regno_raw (x, regno, nregs);
>> }
>> 
>> -/* Generate a new REG rtx.  Make sure ORIGINAL_REGNO is set properly, and
>> +/* Initialize a REG rtx.  Make sure ORIGINAL_REGNO is set properly, and
>>    don't attempt to share with the various global pieces of rtl (such as
>>    frame_pointer_rtx).  */
>> 
>> -rtx
>> -gen_raw_REG (machine_mode mode, unsigned int regno)
>> +void
>> +init_raw_REG (rtx x, machine_mode mode, unsigned int regno)
>> {
>> -  rtx x = rtx_alloc (REG MEM_STAT_INFO);
>>   set_mode_and_regno (x, mode, regno);
>>   REG_ATTRS (x) = NULL;
>>   ORIGINAL_REGNO (x) = regno;
>> +}
>> +
>> +/* Generate a new REG rtx.  */
>> +
>> +rtx
>> +gen_raw_REG (machine_mode mode, unsigned int regno)
>> +{
>> +  rtx x = rtx_alloc (REG MEM_STAT_INFO);
>> +  init_raw_REG (x, mode, regno);
>>   return x;
>> }
>> 
>> diff --git a/gcc/gengenrtl.c b/gcc/gengenrtl.c
>> index 5c78fabfb50..bb2087da258 100644
>> --- a/gcc/gengenrtl.c
>> +++ b/gcc/gengenrtl.c
>> @@ -231,8 +231,7 @@ genmacro (int idx)
>>   puts (")");
>> }
>> 
>> -/* Generate the code for the function to generate RTL whose
>> -   format is FORMAT.  */
>> +/* Generate the code for functions to generate RTL whose format is FORMAT.  */
>> 
>> static void
>> gendef (const char *format)
>> @@ -240,22 +239,18 @@ gendef (const char *format)
>>   const char *p;
>>   int i, j;
>> 
>> -  /* Start by writing the definition of the function name and the types
>> +  /* Write the definition of the init function name and the types
>>      of the arguments.  */
>> 
>> -  printf ("static inline rtx\ngen_rtx_fmt_%s_stat (RTX_CODE code, machine_mode mode", format);
>> +  puts ("static inline void");
>> +  printf ("init_rtx_fmt_%s (rtx rt, machine_mode mode", format);
>>   for (p = format, i = 0; *p != 0; p++)
>>     if (*p != '0')
>>       printf (",\n\t%sarg%d", type_from_format (*p), i++);
>> +  puts (")");
>> 
>> -  puts (" MEM_STAT_DECL)");
>> -
>> -  /* Now write out the body of the function itself, which allocates
>> -     the memory and initializes it.  */
>> +  /* Now write out the body of the init function itself.  */
>>   puts ("{");
>> -  puts ("  rtx rt;");
>> -  puts ("  rt = rtx_alloc (code PASS_MEM_STAT);\n");
>> -
>>   puts ("  PUT_MODE_RAW (rt, mode);");
>> 
>>   for (p = format, i = j = 0; *p ; ++p, ++i)
>> @@ -266,16 +261,56 @@ gendef (const char *format)
>>     else
>>       printf ("  %s (rt, %d) = arg%d;\n", accessor_from_format (*p), i, j++);
>> 
>> -  puts ("\n  return rt;\n}\n");
>> +  puts ("}\n");
>> +
>> +  /* Write the definition of the gen function name and the types
>> +     of the arguments.  */
>> +
>> +  puts ("static inline rtx");
>> +  printf ("gen_rtx_fmt_%s_stat (RTX_CODE code, machine_mode mode", format);
>> +  for (p = format, i = 0; *p != 0; p++)
>> +    if (*p != '0')
>> +      printf (",\n\t%sarg%d", type_from_format (*p), i++);
>> +  puts (" MEM_STAT_DECL)");
>> +
>> +  /* Now write out the body of the function itself, which allocates
>> +     the memory and initializes it.  */
>> +  puts ("{");
>> +  puts ("  rtx rt;\n");
>> +
>> +  puts ("  rt = rtx_alloc (code PASS_MEM_STAT);");
>> +  printf ("  init_rtx_fmt_%s (rt, mode", format);
>> +  for (p = format, i = 0; *p != 0; p++)
>> +    if (*p != '0')
>> +      printf (", arg%d", i++);
>> +  puts (");\n");
>> +
>> +  puts ("  return rt;\n}\n");
>> +
>> +  /* Write the definition of gen macro.  */
>> +
>>   printf ("#define gen_rtx_fmt_%s(c, m", format);
>>   for (p = format, i = 0; *p != 0; p++)
>>     if (*p != '0')
>> -      printf (", p%i",i++);
>> -  printf (")\\\n        gen_rtx_fmt_%s_stat (c, m", format);
>> +      printf (", arg%d", i++);
>> +  printf (") \\\n  gen_rtx_fmt_%s_stat ((c), (m)", format);
>>   for (p = format, i = 0; *p != 0; p++)
>>     if (*p != '0')
>> -      printf (", p%i",i++);
>> +      printf (", (arg%d)", i++);
>>   printf (" MEM_STAT_INFO)\n\n");
>> +
>> +  /* Write the definition of alloca macro.  */
>> +
>> +  printf ("#define alloca_rtx_fmt_%s(rt, c, m", format);
>> +  for (p = format, i = 0; *p != 0; p++)
>> +    if (*p != '0')
>> +      printf (", arg%d", i++);
>> +  printf (") \\\n  rtx_alloca ((rt), (c)); \\\n");
>> +  printf ("  init_rtx_fmt_%s ((rt), (m)", format);
>> +  for (p = format, i = 0; *p != 0; p++)
>> +    if (*p != '0')
>> +      printf (", (arg%d)", i++);
>> +  printf (")\n\n");
>> }
>> 
>> /* Generate the documentation header for files we write.  */
>> diff --git a/gcc/rtl.h b/gcc/rtl.h
>> index efb9b3ce40d..44733d8a39e 100644
>> --- a/gcc/rtl.h
>> +++ b/gcc/rtl.h
>> @@ -2933,6 +2933,10 @@ extern HOST_WIDE_INT get_stack_check_protect (void);
>> 
>> /* In rtl.c */
>> extern rtx rtx_alloc (RTX_CODE CXX_MEM_STAT_INFO);
>> +#define rtx_alloca(rt, code) \
>> +  (rt) = (rtx) alloca (RTX_CODE_SIZE ((code))); \
>> +  memset ((rt), 0, RTX_HDR_SIZE); \
>> +  PUT_CODE ((rt), (code));
>> extern rtx rtx_alloc_stat_v (RTX_CODE MEM_STAT_DECL, int);
>> #define rtx_alloc_v(c, SZ) rtx_alloc_stat_v (c MEM_STAT_INFO, SZ)
>> #define const_wide_int_alloc(NWORDS)                           \
>> @@ -3797,7 +3801,11 @@ gen_rtx_INSN (machine_mode mode, rtx_insn *prev_insn, rtx_insn *next_insn,
>> extern rtx gen_rtx_CONST_INT (machine_mode, HOST_WIDE_INT);
>> extern rtx gen_rtx_CONST_VECTOR (machine_mode, rtvec);
>> extern void set_mode_and_regno (rtx, machine_mode, unsigned int);
>> +extern void init_raw_REG (rtx, machine_mode, unsigned int);
>> extern rtx gen_raw_REG (machine_mode, unsigned int);
>> +#define alloca_raw_REG(rt, mode, regno) \
>> +  rtx_alloca ((rt), REG); \
>> +  init_raw_REG ((rt), (mode), (regno))
>> extern rtx gen_rtx_REG (machine_mode, unsigned int);
>> extern rtx gen_rtx_SUBREG (machine_mode, rtx, poly_uint64);
>> extern rtx gen_rtx_MEM (machine_mode, rtx);
>> 
>> which now allows me to write:
>> 
>> rtx reg1, reg2, test;
>> alloca_raw_REG (reg1, cmp_op_mode, LAST_VIRTUAL_REGISTER + 1);
>> alloca_raw_REG (reg2, cmp_op_mode, LAST_VIRTUAL_REGISTER + 2);
>> alloca_rtx_fmt_ee (test, code, value_mode, reg1, reg2);
>> 
>> If that looks ok, I'll resend the series.
> 
> that looks OK to me - please leave Richard S. time to comment.  Also while
> I'd like to see
> 
> rtx reg1 = alloca_raw_REG (cmp_op_mode, LAST_VIRTUAL_REGISTER + 1);
> 
> I don't really see a way to write that portably (or at all), do you all agree?
> GCC doesn't seem to convert alloca() calls to __builtin_stack_save/restore
> nor place CLOBBERs to end their lifetime.  But is it guaranteed that the
> alloca result is valid until frame termination?

Hmm, the alloca man page says:

       The alloca() function allocates size bytes of space in the stack
       frame of the caller.  This temporary space is automatically freed
       when the function that called alloca() returns to its caller.
...
       The space allocated by alloca() is not automatically deallocated if
       the pointer that refers to it simply goes out of scope.

A quick experiment with gcc and clang confirms this.  I think this means
I can make alloca_raw_REG macro return the allocated pointer using the
return-from-block GNU extension.
Ilya Leoshkevich Aug. 26, 2019, 2:50 p.m. UTC | #10
> Am 26.08.2019 um 15:17 schrieb Ilya Leoshkevich <iii@linux.ibm.com>:
> 
>> Am 26.08.2019 um 15:06 schrieb Richard Biener <richard.guenther@gmail.com>:
>> 
>> On Mon, Aug 26, 2019 at 1:54 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>>> 
>>>> Am 26.08.2019 um 10:49 schrieb Richard Biener <richard.guenther@gmail.com>:
>>>> 
>>>> On Fri, Aug 23, 2019 at 1:35 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>>>>> 
>>>>>> Am 23.08.2019 um 13:24 schrieb Richard Biener <richard.guenther@gmail.com>:
>>>>>> 
>>>>>> On Fri, Aug 23, 2019 at 12:43 PM Richard Sandiford
>>>>>> <richard.sandiford@arm.com> wrote:
>>>>>>> 
>>>>>>> Ilya Leoshkevich <iii@linux.ibm.com> writes:
>>>>>>>> @@ -3819,6 +3820,82 @@ can_compare_p (enum rtx_code code, machine_mode mode,
>>>>>>>> return 0;
>>>>>>>> }
>>>>>>>> 
>>>>>>>> +/* can_vector_compare_p presents fake rtx binary operations to the the back-end
>>>>>>>> +   in order to determine its capabilities.  In order to avoid creating fake
>>>>>>>> +   operations on each call, values from previous calls are cached in a global
>>>>>>>> +   cached_binops hash_table.  It contains rtxes, which can be looked up using
>>>>>>>> +   binop_keys.  */
>>>>>>>> +
>>>>>>>> +struct binop_key {
>>>>>>>> +  enum rtx_code code;        /* Operation code.  */
>>>>>>>> +  machine_mode value_mode;   /* Result mode.     */
>>>>>>>> +  machine_mode cmp_op_mode;  /* Operand mode.    */
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +struct binop_hasher : pointer_hash_mark<rtx>, ggc_cache_remove<rtx> {
>>>>>>>> +  typedef rtx value_type;
>>>>>>>> +  typedef binop_key compare_type;
>>>>>>>> +
>>>>>>>> +  static hashval_t
>>>>>>>> +  hash (enum rtx_code code, machine_mode value_mode, machine_mode cmp_op_mode)
>>>>>>>> +  {
>>>>>>>> +    inchash::hash hstate (0);
>>>>>>>> +    hstate.add_int (code);
>>>>>>>> +    hstate.add_int (value_mode);
>>>>>>>> +    hstate.add_int (cmp_op_mode);
>>>>>>>> +    return hstate.end ();
>>>>>>>> +  }
>>>>>>>> +
>>>>>>>> +  static hashval_t
>>>>>>>> +  hash (const rtx &ref)
>>>>>>>> +  {
>>>>>>>> +    return hash (GET_CODE (ref), GET_MODE (ref), GET_MODE (XEXP (ref, 0)));
>>>>>>>> +  }
>>>>>>>> +
>>>>>>>> +  static bool
>>>>>>>> +  equal (const rtx &ref1, const binop_key &ref2)
>>>>>>>> +  {
>>>>>>>> +    return (GET_CODE (ref1) == ref2.code)
>>>>>>>> +        && (GET_MODE (ref1) == ref2.value_mode)
>>>>>>>> +        && (GET_MODE (XEXP (ref1, 0)) == ref2.cmp_op_mode);
>>>>>>>> +  }
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +static GTY ((cache)) hash_table<binop_hasher> *cached_binops;
>>>>>>>> +
>>>>>>>> +static rtx
>>>>>>>> +get_cached_binop (enum rtx_code code, machine_mode value_mode,
>>>>>>>> +               machine_mode cmp_op_mode)
>>>>>>>> +{
>>>>>>>> +  if (!cached_binops)
>>>>>>>> +    cached_binops = hash_table<binop_hasher>::create_ggc (1024);
>>>>>>>> +  binop_key key = { code, value_mode, cmp_op_mode };
>>>>>>>> +  hashval_t hash = binop_hasher::hash (code, value_mode, cmp_op_mode);
>>>>>>>> +  rtx *slot = cached_binops->find_slot_with_hash (key, hash, INSERT);
>>>>>>>> +  if (!*slot)
>>>>>>>> +    *slot = gen_rtx_fmt_ee (code, value_mode, gen_reg_rtx (cmp_op_mode),
>>>>>>>> +                         gen_reg_rtx (cmp_op_mode));
>>>>>>>> +  return *slot;
>>>>>>>> +}
>>>>>>> 
>>>>>>> Sorry, I didn't mean anything this complicated.  I just meant that
>>>>>>> we should have a single cached rtx that we can change via PUT_CODE and
>>>>>>> PUT_MODE_RAW for each new query, rather than allocating a new rtx each
>>>>>>> time.
>>>>>>> 
>>>>>>> Something like:
>>>>>>> 
>>>>>>> static GTY ((cache)) rtx cached_binop;
>>>>>>> 
>>>>>>> rtx
>>>>>>> get_cached_binop (machine_mode mode, rtx_code code, machine_mode op_mode)
>>>>>>> {
>>>>>>> if (cached_binop)
>>>>>>> {
>>>>>>>   PUT_CODE (cached_binop, code);
>>>>>>>   PUT_MODE_RAW (cached_binop, mode);
>>>>>>>   PUT_MODE_RAW (XEXP (cached_binop, 0), op_mode);
>>>>>>>   PUT_MODE_RAW (XEXP (cached_binop, 1), op_mode);
>>>>>>> }
>>>>>>> else
>>>>>>> {
>>>>>>>   rtx reg1 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 1);
>>>>>>>   rtx reg2 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 2);
>>>>>>>   cached_binop = gen_rtx_fmt_ee (code, mode, reg1, reg2);
>>>>>>> }
>>>>>>> return cached_binop;
>>>>>>> }
>>>>>> 
>>>>>> Hmm, maybe we need  auto_rtx (code) that constructs such
>>>>>> RTX on the stack instead of wasting a GC root (and causing
>>>>>> issues for future threading of GCC ;)).
>>>>> 
>>>>> Do you mean something like this?
>>>>> 
>>>>> union {
>>>>> char raw[rtx_code_size[code]];
>>>>> rtx rtx;
>>>>> } binop;
>>>>> 
>>>>> Does this exist already (git grep auto.*rtx / rtx.*auto doesn't show
>>>>> anything useful), or should I implement this?
>>>> 
>>>> It doesn't exist AFAIK, I thought about using alloca like
>>>> 
>>>> rtx tem;
>>>> rtx_alloca (tem, PLUS);
>>>> 
>>>> and due to using alloca rtx_alloca has to be a macro like
>>>> 
>>>> #define rtx_alloca(r, code) r = (rtx)alloca (RTX_CODE_SIZE(code));
>>>> memset (r, 0, RTX_HDR_SIZE); PUT_CODE (r, code);
>>>> 
>>>> maybe C++ can help making this prettier but of course
>>>> since we use alloca we have to avoid opening new scopes.
>>>> 
>>>> I guess templates like with auto_vec doesn't work unless
>>>> we can make RTX_CODE_SIZE constant-evaluated.
>>>> 
>>>> Richard.
>>> 
>>> I ended up with the following change:
>>> 
>>> diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
>>> index a667cdab94e..97aa2144e95 100644
>>> --- a/gcc/emit-rtl.c
>>> +++ b/gcc/emit-rtl.c
>>> @@ -466,17 +466,25 @@ set_mode_and_regno (rtx x, machine_mode mode, unsigned int regno)
>>>  set_regno_raw (x, regno, nregs);
>>> }
>>> 
>>> -/* Generate a new REG rtx.  Make sure ORIGINAL_REGNO is set properly, and
>>> +/* Initialize a REG rtx.  Make sure ORIGINAL_REGNO is set properly, and
>>>   don't attempt to share with the various global pieces of rtl (such as
>>>   frame_pointer_rtx).  */
>>> 
>>> -rtx
>>> -gen_raw_REG (machine_mode mode, unsigned int regno)
>>> +void
>>> +init_raw_REG (rtx x, machine_mode mode, unsigned int regno)
>>> {
>>> -  rtx x = rtx_alloc (REG MEM_STAT_INFO);
>>>  set_mode_and_regno (x, mode, regno);
>>>  REG_ATTRS (x) = NULL;
>>>  ORIGINAL_REGNO (x) = regno;
>>> +}
>>> +
>>> +/* Generate a new REG rtx.  */
>>> +
>>> +rtx
>>> +gen_raw_REG (machine_mode mode, unsigned int regno)
>>> +{
>>> +  rtx x = rtx_alloc (REG MEM_STAT_INFO);
>>> +  init_raw_REG (x, mode, regno);
>>>  return x;
>>> }
>>> 
>>> diff --git a/gcc/gengenrtl.c b/gcc/gengenrtl.c
>>> index 5c78fabfb50..bb2087da258 100644
>>> --- a/gcc/gengenrtl.c
>>> +++ b/gcc/gengenrtl.c
>>> @@ -231,8 +231,7 @@ genmacro (int idx)
>>>  puts (")");
>>> }
>>> 
>>> -/* Generate the code for the function to generate RTL whose
>>> -   format is FORMAT.  */
>>> +/* Generate the code for functions to generate RTL whose format is FORMAT.  */
>>> 
>>> static void
>>> gendef (const char *format)
>>> @@ -240,22 +239,18 @@ gendef (const char *format)
>>>  const char *p;
>>>  int i, j;
>>> 
>>> -  /* Start by writing the definition of the function name and the types
>>> +  /* Write the definition of the init function name and the types
>>>     of the arguments.  */
>>> 
>>> -  printf ("static inline rtx\ngen_rtx_fmt_%s_stat (RTX_CODE code, machine_mode mode", format);
>>> +  puts ("static inline void");
>>> +  printf ("init_rtx_fmt_%s (rtx rt, machine_mode mode", format);
>>>  for (p = format, i = 0; *p != 0; p++)
>>>    if (*p != '0')
>>>      printf (",\n\t%sarg%d", type_from_format (*p), i++);
>>> +  puts (")");
>>> 
>>> -  puts (" MEM_STAT_DECL)");
>>> -
>>> -  /* Now write out the body of the function itself, which allocates
>>> -     the memory and initializes it.  */
>>> +  /* Now write out the body of the init function itself.  */
>>>  puts ("{");
>>> -  puts ("  rtx rt;");
>>> -  puts ("  rt = rtx_alloc (code PASS_MEM_STAT);\n");
>>> -
>>>  puts ("  PUT_MODE_RAW (rt, mode);");
>>> 
>>>  for (p = format, i = j = 0; *p ; ++p, ++i)
>>> @@ -266,16 +261,56 @@ gendef (const char *format)
>>>    else
>>>      printf ("  %s (rt, %d) = arg%d;\n", accessor_from_format (*p), i, j++);
>>> 
>>> -  puts ("\n  return rt;\n}\n");
>>> +  puts ("}\n");
>>> +
>>> +  /* Write the definition of the gen function name and the types
>>> +     of the arguments.  */
>>> +
>>> +  puts ("static inline rtx");
>>> +  printf ("gen_rtx_fmt_%s_stat (RTX_CODE code, machine_mode mode", format);
>>> +  for (p = format, i = 0; *p != 0; p++)
>>> +    if (*p != '0')
>>> +      printf (",\n\t%sarg%d", type_from_format (*p), i++);
>>> +  puts (" MEM_STAT_DECL)");
>>> +
>>> +  /* Now write out the body of the function itself, which allocates
>>> +     the memory and initializes it.  */
>>> +  puts ("{");
>>> +  puts ("  rtx rt;\n");
>>> +
>>> +  puts ("  rt = rtx_alloc (code PASS_MEM_STAT);");
>>> +  printf ("  init_rtx_fmt_%s (rt, mode", format);
>>> +  for (p = format, i = 0; *p != 0; p++)
>>> +    if (*p != '0')
>>> +      printf (", arg%d", i++);
>>> +  puts (");\n");
>>> +
>>> +  puts ("  return rt;\n}\n");
>>> +
>>> +  /* Write the definition of gen macro.  */
>>> +
>>>  printf ("#define gen_rtx_fmt_%s(c, m", format);
>>>  for (p = format, i = 0; *p != 0; p++)
>>>    if (*p != '0')
>>> -      printf (", p%i",i++);
>>> -  printf (")\\\n        gen_rtx_fmt_%s_stat (c, m", format);
>>> +      printf (", arg%d", i++);
>>> +  printf (") \\\n  gen_rtx_fmt_%s_stat ((c), (m)", format);
>>>  for (p = format, i = 0; *p != 0; p++)
>>>    if (*p != '0')
>>> -      printf (", p%i",i++);
>>> +      printf (", (arg%d)", i++);
>>>  printf (" MEM_STAT_INFO)\n\n");
>>> +
>>> +  /* Write the definition of alloca macro.  */
>>> +
>>> +  printf ("#define alloca_rtx_fmt_%s(rt, c, m", format);
>>> +  for (p = format, i = 0; *p != 0; p++)
>>> +    if (*p != '0')
>>> +      printf (", arg%d", i++);
>>> +  printf (") \\\n  rtx_alloca ((rt), (c)); \\\n");
>>> +  printf ("  init_rtx_fmt_%s ((rt), (m)", format);
>>> +  for (p = format, i = 0; *p != 0; p++)
>>> +    if (*p != '0')
>>> +      printf (", (arg%d)", i++);
>>> +  printf (")\n\n");
>>> }
>>> 
>>> /* Generate the documentation header for files we write.  */
>>> diff --git a/gcc/rtl.h b/gcc/rtl.h
>>> index efb9b3ce40d..44733d8a39e 100644
>>> --- a/gcc/rtl.h
>>> +++ b/gcc/rtl.h
>>> @@ -2933,6 +2933,10 @@ extern HOST_WIDE_INT get_stack_check_protect (void);
>>> 
>>> /* In rtl.c */
>>> extern rtx rtx_alloc (RTX_CODE CXX_MEM_STAT_INFO);
>>> +#define rtx_alloca(rt, code) \
>>> +  (rt) = (rtx) alloca (RTX_CODE_SIZE ((code))); \
>>> +  memset ((rt), 0, RTX_HDR_SIZE); \
>>> +  PUT_CODE ((rt), (code));
>>> extern rtx rtx_alloc_stat_v (RTX_CODE MEM_STAT_DECL, int);
>>> #define rtx_alloc_v(c, SZ) rtx_alloc_stat_v (c MEM_STAT_INFO, SZ)
>>> #define const_wide_int_alloc(NWORDS)                           \
>>> @@ -3797,7 +3801,11 @@ gen_rtx_INSN (machine_mode mode, rtx_insn *prev_insn, rtx_insn *next_insn,
>>> extern rtx gen_rtx_CONST_INT (machine_mode, HOST_WIDE_INT);
>>> extern rtx gen_rtx_CONST_VECTOR (machine_mode, rtvec);
>>> extern void set_mode_and_regno (rtx, machine_mode, unsigned int);
>>> +extern void init_raw_REG (rtx, machine_mode, unsigned int);
>>> extern rtx gen_raw_REG (machine_mode, unsigned int);
>>> +#define alloca_raw_REG(rt, mode, regno) \
>>> +  rtx_alloca ((rt), REG); \
>>> +  init_raw_REG ((rt), (mode), (regno))
>>> extern rtx gen_rtx_REG (machine_mode, unsigned int);
>>> extern rtx gen_rtx_SUBREG (machine_mode, rtx, poly_uint64);
>>> extern rtx gen_rtx_MEM (machine_mode, rtx);
>>> 
>>> which now allows me to write:
>>> 
>>> rtx reg1, reg2, test;
>>> alloca_raw_REG (reg1, cmp_op_mode, LAST_VIRTUAL_REGISTER + 1);
>>> alloca_raw_REG (reg2, cmp_op_mode, LAST_VIRTUAL_REGISTER + 2);
>>> alloca_rtx_fmt_ee (test, code, value_mode, reg1, reg2);
>>> 
>>> If that looks ok, I'll resend the series.
>> 
>> that looks OK to me - please leave Richard S. time to comment.  Also while
>> I'd like to see
>> 
>> rtx reg1 = alloca_raw_REG (cmp_op_mode, LAST_VIRTUAL_REGISTER + 1);
>> 
>> I don't really see a way to write that portably (or at all), do you all agree?
>> GCC doesn't seem to convert alloca() calls to __builtin_stack_save/restore
>> nor place CLOBBERs to end their lifetime.  But is it guaranteed that the
>> alloca result is valid until frame termination?
> 
> Hmm, the alloca man page says:
> 
>       The alloca() function allocates size bytes of space in the stack
>       frame of the caller.  This temporary space is automatically freed
>       when the function that called alloca() returns to its caller.
> ...
>       The space allocated by alloca() is not automatically deallocated if
>       the pointer that refers to it simply goes out of scope.
> 
> A quick experiment with gcc and clang confirms this.  I think this means
> I can make alloca_raw_REG macro return the allocated pointer using the
> return-from-block GNU extension.

What do you think about the following approach?

extern rtx rtx_init (rtx, RTX_CODE);
#define rtx_alloca(code) \
  rtx_init ((rtx) alloca (RTX_CODE_SIZE ((code))), (code))

...

rtx
rtx_init (rtx rt, RTX_CODE code)
{
  /* We want to clear everything up to the FLD array.  Normally, this
     is one int, but we don't want to assume that and it isn't very
     portable anyway; this is.  */
  memset (rt, 0, RTX_HDR_SIZE);
  PUT_CODE (rt, code);
  return rt;
}

with similar changes to alloca_raw_REG and gengenrtl.  This way we don't
even need a GNU extension.
Richard Sandiford Aug. 27, 2019, 7:01 a.m. UTC | #11
Ilya Leoshkevich <iii@linux.ibm.com> writes:
>> Am 26.08.2019 um 15:17 schrieb Ilya Leoshkevich <iii@linux.ibm.com>:
>> 
>>> Am 26.08.2019 um 15:06 schrieb Richard Biener <richard.guenther@gmail.com>:
>>> 
>>> On Mon, Aug 26, 2019 at 1:54 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>>>> 
>>>>> Am 26.08.2019 um 10:49 schrieb Richard Biener <richard.guenther@gmail.com>:
>>>>> 
>>>>> On Fri, Aug 23, 2019 at 1:35 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>>>>>> 
>>>>>>> Am 23.08.2019 um 13:24 schrieb Richard Biener <richard.guenther@gmail.com>:
>>>>>>> 
>>>>>>> On Fri, Aug 23, 2019 at 12:43 PM Richard Sandiford
>>>>>>> <richard.sandiford@arm.com> wrote:
>>>>>>>> 
>>>>>>>> Ilya Leoshkevich <iii@linux.ibm.com> writes:
>>>>>>>>> @@ -3819,6 +3820,82 @@ can_compare_p (enum rtx_code code, machine_mode mode,
>>>>>>>>> return 0;
>>>>>>>>> }
>>>>>>>>> 
>>>>>>>>> +/* can_vector_compare_p presents fake rtx binary operations to the the back-end
>>>>>>>>> +   in order to determine its capabilities.  In order to avoid creating fake
>>>>>>>>> +   operations on each call, values from previous calls are cached in a global
>>>>>>>>> +   cached_binops hash_table.  It contains rtxes, which can be looked up using
>>>>>>>>> +   binop_keys.  */
>>>>>>>>> +
>>>>>>>>> +struct binop_key {
>>>>>>>>> +  enum rtx_code code;        /* Operation code.  */
>>>>>>>>> +  machine_mode value_mode;   /* Result mode.     */
>>>>>>>>> +  machine_mode cmp_op_mode;  /* Operand mode.    */
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> +struct binop_hasher : pointer_hash_mark<rtx>, ggc_cache_remove<rtx> {
>>>>>>>>> +  typedef rtx value_type;
>>>>>>>>> +  typedef binop_key compare_type;
>>>>>>>>> +
>>>>>>>>> +  static hashval_t
>>>>>>>>> +  hash (enum rtx_code code, machine_mode value_mode, machine_mode cmp_op_mode)
>>>>>>>>> +  {
>>>>>>>>> +    inchash::hash hstate (0);
>>>>>>>>> +    hstate.add_int (code);
>>>>>>>>> +    hstate.add_int (value_mode);
>>>>>>>>> +    hstate.add_int (cmp_op_mode);
>>>>>>>>> +    return hstate.end ();
>>>>>>>>> +  }
>>>>>>>>> +
>>>>>>>>> +  static hashval_t
>>>>>>>>> +  hash (const rtx &ref)
>>>>>>>>> +  {
>>>>>>>>> +    return hash (GET_CODE (ref), GET_MODE (ref), GET_MODE (XEXP (ref, 0)));
>>>>>>>>> +  }
>>>>>>>>> +
>>>>>>>>> +  static bool
>>>>>>>>> +  equal (const rtx &ref1, const binop_key &ref2)
>>>>>>>>> +  {
>>>>>>>>> +    return (GET_CODE (ref1) == ref2.code)
>>>>>>>>> +        && (GET_MODE (ref1) == ref2.value_mode)
>>>>>>>>> +        && (GET_MODE (XEXP (ref1, 0)) == ref2.cmp_op_mode);
>>>>>>>>> +  }
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> +static GTY ((cache)) hash_table<binop_hasher> *cached_binops;
>>>>>>>>> +
>>>>>>>>> +static rtx
>>>>>>>>> +get_cached_binop (enum rtx_code code, machine_mode value_mode,
>>>>>>>>> +               machine_mode cmp_op_mode)
>>>>>>>>> +{
>>>>>>>>> +  if (!cached_binops)
>>>>>>>>> +    cached_binops = hash_table<binop_hasher>::create_ggc (1024);
>>>>>>>>> +  binop_key key = { code, value_mode, cmp_op_mode };
>>>>>>>>> +  hashval_t hash = binop_hasher::hash (code, value_mode, cmp_op_mode);
>>>>>>>>> +  rtx *slot = cached_binops->find_slot_with_hash (key, hash, INSERT);
>>>>>>>>> +  if (!*slot)
>>>>>>>>> +    *slot = gen_rtx_fmt_ee (code, value_mode, gen_reg_rtx (cmp_op_mode),
>>>>>>>>> +                         gen_reg_rtx (cmp_op_mode));
>>>>>>>>> +  return *slot;
>>>>>>>>> +}
>>>>>>>> 
>>>>>>>> Sorry, I didn't mean anything this complicated.  I just meant that
>>>>>>>> we should have a single cached rtx that we can change via PUT_CODE and
>>>>>>>> PUT_MODE_RAW for each new query, rather than allocating a new rtx each
>>>>>>>> time.
>>>>>>>> 
>>>>>>>> Something like:
>>>>>>>> 
>>>>>>>> static GTY ((cache)) rtx cached_binop;
>>>>>>>> 
>>>>>>>> rtx
>>>>>>>> get_cached_binop (machine_mode mode, rtx_code code, machine_mode op_mode)
>>>>>>>> {
>>>>>>>> if (cached_binop)
>>>>>>>> {
>>>>>>>>   PUT_CODE (cached_binop, code);
>>>>>>>>   PUT_MODE_RAW (cached_binop, mode);
>>>>>>>>   PUT_MODE_RAW (XEXP (cached_binop, 0), op_mode);
>>>>>>>>   PUT_MODE_RAW (XEXP (cached_binop, 1), op_mode);
>>>>>>>> }
>>>>>>>> else
>>>>>>>> {
>>>>>>>>   rtx reg1 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 1);
>>>>>>>>   rtx reg2 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 2);
>>>>>>>>   cached_binop = gen_rtx_fmt_ee (code, mode, reg1, reg2);
>>>>>>>> }
>>>>>>>> return cached_binop;
>>>>>>>> }
>>>>>>> 
>>>>>>> Hmm, maybe we need  auto_rtx (code) that constructs such
>>>>>>> RTX on the stack instead of wasting a GC root (and causing
>>>>>>> issues for future threading of GCC ;)).
>>>>>> 
>>>>>> Do you mean something like this?
>>>>>> 
>>>>>> union {
>>>>>> char raw[rtx_code_size[code]];
>>>>>> rtx rtx;
>>>>>> } binop;
>>>>>> 
>>>>>> Does this exist already (git grep auto.*rtx / rtx.*auto doesn't show
>>>>>> anything useful), or should I implement this?
>>>>> 
>>>>> It doesn't exist AFAIK, I thought about using alloca like
>>>>> 
>>>>> rtx tem;
>>>>> rtx_alloca (tem, PLUS);
>>>>> 
>>>>> and due to using alloca rtx_alloca has to be a macro like
>>>>> 
>>>>> #define rtx_alloca(r, code) r = (rtx)alloca (RTX_CODE_SIZE(code));
>>>>> memset (r, 0, RTX_HDR_SIZE); PUT_CODE (r, code);
>>>>> 
>>>>> maybe C++ can help making this prettier but of course
>>>>> since we use alloca we have to avoid opening new scopes.
>>>>> 
>>>>> I guess templates like with auto_vec doesn't work unless
>>>>> we can make RTX_CODE_SIZE constant-evaluated.
>>>>> 
>>>>> Richard.
>>>> 
>>>> I ended up with the following change:
>>>> 
>>>> diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
>>>> index a667cdab94e..97aa2144e95 100644
>>>> --- a/gcc/emit-rtl.c
>>>> +++ b/gcc/emit-rtl.c
>>>> @@ -466,17 +466,25 @@ set_mode_and_regno (rtx x, machine_mode mode, unsigned int regno)
>>>>  set_regno_raw (x, regno, nregs);
>>>> }
>>>> 
>>>> -/* Generate a new REG rtx.  Make sure ORIGINAL_REGNO is set properly, and
>>>> +/* Initialize a REG rtx.  Make sure ORIGINAL_REGNO is set properly, and
>>>>   don't attempt to share with the various global pieces of rtl (such as
>>>>   frame_pointer_rtx).  */
>>>> 
>>>> -rtx
>>>> -gen_raw_REG (machine_mode mode, unsigned int regno)
>>>> +void
>>>> +init_raw_REG (rtx x, machine_mode mode, unsigned int regno)
>>>> {
>>>> -  rtx x = rtx_alloc (REG MEM_STAT_INFO);
>>>>  set_mode_and_regno (x, mode, regno);
>>>>  REG_ATTRS (x) = NULL;
>>>>  ORIGINAL_REGNO (x) = regno;
>>>> +}
>>>> +
>>>> +/* Generate a new REG rtx.  */
>>>> +
>>>> +rtx
>>>> +gen_raw_REG (machine_mode mode, unsigned int regno)
>>>> +{
>>>> +  rtx x = rtx_alloc (REG MEM_STAT_INFO);
>>>> +  init_raw_REG (x, mode, regno);
>>>>  return x;
>>>> }
>>>> 
>>>> diff --git a/gcc/gengenrtl.c b/gcc/gengenrtl.c
>>>> index 5c78fabfb50..bb2087da258 100644
>>>> --- a/gcc/gengenrtl.c
>>>> +++ b/gcc/gengenrtl.c
>>>> @@ -231,8 +231,7 @@ genmacro (int idx)
>>>>  puts (")");
>>>> }
>>>> 
>>>> -/* Generate the code for the function to generate RTL whose
>>>> -   format is FORMAT.  */
>>>> +/* Generate the code for functions to generate RTL whose format is FORMAT.  */
>>>> 
>>>> static void
>>>> gendef (const char *format)
>>>> @@ -240,22 +239,18 @@ gendef (const char *format)
>>>>  const char *p;
>>>>  int i, j;
>>>> 
>>>> -  /* Start by writing the definition of the function name and the types
>>>> +  /* Write the definition of the init function name and the types
>>>>     of the arguments.  */
>>>> 
>>>> -  printf ("static inline rtx\ngen_rtx_fmt_%s_stat (RTX_CODE code, machine_mode mode", format);
>>>> +  puts ("static inline void");
>>>> +  printf ("init_rtx_fmt_%s (rtx rt, machine_mode mode", format);
>>>>  for (p = format, i = 0; *p != 0; p++)
>>>>    if (*p != '0')
>>>>      printf (",\n\t%sarg%d", type_from_format (*p), i++);
>>>> +  puts (")");
>>>> 
>>>> -  puts (" MEM_STAT_DECL)");
>>>> -
>>>> -  /* Now write out the body of the function itself, which allocates
>>>> -     the memory and initializes it.  */
>>>> +  /* Now write out the body of the init function itself.  */
>>>>  puts ("{");
>>>> -  puts ("  rtx rt;");
>>>> -  puts ("  rt = rtx_alloc (code PASS_MEM_STAT);\n");
>>>> -
>>>>  puts ("  PUT_MODE_RAW (rt, mode);");
>>>> 
>>>>  for (p = format, i = j = 0; *p ; ++p, ++i)
>>>> @@ -266,16 +261,56 @@ gendef (const char *format)
>>>>    else
>>>>      printf ("  %s (rt, %d) = arg%d;\n", accessor_from_format (*p), i, j++);
>>>> 
>>>> -  puts ("\n  return rt;\n}\n");
>>>> +  puts ("}\n");
>>>> +
>>>> +  /* Write the definition of the gen function name and the types
>>>> +     of the arguments.  */
>>>> +
>>>> +  puts ("static inline rtx");
>>>> +  printf ("gen_rtx_fmt_%s_stat (RTX_CODE code, machine_mode mode", format);
>>>> +  for (p = format, i = 0; *p != 0; p++)
>>>> +    if (*p != '0')
>>>> +      printf (",\n\t%sarg%d", type_from_format (*p), i++);
>>>> +  puts (" MEM_STAT_DECL)");
>>>> +
>>>> +  /* Now write out the body of the function itself, which allocates
>>>> +     the memory and initializes it.  */
>>>> +  puts ("{");
>>>> +  puts ("  rtx rt;\n");
>>>> +
>>>> +  puts ("  rt = rtx_alloc (code PASS_MEM_STAT);");
>>>> +  printf ("  init_rtx_fmt_%s (rt, mode", format);
>>>> +  for (p = format, i = 0; *p != 0; p++)
>>>> +    if (*p != '0')
>>>> +      printf (", arg%d", i++);
>>>> +  puts (");\n");
>>>> +
>>>> +  puts ("  return rt;\n}\n");
>>>> +
>>>> +  /* Write the definition of gen macro.  */
>>>> +
>>>>  printf ("#define gen_rtx_fmt_%s(c, m", format);
>>>>  for (p = format, i = 0; *p != 0; p++)
>>>>    if (*p != '0')
>>>> -      printf (", p%i",i++);
>>>> -  printf (")\\\n        gen_rtx_fmt_%s_stat (c, m", format);
>>>> +      printf (", arg%d", i++);
>>>> +  printf (") \\\n  gen_rtx_fmt_%s_stat ((c), (m)", format);
>>>>  for (p = format, i = 0; *p != 0; p++)
>>>>    if (*p != '0')
>>>> -      printf (", p%i",i++);
>>>> +      printf (", (arg%d)", i++);
>>>>  printf (" MEM_STAT_INFO)\n\n");
>>>> +
>>>> +  /* Write the definition of alloca macro.  */
>>>> +
>>>> +  printf ("#define alloca_rtx_fmt_%s(rt, c, m", format);
>>>> +  for (p = format, i = 0; *p != 0; p++)
>>>> +    if (*p != '0')
>>>> +      printf (", arg%d", i++);
>>>> +  printf (") \\\n  rtx_alloca ((rt), (c)); \\\n");
>>>> +  printf ("  init_rtx_fmt_%s ((rt), (m)", format);
>>>> +  for (p = format, i = 0; *p != 0; p++)
>>>> +    if (*p != '0')
>>>> +      printf (", (arg%d)", i++);
>>>> +  printf (")\n\n");
>>>> }
>>>> 
>>>> /* Generate the documentation header for files we write.  */
>>>> diff --git a/gcc/rtl.h b/gcc/rtl.h
>>>> index efb9b3ce40d..44733d8a39e 100644
>>>> --- a/gcc/rtl.h
>>>> +++ b/gcc/rtl.h
>>>> @@ -2933,6 +2933,10 @@ extern HOST_WIDE_INT get_stack_check_protect (void);
>>>> 
>>>> /* In rtl.c */
>>>> extern rtx rtx_alloc (RTX_CODE CXX_MEM_STAT_INFO);
>>>> +#define rtx_alloca(rt, code) \
>>>> +  (rt) = (rtx) alloca (RTX_CODE_SIZE ((code))); \
>>>> +  memset ((rt), 0, RTX_HDR_SIZE); \
>>>> +  PUT_CODE ((rt), (code));
>>>> extern rtx rtx_alloc_stat_v (RTX_CODE MEM_STAT_DECL, int);
>>>> #define rtx_alloc_v(c, SZ) rtx_alloc_stat_v (c MEM_STAT_INFO, SZ)
>>>> #define const_wide_int_alloc(NWORDS)                           \
>>>> @@ -3797,7 +3801,11 @@ gen_rtx_INSN (machine_mode mode, rtx_insn *prev_insn, rtx_insn *next_insn,
>>>> extern rtx gen_rtx_CONST_INT (machine_mode, HOST_WIDE_INT);
>>>> extern rtx gen_rtx_CONST_VECTOR (machine_mode, rtvec);
>>>> extern void set_mode_and_regno (rtx, machine_mode, unsigned int);
>>>> +extern void init_raw_REG (rtx, machine_mode, unsigned int);
>>>> extern rtx gen_raw_REG (machine_mode, unsigned int);
>>>> +#define alloca_raw_REG(rt, mode, regno) \
>>>> +  rtx_alloca ((rt), REG); \
>>>> +  init_raw_REG ((rt), (mode), (regno))
>>>> extern rtx gen_rtx_REG (machine_mode, unsigned int);
>>>> extern rtx gen_rtx_SUBREG (machine_mode, rtx, poly_uint64);
>>>> extern rtx gen_rtx_MEM (machine_mode, rtx);
>>>> 
>>>> which now allows me to write:
>>>> 
>>>> rtx reg1, reg2, test;
>>>> alloca_raw_REG (reg1, cmp_op_mode, LAST_VIRTUAL_REGISTER + 1);
>>>> alloca_raw_REG (reg2, cmp_op_mode, LAST_VIRTUAL_REGISTER + 2);
>>>> alloca_rtx_fmt_ee (test, code, value_mode, reg1, reg2);
>>>> 
>>>> If that looks ok, I'll resend the series.
>>> 
>>> that looks OK to me - please leave Richard S. time to comment.  Also while
>>> I'd like to see
>>> 
>>> rtx reg1 = alloca_raw_REG (cmp_op_mode, LAST_VIRTUAL_REGISTER + 1);
>>> 
>>> I don't really see a way to write that portably (or at all), do you all agree?
>>> GCC doesn't seem to convert alloca() calls to __builtin_stack_save/restore
>>> nor place CLOBBERs to end their lifetime.  But is it guaranteed that the
>>> alloca result is valid until frame termination?
>> 
>> Hmm, the alloca man page says:
>> 
>>       The alloca() function allocates size bytes of space in the stack
>>       frame of the caller.  This temporary space is automatically freed
>>       when the function that called alloca() returns to its caller.
>> ...
>>       The space allocated by alloca() is not automatically deallocated if
>>       the pointer that refers to it simply goes out of scope.
>> 
>> A quick experiment with gcc and clang confirms this.  I think this means
>> I can make alloca_raw_REG macro return the allocated pointer using the
>> return-from-block GNU extension.
>
> What do you think about the following approach?
>
> extern rtx rtx_init (rtx, RTX_CODE);
> #define rtx_alloca(code) \
>   rtx_init ((rtx) alloca (RTX_CODE_SIZE ((code))), (code))
>
> ...
>
> rtx
> rtx_init (rtx rt, RTX_CODE code)
> {
>   /* We want to clear everything up to the FLD array.  Normally, this
>      is one int, but we don't want to assume that and it isn't very
>      portable anyway; this is.  */
>   memset (rt, 0, RTX_HDR_SIZE);

Might as well remove this comment while you're there.  RTX_HDR_SIZE
has long ceased to be sizeof (int) for any host.

>   PUT_CODE (rt, code);
>   return rt;
> }
>
> with similar changes to alloca_raw_REG and gengenrtl.  This way we don't
> even need a GNU extension.

Sounds good to me, but rtx_init should probably be an inline function.

Thanks,
Richard
Richard Biener Aug. 27, 2019, 11:21 a.m. UTC | #12
On Tue, Aug 27, 2019 at 9:01 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Ilya Leoshkevich <iii@linux.ibm.com> writes:
> >> Am 26.08.2019 um 15:17 schrieb Ilya Leoshkevich <iii@linux.ibm.com>:
> >>
> >>> Am 26.08.2019 um 15:06 schrieb Richard Biener <richard.guenther@gmail.com>:
> >>>
> >>> On Mon, Aug 26, 2019 at 1:54 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
> >>>>
> >>>>> Am 26.08.2019 um 10:49 schrieb Richard Biener <richard.guenther@gmail.com>:
> >>>>>
> >>>>> On Fri, Aug 23, 2019 at 1:35 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
> >>>>>>
> >>>>>>> Am 23.08.2019 um 13:24 schrieb Richard Biener <richard.guenther@gmail.com>:
> >>>>>>>
> >>>>>>> On Fri, Aug 23, 2019 at 12:43 PM Richard Sandiford
> >>>>>>> <richard.sandiford@arm.com> wrote:
> >>>>>>>>
> >>>>>>>> Ilya Leoshkevich <iii@linux.ibm.com> writes:
> >>>>>>>>> @@ -3819,6 +3820,82 @@ can_compare_p (enum rtx_code code, machine_mode mode,
> >>>>>>>>> return 0;
> >>>>>>>>> }
> >>>>>>>>>
> >>>>>>>>> +/* can_vector_compare_p presents fake rtx binary operations to the the back-end
> >>>>>>>>> +   in order to determine its capabilities.  In order to avoid creating fake
> >>>>>>>>> +   operations on each call, values from previous calls are cached in a global
> >>>>>>>>> +   cached_binops hash_table.  It contains rtxes, which can be looked up using
> >>>>>>>>> +   binop_keys.  */
> >>>>>>>>> +
> >>>>>>>>> +struct binop_key {
> >>>>>>>>> +  enum rtx_code code;        /* Operation code.  */
> >>>>>>>>> +  machine_mode value_mode;   /* Result mode.     */
> >>>>>>>>> +  machine_mode cmp_op_mode;  /* Operand mode.    */
> >>>>>>>>> +};
> >>>>>>>>> +
> >>>>>>>>> +struct binop_hasher : pointer_hash_mark<rtx>, ggc_cache_remove<rtx> {
> >>>>>>>>> +  typedef rtx value_type;
> >>>>>>>>> +  typedef binop_key compare_type;
> >>>>>>>>> +
> >>>>>>>>> +  static hashval_t
> >>>>>>>>> +  hash (enum rtx_code code, machine_mode value_mode, machine_mode cmp_op_mode)
> >>>>>>>>> +  {
> >>>>>>>>> +    inchash::hash hstate (0);
> >>>>>>>>> +    hstate.add_int (code);
> >>>>>>>>> +    hstate.add_int (value_mode);
> >>>>>>>>> +    hstate.add_int (cmp_op_mode);
> >>>>>>>>> +    return hstate.end ();
> >>>>>>>>> +  }
> >>>>>>>>> +
> >>>>>>>>> +  static hashval_t
> >>>>>>>>> +  hash (const rtx &ref)
> >>>>>>>>> +  {
> >>>>>>>>> +    return hash (GET_CODE (ref), GET_MODE (ref), GET_MODE (XEXP (ref, 0)));
> >>>>>>>>> +  }
> >>>>>>>>> +
> >>>>>>>>> +  static bool
> >>>>>>>>> +  equal (const rtx &ref1, const binop_key &ref2)
> >>>>>>>>> +  {
> >>>>>>>>> +    return (GET_CODE (ref1) == ref2.code)
> >>>>>>>>> +        && (GET_MODE (ref1) == ref2.value_mode)
> >>>>>>>>> +        && (GET_MODE (XEXP (ref1, 0)) == ref2.cmp_op_mode);
> >>>>>>>>> +  }
> >>>>>>>>> +};
> >>>>>>>>> +
> >>>>>>>>> +static GTY ((cache)) hash_table<binop_hasher> *cached_binops;
> >>>>>>>>> +
> >>>>>>>>> +static rtx
> >>>>>>>>> +get_cached_binop (enum rtx_code code, machine_mode value_mode,
> >>>>>>>>> +               machine_mode cmp_op_mode)
> >>>>>>>>> +{
> >>>>>>>>> +  if (!cached_binops)
> >>>>>>>>> +    cached_binops = hash_table<binop_hasher>::create_ggc (1024);
> >>>>>>>>> +  binop_key key = { code, value_mode, cmp_op_mode };
> >>>>>>>>> +  hashval_t hash = binop_hasher::hash (code, value_mode, cmp_op_mode);
> >>>>>>>>> +  rtx *slot = cached_binops->find_slot_with_hash (key, hash, INSERT);
> >>>>>>>>> +  if (!*slot)
> >>>>>>>>> +    *slot = gen_rtx_fmt_ee (code, value_mode, gen_reg_rtx (cmp_op_mode),
> >>>>>>>>> +                         gen_reg_rtx (cmp_op_mode));
> >>>>>>>>> +  return *slot;
> >>>>>>>>> +}
> >>>>>>>>
> >>>>>>>> Sorry, I didn't mean anything this complicated.  I just meant that
> >>>>>>>> we should have a single cached rtx that we can change via PUT_CODE and
> >>>>>>>> PUT_MODE_RAW for each new query, rather than allocating a new rtx each
> >>>>>>>> time.
> >>>>>>>>
> >>>>>>>> Something like:
> >>>>>>>>
> >>>>>>>> static GTY ((cache)) rtx cached_binop;
> >>>>>>>>
> >>>>>>>> rtx
> >>>>>>>> get_cached_binop (machine_mode mode, rtx_code code, machine_mode op_mode)
> >>>>>>>> {
> >>>>>>>> if (cached_binop)
> >>>>>>>> {
> >>>>>>>>   PUT_CODE (cached_binop, code);
> >>>>>>>>   PUT_MODE_RAW (cached_binop, mode);
> >>>>>>>>   PUT_MODE_RAW (XEXP (cached_binop, 0), op_mode);
> >>>>>>>>   PUT_MODE_RAW (XEXP (cached_binop, 1), op_mode);
> >>>>>>>> }
> >>>>>>>> else
> >>>>>>>> {
> >>>>>>>>   rtx reg1 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 1);
> >>>>>>>>   rtx reg2 = gen_raw_REG (op_mode, LAST_VIRTUAL_REGISTER + 2);
> >>>>>>>>   cached_binop = gen_rtx_fmt_ee (code, mode, reg1, reg2);
> >>>>>>>> }
> >>>>>>>> return cached_binop;
> >>>>>>>> }
> >>>>>>>
> >>>>>>> Hmm, maybe we need  auto_rtx (code) that constructs such
> >>>>>>> RTX on the stack instead of wasting a GC root (and causing
> >>>>>>> issues for future threading of GCC ;)).
> >>>>>>
> >>>>>> Do you mean something like this?
> >>>>>>
> >>>>>> union {
> >>>>>> char raw[rtx_code_size[code]];
> >>>>>> rtx rtx;
> >>>>>> } binop;
> >>>>>>
> >>>>>> Does this exist already (git grep auto.*rtx / rtx.*auto doesn't show
> >>>>>> anything useful), or should I implement this?
> >>>>>
> >>>>> It doesn't exist AFAIK, I thought about using alloca like
> >>>>>
> >>>>> rtx tem;
> >>>>> rtx_alloca (tem, PLUS);
> >>>>>
> >>>>> and due to using alloca rtx_alloca has to be a macro like
> >>>>>
> >>>>> #define rtx_alloca(r, code) r = (rtx)alloca (RTX_CODE_SIZE(code));
> >>>>> memset (r, 0, RTX_HDR_SIZE); PUT_CODE (r, code);
> >>>>>
> >>>>> maybe C++ can help making this prettier but of course
> >>>>> since we use alloca we have to avoid opening new scopes.
> >>>>>
> >>>>> I guess templates like with auto_vec doesn't work unless
> >>>>> we can make RTX_CODE_SIZE constant-evaluated.
> >>>>>
> >>>>> Richard.
> >>>>
> >>>> I ended up with the following change:
> >>>>
> >>>> diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
> >>>> index a667cdab94e..97aa2144e95 100644
> >>>> --- a/gcc/emit-rtl.c
> >>>> +++ b/gcc/emit-rtl.c
> >>>> @@ -466,17 +466,25 @@ set_mode_and_regno (rtx x, machine_mode mode, unsigned int regno)
> >>>>  set_regno_raw (x, regno, nregs);
> >>>> }
> >>>>
> >>>> -/* Generate a new REG rtx.  Make sure ORIGINAL_REGNO is set properly, and
> >>>> +/* Initialize a REG rtx.  Make sure ORIGINAL_REGNO is set properly, and
> >>>>   don't attempt to share with the various global pieces of rtl (such as
> >>>>   frame_pointer_rtx).  */
> >>>>
> >>>> -rtx
> >>>> -gen_raw_REG (machine_mode mode, unsigned int regno)
> >>>> +void
> >>>> +init_raw_REG (rtx x, machine_mode mode, unsigned int regno)
> >>>> {
> >>>> -  rtx x = rtx_alloc (REG MEM_STAT_INFO);
> >>>>  set_mode_and_regno (x, mode, regno);
> >>>>  REG_ATTRS (x) = NULL;
> >>>>  ORIGINAL_REGNO (x) = regno;
> >>>> +}
> >>>> +
> >>>> +/* Generate a new REG rtx.  */
> >>>> +
> >>>> +rtx
> >>>> +gen_raw_REG (machine_mode mode, unsigned int regno)
> >>>> +{
> >>>> +  rtx x = rtx_alloc (REG MEM_STAT_INFO);
> >>>> +  init_raw_REG (x, mode, regno);
> >>>>  return x;
> >>>> }
> >>>>
> >>>> diff --git a/gcc/gengenrtl.c b/gcc/gengenrtl.c
> >>>> index 5c78fabfb50..bb2087da258 100644
> >>>> --- a/gcc/gengenrtl.c
> >>>> +++ b/gcc/gengenrtl.c
> >>>> @@ -231,8 +231,7 @@ genmacro (int idx)
> >>>>  puts (")");
> >>>> }
> >>>>
> >>>> -/* Generate the code for the function to generate RTL whose
> >>>> -   format is FORMAT.  */
> >>>> +/* Generate the code for functions to generate RTL whose format is FORMAT.  */
> >>>>
> >>>> static void
> >>>> gendef (const char *format)
> >>>> @@ -240,22 +239,18 @@ gendef (const char *format)
> >>>>  const char *p;
> >>>>  int i, j;
> >>>>
> >>>> -  /* Start by writing the definition of the function name and the types
> >>>> +  /* Write the definition of the init function name and the types
> >>>>     of the arguments.  */
> >>>>
> >>>> -  printf ("static inline rtx\ngen_rtx_fmt_%s_stat (RTX_CODE code, machine_mode mode", format);
> >>>> +  puts ("static inline void");
> >>>> +  printf ("init_rtx_fmt_%s (rtx rt, machine_mode mode", format);
> >>>>  for (p = format, i = 0; *p != 0; p++)
> >>>>    if (*p != '0')
> >>>>      printf (",\n\t%sarg%d", type_from_format (*p), i++);
> >>>> +  puts (")");
> >>>>
> >>>> -  puts (" MEM_STAT_DECL)");
> >>>> -
> >>>> -  /* Now write out the body of the function itself, which allocates
> >>>> -     the memory and initializes it.  */
> >>>> +  /* Now write out the body of the init function itself.  */
> >>>>  puts ("{");
> >>>> -  puts ("  rtx rt;");
> >>>> -  puts ("  rt = rtx_alloc (code PASS_MEM_STAT);\n");
> >>>> -
> >>>>  puts ("  PUT_MODE_RAW (rt, mode);");
> >>>>
> >>>>  for (p = format, i = j = 0; *p ; ++p, ++i)
> >>>> @@ -266,16 +261,56 @@ gendef (const char *format)
> >>>>    else
> >>>>      printf ("  %s (rt, %d) = arg%d;\n", accessor_from_format (*p), i, j++);
> >>>>
> >>>> -  puts ("\n  return rt;\n}\n");
> >>>> +  puts ("}\n");
> >>>> +
> >>>> +  /* Write the definition of the gen function name and the types
> >>>> +     of the arguments.  */
> >>>> +
> >>>> +  puts ("static inline rtx");
> >>>> +  printf ("gen_rtx_fmt_%s_stat (RTX_CODE code, machine_mode mode", format);
> >>>> +  for (p = format, i = 0; *p != 0; p++)
> >>>> +    if (*p != '0')
> >>>> +      printf (",\n\t%sarg%d", type_from_format (*p), i++);
> >>>> +  puts (" MEM_STAT_DECL)");
> >>>> +
> >>>> +  /* Now write out the body of the function itself, which allocates
> >>>> +     the memory and initializes it.  */
> >>>> +  puts ("{");
> >>>> +  puts ("  rtx rt;\n");
> >>>> +
> >>>> +  puts ("  rt = rtx_alloc (code PASS_MEM_STAT);");
> >>>> +  printf ("  init_rtx_fmt_%s (rt, mode", format);
> >>>> +  for (p = format, i = 0; *p != 0; p++)
> >>>> +    if (*p != '0')
> >>>> +      printf (", arg%d", i++);
> >>>> +  puts (");\n");
> >>>> +
> >>>> +  puts ("  return rt;\n}\n");
> >>>> +
> >>>> +  /* Write the definition of gen macro.  */
> >>>> +
> >>>>  printf ("#define gen_rtx_fmt_%s(c, m", format);
> >>>>  for (p = format, i = 0; *p != 0; p++)
> >>>>    if (*p != '0')
> >>>> -      printf (", p%i",i++);
> >>>> -  printf (")\\\n        gen_rtx_fmt_%s_stat (c, m", format);
> >>>> +      printf (", arg%d", i++);
> >>>> +  printf (") \\\n  gen_rtx_fmt_%s_stat ((c), (m)", format);
> >>>>  for (p = format, i = 0; *p != 0; p++)
> >>>>    if (*p != '0')
> >>>> -      printf (", p%i",i++);
> >>>> +      printf (", (arg%d)", i++);
> >>>>  printf (" MEM_STAT_INFO)\n\n");
> >>>> +
> >>>> +  /* Write the definition of alloca macro.  */
> >>>> +
> >>>> +  printf ("#define alloca_rtx_fmt_%s(rt, c, m", format);
> >>>> +  for (p = format, i = 0; *p != 0; p++)
> >>>> +    if (*p != '0')
> >>>> +      printf (", arg%d", i++);
> >>>> +  printf (") \\\n  rtx_alloca ((rt), (c)); \\\n");
> >>>> +  printf ("  init_rtx_fmt_%s ((rt), (m)", format);
> >>>> +  for (p = format, i = 0; *p != 0; p++)
> >>>> +    if (*p != '0')
> >>>> +      printf (", (arg%d)", i++);
> >>>> +  printf (")\n\n");
> >>>> }
> >>>>
> >>>> /* Generate the documentation header for files we write.  */
> >>>> diff --git a/gcc/rtl.h b/gcc/rtl.h
> >>>> index efb9b3ce40d..44733d8a39e 100644
> >>>> --- a/gcc/rtl.h
> >>>> +++ b/gcc/rtl.h
> >>>> @@ -2933,6 +2933,10 @@ extern HOST_WIDE_INT get_stack_check_protect (void);
> >>>>
> >>>> /* In rtl.c */
> >>>> extern rtx rtx_alloc (RTX_CODE CXX_MEM_STAT_INFO);
> >>>> +#define rtx_alloca(rt, code) \
> >>>> +  (rt) = (rtx) alloca (RTX_CODE_SIZE ((code))); \
> >>>> +  memset ((rt), 0, RTX_HDR_SIZE); \
> >>>> +  PUT_CODE ((rt), (code));
> >>>> extern rtx rtx_alloc_stat_v (RTX_CODE MEM_STAT_DECL, int);
> >>>> #define rtx_alloc_v(c, SZ) rtx_alloc_stat_v (c MEM_STAT_INFO, SZ)
> >>>> #define const_wide_int_alloc(NWORDS)                           \
> >>>> @@ -3797,7 +3801,11 @@ gen_rtx_INSN (machine_mode mode, rtx_insn *prev_insn, rtx_insn *next_insn,
> >>>> extern rtx gen_rtx_CONST_INT (machine_mode, HOST_WIDE_INT);
> >>>> extern rtx gen_rtx_CONST_VECTOR (machine_mode, rtvec);
> >>>> extern void set_mode_and_regno (rtx, machine_mode, unsigned int);
> >>>> +extern void init_raw_REG (rtx, machine_mode, unsigned int);
> >>>> extern rtx gen_raw_REG (machine_mode, unsigned int);
> >>>> +#define alloca_raw_REG(rt, mode, regno) \
> >>>> +  rtx_alloca ((rt), REG); \
> >>>> +  init_raw_REG ((rt), (mode), (regno))
> >>>> extern rtx gen_rtx_REG (machine_mode, unsigned int);
> >>>> extern rtx gen_rtx_SUBREG (machine_mode, rtx, poly_uint64);
> >>>> extern rtx gen_rtx_MEM (machine_mode, rtx);
> >>>>
> >>>> which now allows me to write:
> >>>>
> >>>> rtx reg1, reg2, test;
> >>>> alloca_raw_REG (reg1, cmp_op_mode, LAST_VIRTUAL_REGISTER + 1);
> >>>> alloca_raw_REG (reg2, cmp_op_mode, LAST_VIRTUAL_REGISTER + 2);
> >>>> alloca_rtx_fmt_ee (test, code, value_mode, reg1, reg2);
> >>>>
> >>>> If that looks ok, I'll resend the series.
> >>>
> >>> that looks OK to me - please leave Richard S. time to comment.  Also while
> >>> I'd like to see
> >>>
> >>> rtx reg1 = alloca_raw_REG (cmp_op_mode, LAST_VIRTUAL_REGISTER + 1);
> >>>
> >>> I don't really see a way to write that portably (or at all), do you all agree?
> >>> GCC doesn't seem to convert alloca() calls to __builtin_stack_save/restore
> >>> nor place CLOBBERs to end their lifetime.  But is it guaranteed that the
> >>> alloca result is valid until frame termination?
> >>
> >> Hmm, the alloca man page says:
> >>
> >>       The alloca() function allocates size bytes of space in the stack
> >>       frame of the caller.  This temporary space is automatically freed
> >>       when the function that called alloca() returns to its caller.
> >> ...
> >>       The space allocated by alloca() is not automatically deallocated if
> >>       the pointer that refers to it simply goes out of scope.
> >>
> >> A quick experiment with gcc and clang confirms this.  I think this means
> >> I can make alloca_raw_REG macro return the allocated pointer using the
> >> return-from-block GNU extension.
> >
> > What do you think about the following approach?
> >
> > extern rtx rtx_init (rtx, RTX_CODE);
> > #define rtx_alloca(code) \
> >   rtx_init ((rtx) alloca (RTX_CODE_SIZE ((code))), (code))
> >
> > ...
> >
> > rtx
> > rtx_init (rtx rt, RTX_CODE code)
> > {
> >   /* We want to clear everything up to the FLD array.  Normally, this
> >      is one int, but we don't want to assume that and it isn't very
> >      portable anyway; this is.  */
> >   memset (rt, 0, RTX_HDR_SIZE);
>
> Might as well remove this comment while you're there.  RTX_HDR_SIZE
> has long ceased to be sizeof (int) for any host.
>
> >   PUT_CODE (rt, code);
> >   return rt;
> > }
> >
> > with similar changes to alloca_raw_REG and gengenrtl.  This way we don't
> > even need a GNU extension.
>
> Sounds good to me, but rtx_init should probably be an inline function.

Sounds good to me, too.

Richard.

> Thanks,
> Richard
diff mbox series

Patch

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 597dc01328b..d2207da5657 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -2541,7 +2541,7 @@  GTFILES = $(CPPLIB_H) $(srcdir)/input.h $(srcdir)/coretypes.h \
   $(srcdir)/function.c $(srcdir)/except.c \
   $(srcdir)/ggc-tests.c \
   $(srcdir)/gcse.c $(srcdir)/godump.c \
-  $(srcdir)/lists.c $(srcdir)/optabs-libfuncs.c \
+  $(srcdir)/lists.c $(srcdir)/optabs.c $(srcdir)/optabs-libfuncs.c \
   $(srcdir)/profile.c $(srcdir)/mcf.c \
   $(srcdir)/reg-stack.c $(srcdir)/cfgrtl.c \
   $(srcdir)/stor-layout.c \
diff --git a/gcc/optabs-tree.c b/gcc/optabs-tree.c
index 8157798cc71..e68bb39c021 100644
--- a/gcc/optabs-tree.c
+++ b/gcc/optabs-tree.c
@@ -23,7 +23,10 @@  along with GCC; see the file COPYING3.  If not see
 #include "coretypes.h"
 #include "target.h"
 #include "insn-codes.h"
+#include "rtl.h"
 #include "tree.h"
+#include "memmodel.h"
+#include "optabs.h"
 #include "optabs-tree.h"
 #include "stor-layout.h"
 
@@ -347,8 +350,12 @@  expand_vec_cond_expr_p (tree value_type, tree cmp_op_type, enum tree_code code)
       || maybe_ne (GET_MODE_NUNITS (value_mode), GET_MODE_NUNITS (cmp_op_mode)))
     return false;
 
-  if (get_vcond_icode (TYPE_MODE (value_type), TYPE_MODE (cmp_op_type),
-		       TYPE_UNSIGNED (cmp_op_type)) == CODE_FOR_nothing
+  bool unsigned_p = TYPE_UNSIGNED (cmp_op_type);
+  if (((get_vcond_icode (TYPE_MODE (value_type), TYPE_MODE (cmp_op_type),
+			 unsigned_p) == CODE_FOR_nothing)
+       || !can_vector_compare_p (get_rtx_code (code, unsigned_p),
+				 TYPE_MODE (value_type),
+				 TYPE_MODE (cmp_op_type), cvcp_vcond))
       && ((code != EQ_EXPR && code != NE_EXPR)
 	  || get_vcond_eq_icode (TYPE_MODE (value_type),
 				 TYPE_MODE (cmp_op_type)) == CODE_FOR_nothing))
diff --git a/gcc/optabs.c b/gcc/optabs.c
index 9e54dda6e7f..07b4d824822 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -34,6 +34,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "recog.h"
 #include "diagnostic-core.h"
 #include "rtx-vector-builder.h"
+#include "hash-table.h"
 
 /* Include insn-config.h before expr.h so that HAVE_conditional_move
    is properly defined.  */
@@ -3819,6 +3820,82 @@  can_compare_p (enum rtx_code code, machine_mode mode,
   return 0;
 }
 
+/* can_vector_compare_p presents fake rtx binary operations to the the back-end
+   in order to determine its capabilities.  In order to avoid creating fake
+   operations on each call, values from previous calls are cached in a global
+   cached_binops hash_table.  It contains rtxes, which can be looked up using
+   binop_keys.  */
+
+struct binop_key {
+  enum rtx_code code;        /* Operation code.  */
+  machine_mode value_mode;   /* Result mode.     */
+  machine_mode cmp_op_mode;  /* Operand mode.    */
+};
+
+struct binop_hasher : pointer_hash_mark<rtx>, ggc_cache_remove<rtx> {
+  typedef rtx value_type;
+  typedef binop_key compare_type;
+
+  static hashval_t
+  hash (enum rtx_code code, machine_mode value_mode, machine_mode cmp_op_mode)
+  {
+    inchash::hash hstate (0);
+    hstate.add_int (code);
+    hstate.add_int (value_mode);
+    hstate.add_int (cmp_op_mode);
+    return hstate.end ();
+  }
+
+  static hashval_t
+  hash (const rtx &ref)
+  {
+    return hash (GET_CODE (ref), GET_MODE (ref), GET_MODE (XEXP (ref, 0)));
+  }
+
+  static bool
+  equal (const rtx &ref1, const binop_key &ref2)
+  {
+    return (GET_CODE (ref1) == ref2.code)
+	   && (GET_MODE (ref1) == ref2.value_mode)
+	   && (GET_MODE (XEXP (ref1, 0)) == ref2.cmp_op_mode);
+  }
+};
+
+static GTY ((cache)) hash_table<binop_hasher> *cached_binops;
+
+static rtx
+get_cached_binop (enum rtx_code code, machine_mode value_mode,
+		  machine_mode cmp_op_mode)
+{
+  if (!cached_binops)
+    cached_binops = hash_table<binop_hasher>::create_ggc (1024);
+  binop_key key = { code, value_mode, cmp_op_mode };
+  hashval_t hash = binop_hasher::hash (code, value_mode, cmp_op_mode);
+  rtx *slot = cached_binops->find_slot_with_hash (key, hash, INSERT);
+  if (!*slot)
+    *slot = gen_rtx_fmt_ee (code, value_mode, gen_reg_rtx (cmp_op_mode),
+			    gen_reg_rtx (cmp_op_mode));
+  return *slot;
+}
+
+bool
+can_vector_compare_p (enum rtx_code code, machine_mode value_mode,
+		      machine_mode cmp_op_mode,
+		      enum can_vector_compare_purpose purpose)
+{
+  enum insn_code icode;
+  bool unsigned_p = (code == LTU || code == LEU || code == GTU || code == GEU);
+  rtx test = get_cached_binop(code, value_mode, cmp_op_mode);
+
+  if (purpose == cvcp_vcond
+      && (icode = get_vcond_icode (value_mode, cmp_op_mode, unsigned_p))
+	 != CODE_FOR_nothing
+      && insn_operand_matches (icode, 3, test))
+    return true;
+
+  return false;
+}
+
 /* This function is called when we are going to emit a compare instruction that
    compares the values found in X and Y, using the rtl operator COMPARISON.
 
@@ -7481,3 +7558,5 @@  expand_jump_insn (enum insn_code icode, unsigned int nops,
   if (!maybe_expand_jump_insn (icode, nops, ops))
     gcc_unreachable ();
 }
+
+#include "gt-optabs.h"
diff --git a/gcc/optabs.h b/gcc/optabs.h
index 897bb5d4443..2b2338a67af 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -242,6 +242,21 @@  enum can_compare_purpose
    (without splitting it into pieces).  */
 extern int can_compare_p (enum rtx_code, machine_mode,
 			  enum can_compare_purpose);
+
+/* The various uses that a vector comparison can have; used by
+   can_vector_compare_p.  So far only vcond is defined, vec_cmp is a possible
+   future extension.  */
+enum can_vector_compare_purpose
+{
+  cvcp_vcond
+};
+
+/* Return whether back-end can emit a vector comparison insn(s) using a given
+   CODE, with operands with CMP_OP_MODE, producing a result with VALUE_MODE,
+   in order to achieve a PURPOSE.  */
+extern bool can_vector_compare_p (enum rtx_code, machine_mode, machine_mode,
+				  enum can_vector_compare_purpose);
+
 extern rtx prepare_operand (enum insn_code, rtx, int, machine_mode,
 			    machine_mode, int);
 /* Emit a pair of rtl insns to compare two rtx's and to jump