diff mbox series

Rewrite memset expanders with vec_duplicate

Message ID 20210713214956.2010942-1-hjl.tools@gmail.com
State New
Headers show
Series Rewrite memset expanders with vec_duplicate | expand

Commit Message

H.J. Lu July 13, 2021, 9:49 p.m. UTC
1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
scratch register to avoid stack realignment when expanding memset.

	PR middle-end/90773
	* builtins.c (gen_memset_value_from_prev): New function.
	(gen_memset_broadcast): Likewise.
	(builtin_memset_read_str): Use gen_memset_value_from_prev
	and gen_memset_broadcast.
	(builtin_memset_gen_str): Likewise.
	* target.def (gen_memset_scratch_rtx): New hook.
	* doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
	* doc/tm.texi: Regenerated.
---
 gcc/builtins.c     | 123 +++++++++++++++++++++++++++++++++++++--------
 gcc/doc/tm.texi    |   5 ++
 gcc/doc/tm.texi.in |   2 +
 gcc/target.def     |   7 +++
 4 files changed, 116 insertions(+), 21 deletions(-)

Comments

Richard Sandiford July 16, 2021, 11:38 a.m. UTC | #1
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
> vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
> 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
> scratch register to avoid stack realignment when expanding memset.
>
> 	PR middle-end/90773
> 	* builtins.c (gen_memset_value_from_prev): New function.
> 	(gen_memset_broadcast): Likewise.
> 	(builtin_memset_read_str): Use gen_memset_value_from_prev
> 	and gen_memset_broadcast.
> 	(builtin_memset_gen_str): Likewise.
> 	* target.def (gen_memset_scratch_rtx): New hook.
> 	* doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> 	* doc/tm.texi: Regenerated.
> ---
>  gcc/builtins.c     | 123 +++++++++++++++++++++++++++++++++++++--------
>  gcc/doc/tm.texi    |   5 ++
>  gcc/doc/tm.texi.in |   2 +
>  gcc/target.def     |   7 +++
>  4 files changed, 116 insertions(+), 21 deletions(-)
>
> diff --git a/gcc/builtins.c b/gcc/builtins.c
> index 39ab139b7e1..c1758ae2efc 100644
> --- a/gcc/builtins.c
> +++ b/gcc/builtins.c
> @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
>    return NULL_RTX;
>  }
>  
> -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> -   bytes from constant string DATA + OFFSET and return it as target
> -   constant.  If PREV isn't nullptr, it has the RTL info from the
> +/* Return the RTL of a register in MODE generated from PREV in the
>     previous iteration.  */
>  
> -rtx
> -builtin_memset_read_str (void *data, void *prevp,
> -			 HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> -			 scalar_int_mode mode)
> +static rtx
> +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
>  {
> +  rtx target = nullptr;
>    by_pieces_prev *prev = (by_pieces_prev *) prevp;
>    if (prev != nullptr && prev->data != nullptr)
>      {
>        /* Use the previous data in the same mode.  */
>        if (prev->mode == mode)
>  	return prev->data;
> +
> +      rtx prev_rtx = prev->data;
> +      machine_mode prev_mode = prev->mode;
> +      unsigned int word_size = GET_MODE_SIZE (word_mode);
> +      if (word_size < GET_MODE_SIZE (prev->mode)
> +	  && word_size > GET_MODE_SIZE (mode))
> +	{
> +	  /* First generate subreg of word mode if the previous mode is
> +	     wider than word mode and word mode is wider than MODE.  */
> +	  prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> +					  prev_mode, 0);
> +	  prev_mode = word_mode;
> +	}
> +      if (prev_rtx != nullptr)
> +	target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
>      }
> +  return target;
> +}
> +
> +/* Return the RTL of a register in MODE broadcasted from DATA.  */
> +
> +static rtx
> +gen_memset_broadcast (rtx data, scalar_int_mode mode)
> +{
> +  /* Skip if regno_reg_rtx isn't initialized.  */
> +  if (!regno_reg_rtx)
> +    return nullptr;
> +
> +  rtx target = nullptr;
> +
> +  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
> +  machine_mode vector_mode;
> +  if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
> +    gcc_unreachable ();

Sorry, I realise it's a bit late to be raising this objection now,
but I don't think it's a good idea to use scalar integer modes as
a proxy for vector modes.  In principle there's no reason why a
target has to define an integer mode for every vector mode.

If we want the mode to be a vector then I think the by-pieces
infrastructure should be extended to support vectors directly,
rather than assuming that each piece can be represented as
a scalar_int_mode.

Thanks,
Richard

> +
> +  enum insn_code icode = optab_handler (vec_duplicate_optab,
> +					vector_mode);
> +  if (icode != CODE_FOR_nothing)
> +    {
> +      rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
> +      if (CONST_INT_P (data))
> +	{
> +	  /* Use the move expander with CONST_VECTOR.  */
> +	  rtx const_vec = gen_const_vec_duplicate (vector_mode, data);
> +	  emit_move_insn (reg, const_vec);
> +	}
> +      else
> +	{
> +
> +	  class expand_operand ops[2];
> +	  create_output_operand (&ops[0], reg, vector_mode);
> +	  create_input_operand (&ops[1], data, QImode);
> +	  expand_insn (icode, 2, ops);
> +	  if (!rtx_equal_p (reg, ops[0].value))
> +	    emit_move_insn (reg, ops[0].value);
> +	}
> +      target = lowpart_subreg (mode, reg, vector_mode);
> +    }
> +
> +  return target;
> +}
> +
> +/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> +   bytes from constant string DATA + OFFSET and return it as target
> +   constant.  If PREV isn't nullptr, it has the RTL info from the
> +   previous iteration.  */
>  
> +rtx
> +builtin_memset_read_str (void *data, void *prev,
> +			 HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> +			 scalar_int_mode mode)
> +{
> +  rtx target;
>    const char *c = (const char *) data;
> -  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> +  char *p;
> +
> +  /* Don't use the previous value if size is 1.  */
> +  if (GET_MODE_SIZE (mode) != 1)
> +    {
> +      target = gen_memset_value_from_prev (prev, mode);
> +      if (target != nullptr)
> +	return target;
> +
> +      p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
> +      memset (p, *c, GET_MODE_SIZE (QImode));
> +      rtx src = c_readstr (p, QImode);
> +      target = gen_memset_broadcast (src, mode);
> +      if (target != nullptr)
> +	return target;
> +    }
> +
> +  p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
>  
>    memset (p, *c, GET_MODE_SIZE (mode));
>  
> @@ -6719,7 +6804,7 @@ builtin_memset_read_str (void *data, void *prevp,
>     nullptr, it has the RTL info from the previous iteration.  */
>  
>  static rtx
> -builtin_memset_gen_str (void *data, void *prevp,
> +builtin_memset_gen_str (void *data, void *prev,
>  			HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
>  			scalar_int_mode mode)
>  {
> @@ -6727,22 +6812,18 @@ builtin_memset_gen_str (void *data, void *prevp,
>    size_t size;
>    char *p;
>  
> -  by_pieces_prev *prev = (by_pieces_prev *) prevp;
> -  if (prev != nullptr && prev->data != nullptr)
> -    {
> -      /* Use the previous data in the same mode.  */
> -      if (prev->mode == mode)
> -	return prev->data;
> -
> -      target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
> -      if (target != nullptr)
> -	return target;
> -    }
> -
>    size = GET_MODE_SIZE (mode);
>    if (size == 1)
>      return (rtx) data;
>  
> +  target = gen_memset_value_from_prev (prev, mode);
> +  if (target != nullptr)
> +    return target;
> +
> +  target = gen_memset_broadcast ((rtx) data, mode);
> +  if (target != nullptr)
> +    return target;
> +
>    p = XALLOCAVEC (char, size);
>    memset (p, 1, size);
>    coeff = c_readstr (p, mode);
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 2a41ae5fba1..8849711fcf8 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -12122,6 +12122,11 @@ This function prepares to emit a conditional comparison within a sequence
>   @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
>  @end deftypefn
>  
> +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})
> +This hook should return an rtx for scratch register in @var{mode} to
> +be used by memset broadcast. The default is @code{gen_reg_rtx}.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
>  This target hook returns a new value for the number of times @var{loop}
>  should be unrolled. The parameter @var{nunroll} is the number of times
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index f881cdabe9e..a6bbf4f2667 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -7958,6 +7958,8 @@ lists.
>  
>  @hook TARGET_GEN_CCMP_NEXT
>  
> +@hook TARGET_GEN_MEMSET_SCRATCH_RTX
> +
>  @hook TARGET_LOOP_UNROLL_ADJUST
>  
>  @defmac POWI_MAX_MULTS
> diff --git a/gcc/target.def b/gcc/target.def
> index c009671c583..5fb287db3bd 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -2726,6 +2726,13 @@ DEFHOOK
>   rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),
>   NULL)
>  
> +DEFHOOK
> +(gen_memset_scratch_rtx,
> + "This hook should return an rtx for scratch register in @var{mode} to\n\
> +be used by memset broadcast. The default is @code{gen_reg_rtx}.",
> + rtx, (machine_mode mode),
> + gen_reg_rtx)
> +
>  /* Return a new value for loop unroll size.  */
>  DEFHOOK
>  (loop_unroll_adjust,
H.J. Lu July 16, 2021, 12:57 p.m. UTC | #2
On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
> > scratch register to avoid stack realignment when expanding memset.
> >
> >       PR middle-end/90773
> >       * builtins.c (gen_memset_value_from_prev): New function.
> >       (gen_memset_broadcast): Likewise.
> >       (builtin_memset_read_str): Use gen_memset_value_from_prev
> >       and gen_memset_broadcast.
> >       (builtin_memset_gen_str): Likewise.
> >       * target.def (gen_memset_scratch_rtx): New hook.
> >       * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> >       * doc/tm.texi: Regenerated.
> > ---
> >  gcc/builtins.c     | 123 +++++++++++++++++++++++++++++++++++++--------
> >  gcc/doc/tm.texi    |   5 ++
> >  gcc/doc/tm.texi.in |   2 +
> >  gcc/target.def     |   7 +++
> >  4 files changed, 116 insertions(+), 21 deletions(-)
> >
> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> > index 39ab139b7e1..c1758ae2efc 100644
> > --- a/gcc/builtins.c
> > +++ b/gcc/builtins.c
> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
> >    return NULL_RTX;
> >  }
> >
> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> > -   bytes from constant string DATA + OFFSET and return it as target
> > -   constant.  If PREV isn't nullptr, it has the RTL info from the
> > +/* Return the RTL of a register in MODE generated from PREV in the
> >     previous iteration.  */
> >
> > -rtx
> > -builtin_memset_read_str (void *data, void *prevp,
> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > -                      scalar_int_mode mode)
> > +static rtx
> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
> >  {
> > +  rtx target = nullptr;
> >    by_pieces_prev *prev = (by_pieces_prev *) prevp;
> >    if (prev != nullptr && prev->data != nullptr)
> >      {
> >        /* Use the previous data in the same mode.  */
> >        if (prev->mode == mode)
> >       return prev->data;
> > +
> > +      rtx prev_rtx = prev->data;
> > +      machine_mode prev_mode = prev->mode;
> > +      unsigned int word_size = GET_MODE_SIZE (word_mode);
> > +      if (word_size < GET_MODE_SIZE (prev->mode)
> > +       && word_size > GET_MODE_SIZE (mode))
> > +     {
> > +       /* First generate subreg of word mode if the previous mode is
> > +          wider than word mode and word mode is wider than MODE.  */
> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> > +                                       prev_mode, 0);
> > +       prev_mode = word_mode;
> > +     }
> > +      if (prev_rtx != nullptr)
> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
> >      }
> > +  return target;
> > +}
> > +
> > +/* Return the RTL of a register in MODE broadcasted from DATA.  */
> > +
> > +static rtx
> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)
> > +{
> > +  /* Skip if regno_reg_rtx isn't initialized.  */
> > +  if (!regno_reg_rtx)
> > +    return nullptr;
> > +
> > +  rtx target = nullptr;
> > +
> > +  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
> > +  machine_mode vector_mode;
> > +  if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
> > +    gcc_unreachable ();
>
> Sorry, I realise it's a bit late to be raising this objection now,
> but I don't think it's a good idea to use scalar integer modes as
> a proxy for vector modes.  In principle there's no reason why a
> target has to define an integer mode for every vector mode.

A target always defines the largest integer mode.

> If we want the mode to be a vector then I think the by-pieces
> infrastructure should be extended to support vectors directly,
> rather than assuming that each piece can be represented as
> a scalar_int_mode.
>

The current by-pieces infrastructure operates on scalar_int_mode.
Only for memset, there is

/* Callback routine for store_by_pieces.  Return the RTL of a register
   containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
   char value given in the RTL register data.  For example, if mode is
   4 bytes wide, return the RTL for 0x01010101*data.  If PREV isn't
   nullptr, it has the RTL info from the previous iteration.  */

static rtx
builtin_memset_gen_str (void *data, void *prevp,
                        HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
                        scalar_int_mode mode)

It is a broadcast.  If a target can broadcast a byte to a wider integer,
can you suggest a way to use it in the current by-pieces infrastructure?

Thanks.

> Thanks,
> Richard
>
> > +
> > +  enum insn_code icode = optab_handler (vec_duplicate_optab,
> > +                                     vector_mode);
> > +  if (icode != CODE_FOR_nothing)
> > +    {
> > +      rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
> > +      if (CONST_INT_P (data))
> > +     {
> > +       /* Use the move expander with CONST_VECTOR.  */
> > +       rtx const_vec = gen_const_vec_duplicate (vector_mode, data);
> > +       emit_move_insn (reg, const_vec);
> > +     }
> > +      else
> > +     {
> > +
> > +       class expand_operand ops[2];
> > +       create_output_operand (&ops[0], reg, vector_mode);
> > +       create_input_operand (&ops[1], data, QImode);
> > +       expand_insn (icode, 2, ops);
> > +       if (!rtx_equal_p (reg, ops[0].value))
> > +         emit_move_insn (reg, ops[0].value);
> > +     }
> > +      target = lowpart_subreg (mode, reg, vector_mode);
> > +    }
> > +
> > +  return target;
> > +}
> > +
> > +/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> > +   bytes from constant string DATA + OFFSET and return it as target
> > +   constant.  If PREV isn't nullptr, it has the RTL info from the
> > +   previous iteration.  */
> >
> > +rtx
> > +builtin_memset_read_str (void *data, void *prev,
> > +                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > +                      scalar_int_mode mode)
> > +{
> > +  rtx target;
> >    const char *c = (const char *) data;
> > -  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> > +  char *p;
> > +
> > +  /* Don't use the previous value if size is 1.  */
> > +  if (GET_MODE_SIZE (mode) != 1)
> > +    {
> > +      target = gen_memset_value_from_prev (prev, mode);
> > +      if (target != nullptr)
> > +     return target;
> > +
> > +      p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
> > +      memset (p, *c, GET_MODE_SIZE (QImode));
> > +      rtx src = c_readstr (p, QImode);
> > +      target = gen_memset_broadcast (src, mode);
> > +      if (target != nullptr)
> > +     return target;
> > +    }
> > +
> > +  p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> >
> >    memset (p, *c, GET_MODE_SIZE (mode));
> >
> > @@ -6719,7 +6804,7 @@ builtin_memset_read_str (void *data, void *prevp,
> >     nullptr, it has the RTL info from the previous iteration.  */
> >
> >  static rtx
> > -builtin_memset_gen_str (void *data, void *prevp,
> > +builtin_memset_gen_str (void *data, void *prev,
> >                       HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> >                       scalar_int_mode mode)
> >  {
> > @@ -6727,22 +6812,18 @@ builtin_memset_gen_str (void *data, void *prevp,
> >    size_t size;
> >    char *p;
> >
> > -  by_pieces_prev *prev = (by_pieces_prev *) prevp;
> > -  if (prev != nullptr && prev->data != nullptr)
> > -    {
> > -      /* Use the previous data in the same mode.  */
> > -      if (prev->mode == mode)
> > -     return prev->data;
> > -
> > -      target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
> > -      if (target != nullptr)
> > -     return target;
> > -    }
> > -
> >    size = GET_MODE_SIZE (mode);
> >    if (size == 1)
> >      return (rtx) data;
> >
> > +  target = gen_memset_value_from_prev (prev, mode);
> > +  if (target != nullptr)
> > +    return target;
> > +
> > +  target = gen_memset_broadcast ((rtx) data, mode);
> > +  if (target != nullptr)
> > +    return target;
> > +
> >    p = XALLOCAVEC (char, size);
> >    memset (p, 1, size);
> >    coeff = c_readstr (p, mode);
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> > index 2a41ae5fba1..8849711fcf8 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -12122,6 +12122,11 @@ This function prepares to emit a conditional comparison within a sequence
> >   @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
> >  @end deftypefn
> >
> > +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})
> > +This hook should return an rtx for scratch register in @var{mode} to
> > +be used by memset broadcast. The default is @code{gen_reg_rtx}.
> > +@end deftypefn
> > +
> >  @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
> >  This target hook returns a new value for the number of times @var{loop}
> >  should be unrolled. The parameter @var{nunroll} is the number of times
> > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> > index f881cdabe9e..a6bbf4f2667 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -7958,6 +7958,8 @@ lists.
> >
> >  @hook TARGET_GEN_CCMP_NEXT
> >
> > +@hook TARGET_GEN_MEMSET_SCRATCH_RTX
> > +
> >  @hook TARGET_LOOP_UNROLL_ADJUST
> >
> >  @defmac POWI_MAX_MULTS
> > diff --git a/gcc/target.def b/gcc/target.def
> > index c009671c583..5fb287db3bd 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -2726,6 +2726,13 @@ DEFHOOK
> >   rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),
> >   NULL)
> >
> > +DEFHOOK
> > +(gen_memset_scratch_rtx,
> > + "This hook should return an rtx for scratch register in @var{mode} to\n\
> > +be used by memset broadcast. The default is @code{gen_reg_rtx}.",
> > + rtx, (machine_mode mode),
> > + gen_reg_rtx)
> > +
> >  /* Return a new value for loop unroll size.  */
> >  DEFHOOK
> >  (loop_unroll_adjust,
Richard Sandiford July 16, 2021, 1:23 p.m. UTC | #3
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
>> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
>> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
>> > scratch register to avoid stack realignment when expanding memset.
>> >
>> >       PR middle-end/90773
>> >       * builtins.c (gen_memset_value_from_prev): New function.
>> >       (gen_memset_broadcast): Likewise.
>> >       (builtin_memset_read_str): Use gen_memset_value_from_prev
>> >       and gen_memset_broadcast.
>> >       (builtin_memset_gen_str): Likewise.
>> >       * target.def (gen_memset_scratch_rtx): New hook.
>> >       * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
>> >       * doc/tm.texi: Regenerated.
>> > ---
>> >  gcc/builtins.c     | 123 +++++++++++++++++++++++++++++++++++++--------
>> >  gcc/doc/tm.texi    |   5 ++
>> >  gcc/doc/tm.texi.in |   2 +
>> >  gcc/target.def     |   7 +++
>> >  4 files changed, 116 insertions(+), 21 deletions(-)
>> >
>> > diff --git a/gcc/builtins.c b/gcc/builtins.c
>> > index 39ab139b7e1..c1758ae2efc 100644
>> > --- a/gcc/builtins.c
>> > +++ b/gcc/builtins.c
>> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
>> >    return NULL_RTX;
>> >  }
>> >
>> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
>> > -   bytes from constant string DATA + OFFSET and return it as target
>> > -   constant.  If PREV isn't nullptr, it has the RTL info from the
>> > +/* Return the RTL of a register in MODE generated from PREV in the
>> >     previous iteration.  */
>> >
>> > -rtx
>> > -builtin_memset_read_str (void *data, void *prevp,
>> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
>> > -                      scalar_int_mode mode)
>> > +static rtx
>> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
>> >  {
>> > +  rtx target = nullptr;
>> >    by_pieces_prev *prev = (by_pieces_prev *) prevp;
>> >    if (prev != nullptr && prev->data != nullptr)
>> >      {
>> >        /* Use the previous data in the same mode.  */
>> >        if (prev->mode == mode)
>> >       return prev->data;
>> > +
>> > +      rtx prev_rtx = prev->data;
>> > +      machine_mode prev_mode = prev->mode;
>> > +      unsigned int word_size = GET_MODE_SIZE (word_mode);
>> > +      if (word_size < GET_MODE_SIZE (prev->mode)
>> > +       && word_size > GET_MODE_SIZE (mode))
>> > +     {
>> > +       /* First generate subreg of word mode if the previous mode is
>> > +          wider than word mode and word mode is wider than MODE.  */
>> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
>> > +                                       prev_mode, 0);
>> > +       prev_mode = word_mode;
>> > +     }
>> > +      if (prev_rtx != nullptr)
>> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
>> >      }
>> > +  return target;
>> > +}
>> > +
>> > +/* Return the RTL of a register in MODE broadcasted from DATA.  */
>> > +
>> > +static rtx
>> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)
>> > +{
>> > +  /* Skip if regno_reg_rtx isn't initialized.  */
>> > +  if (!regno_reg_rtx)
>> > +    return nullptr;
>> > +
>> > +  rtx target = nullptr;
>> > +
>> > +  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
>> > +  machine_mode vector_mode;
>> > +  if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
>> > +    gcc_unreachable ();
>>
>> Sorry, I realise it's a bit late to be raising this objection now,
>> but I don't think it's a good idea to use scalar integer modes as
>> a proxy for vector modes.  In principle there's no reason why a
>> target has to define an integer mode for every vector mode.
>
> A target always defines the largest integer mode.

Right.  But a target shouldn't *need* to define an integer mode
for every vector mode.

>> If we want the mode to be a vector then I think the by-pieces
>> infrastructure should be extended to support vectors directly,
>> rather than assuming that each piece can be represented as
>> a scalar_int_mode.
>>
>
> The current by-pieces infrastructure operates on scalar_int_mode.
> Only for memset, there is
>
> /* Callback routine for store_by_pieces.  Return the RTL of a register
>    containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
>    char value given in the RTL register data.  For example, if mode is
>    4 bytes wide, return the RTL for 0x01010101*data.  If PREV isn't
>    nullptr, it has the RTL info from the previous iteration.  */
>
> static rtx
> builtin_memset_gen_str (void *data, void *prevp,
>                         HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
>                         scalar_int_mode mode)
>
> It is a broadcast.  If a target can broadcast a byte to a wider integer,
> can you suggest a way to use it in the current by-pieces infrastructure?

I meant that we should extend the by-pieces infrastructure so that
it no longer requires scalar_int_mode, rather than work within the
current limits.

IMO the fact that we're (legitimately) going through vec_duplicate_optab
shows that we really are dealing with vectors here, not integers.

Thanks,
Richard
H.J. Lu July 16, 2021, 2:15 p.m. UTC | #4
On Fri, Jul 16, 2021 at 6:24 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
> >> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
> >> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
> >> > scratch register to avoid stack realignment when expanding memset.
> >> >
> >> >       PR middle-end/90773
> >> >       * builtins.c (gen_memset_value_from_prev): New function.
> >> >       (gen_memset_broadcast): Likewise.
> >> >       (builtin_memset_read_str): Use gen_memset_value_from_prev
> >> >       and gen_memset_broadcast.
> >> >       (builtin_memset_gen_str): Likewise.
> >> >       * target.def (gen_memset_scratch_rtx): New hook.
> >> >       * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> >> >       * doc/tm.texi: Regenerated.
> >> > ---
> >> >  gcc/builtins.c     | 123 +++++++++++++++++++++++++++++++++++++--------
> >> >  gcc/doc/tm.texi    |   5 ++
> >> >  gcc/doc/tm.texi.in |   2 +
> >> >  gcc/target.def     |   7 +++
> >> >  4 files changed, 116 insertions(+), 21 deletions(-)
> >> >
> >> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> >> > index 39ab139b7e1..c1758ae2efc 100644
> >> > --- a/gcc/builtins.c
> >> > +++ b/gcc/builtins.c
> >> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
> >> >    return NULL_RTX;
> >> >  }
> >> >
> >> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> >> > -   bytes from constant string DATA + OFFSET and return it as target
> >> > -   constant.  If PREV isn't nullptr, it has the RTL info from the
> >> > +/* Return the RTL of a register in MODE generated from PREV in the
> >> >     previous iteration.  */
> >> >
> >> > -rtx
> >> > -builtin_memset_read_str (void *data, void *prevp,
> >> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> >> > -                      scalar_int_mode mode)
> >> > +static rtx
> >> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
> >> >  {
> >> > +  rtx target = nullptr;
> >> >    by_pieces_prev *prev = (by_pieces_prev *) prevp;
> >> >    if (prev != nullptr && prev->data != nullptr)
> >> >      {
> >> >        /* Use the previous data in the same mode.  */
> >> >        if (prev->mode == mode)
> >> >       return prev->data;
> >> > +
> >> > +      rtx prev_rtx = prev->data;
> >> > +      machine_mode prev_mode = prev->mode;
> >> > +      unsigned int word_size = GET_MODE_SIZE (word_mode);
> >> > +      if (word_size < GET_MODE_SIZE (prev->mode)
> >> > +       && word_size > GET_MODE_SIZE (mode))
> >> > +     {
> >> > +       /* First generate subreg of word mode if the previous mode is
> >> > +          wider than word mode and word mode is wider than MODE.  */
> >> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> >> > +                                       prev_mode, 0);
> >> > +       prev_mode = word_mode;
> >> > +     }
> >> > +      if (prev_rtx != nullptr)
> >> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
> >> >      }
> >> > +  return target;
> >> > +}
> >> > +
> >> > +/* Return the RTL of a register in MODE broadcasted from DATA.  */
> >> > +
> >> > +static rtx
> >> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)
> >> > +{
> >> > +  /* Skip if regno_reg_rtx isn't initialized.  */
> >> > +  if (!regno_reg_rtx)
> >> > +    return nullptr;
> >> > +
> >> > +  rtx target = nullptr;
> >> > +
> >> > +  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
> >> > +  machine_mode vector_mode;
> >> > +  if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
> >> > +    gcc_unreachable ();
> >>
> >> Sorry, I realise it's a bit late to be raising this objection now,
> >> but I don't think it's a good idea to use scalar integer modes as
> >> a proxy for vector modes.  In principle there's no reason why a
> >> target has to define an integer mode for every vector mode.
> >
> > A target always defines the largest integer mode.
>
> Right.  But a target shouldn't *need* to define an integer mode
> for every vector mode.
>
> >> If we want the mode to be a vector then I think the by-pieces
> >> infrastructure should be extended to support vectors directly,
> >> rather than assuming that each piece can be represented as
> >> a scalar_int_mode.
> >>
> >
> > The current by-pieces infrastructure operates on scalar_int_mode.
> > Only for memset, there is
> >
> > /* Callback routine for store_by_pieces.  Return the RTL of a register
> >    containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
> >    char value given in the RTL register data.  For example, if mode is
> >    4 bytes wide, return the RTL for 0x01010101*data.  If PREV isn't
> >    nullptr, it has the RTL info from the previous iteration.  */
> >
> > static rtx
> > builtin_memset_gen_str (void *data, void *prevp,
> >                         HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> >                         scalar_int_mode mode)
> >
> > It is a broadcast.  If a target can broadcast a byte to a wider integer,
> > can you suggest a way to use it in the current by-pieces infrastructure?
>
> I meant that we should extend the by-pieces infrastructure so that
> it no longer requires scalar_int_mode, rather than work within the
> current limits.
>
> IMO the fact that we're (legitimately) going through vec_duplicate_optab

If vec_duplicate_optab is the only blocking issue, can we add an integer
broadcast optab?  I want to avoid a major surgery just because we
don't support integer broadcast from byte.

> shows that we really are dealing with vectors here, not integers.
>

Are you suggesting we should add the optional vector mode
support to by-pieces for memset?
diff mbox series

Patch

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 39ab139b7e1..c1758ae2efc 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -6686,26 +6686,111 @@  expand_builtin_strncpy (tree exp, rtx target)
   return NULL_RTX;
 }
 
-/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
-   bytes from constant string DATA + OFFSET and return it as target
-   constant.  If PREV isn't nullptr, it has the RTL info from the
+/* Return the RTL of a register in MODE generated from PREV in the
    previous iteration.  */
 
-rtx
-builtin_memset_read_str (void *data, void *prevp,
-			 HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
-			 scalar_int_mode mode)
+static rtx
+gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
 {
+  rtx target = nullptr;
   by_pieces_prev *prev = (by_pieces_prev *) prevp;
   if (prev != nullptr && prev->data != nullptr)
     {
       /* Use the previous data in the same mode.  */
       if (prev->mode == mode)
 	return prev->data;
+
+      rtx prev_rtx = prev->data;
+      machine_mode prev_mode = prev->mode;
+      unsigned int word_size = GET_MODE_SIZE (word_mode);
+      if (word_size < GET_MODE_SIZE (prev->mode)
+	  && word_size > GET_MODE_SIZE (mode))
+	{
+	  /* First generate subreg of word mode if the previous mode is
+	     wider than word mode and word mode is wider than MODE.  */
+	  prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
+					  prev_mode, 0);
+	  prev_mode = word_mode;
+	}
+      if (prev_rtx != nullptr)
+	target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
     }
+  return target;
+}
+
+/* Return the RTL of a register in MODE broadcasted from DATA.  */
+
+static rtx
+gen_memset_broadcast (rtx data, scalar_int_mode mode)
+{
+  /* Skip if regno_reg_rtx isn't initialized.  */
+  if (!regno_reg_rtx)
+    return nullptr;
+
+  rtx target = nullptr;
+
+  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
+  machine_mode vector_mode;
+  if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
+    gcc_unreachable ();
+
+  enum insn_code icode = optab_handler (vec_duplicate_optab,
+					vector_mode);
+  if (icode != CODE_FOR_nothing)
+    {
+      rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
+      if (CONST_INT_P (data))
+	{
+	  /* Use the move expander with CONST_VECTOR.  */
+	  rtx const_vec = gen_const_vec_duplicate (vector_mode, data);
+	  emit_move_insn (reg, const_vec);
+	}
+      else
+	{
+
+	  class expand_operand ops[2];
+	  create_output_operand (&ops[0], reg, vector_mode);
+	  create_input_operand (&ops[1], data, QImode);
+	  expand_insn (icode, 2, ops);
+	  if (!rtx_equal_p (reg, ops[0].value))
+	    emit_move_insn (reg, ops[0].value);
+	}
+      target = lowpart_subreg (mode, reg, vector_mode);
+    }
+
+  return target;
+}
+
+/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
+   bytes from constant string DATA + OFFSET and return it as target
+   constant.  If PREV isn't nullptr, it has the RTL info from the
+   previous iteration.  */
 
+rtx
+builtin_memset_read_str (void *data, void *prev,
+			 HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
+			 scalar_int_mode mode)
+{
+  rtx target;
   const char *c = (const char *) data;
-  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
+  char *p;
+
+  /* Don't use the previous value if size is 1.  */
+  if (GET_MODE_SIZE (mode) != 1)
+    {
+      target = gen_memset_value_from_prev (prev, mode);
+      if (target != nullptr)
+	return target;
+
+      p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
+      memset (p, *c, GET_MODE_SIZE (QImode));
+      rtx src = c_readstr (p, QImode);
+      target = gen_memset_broadcast (src, mode);
+      if (target != nullptr)
+	return target;
+    }
+
+  p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
 
   memset (p, *c, GET_MODE_SIZE (mode));
 
@@ -6719,7 +6804,7 @@  builtin_memset_read_str (void *data, void *prevp,
    nullptr, it has the RTL info from the previous iteration.  */
 
 static rtx
-builtin_memset_gen_str (void *data, void *prevp,
+builtin_memset_gen_str (void *data, void *prev,
 			HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
 			scalar_int_mode mode)
 {
@@ -6727,22 +6812,18 @@  builtin_memset_gen_str (void *data, void *prevp,
   size_t size;
   char *p;
 
-  by_pieces_prev *prev = (by_pieces_prev *) prevp;
-  if (prev != nullptr && prev->data != nullptr)
-    {
-      /* Use the previous data in the same mode.  */
-      if (prev->mode == mode)
-	return prev->data;
-
-      target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
-      if (target != nullptr)
-	return target;
-    }
-
   size = GET_MODE_SIZE (mode);
   if (size == 1)
     return (rtx) data;
 
+  target = gen_memset_value_from_prev (prev, mode);
+  if (target != nullptr)
+    return target;
+
+  target = gen_memset_broadcast ((rtx) data, mode);
+  if (target != nullptr)
+    return target;
+
   p = XALLOCAVEC (char, size);
   memset (p, 1, size);
   coeff = c_readstr (p, mode);
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 2a41ae5fba1..8849711fcf8 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -12122,6 +12122,11 @@  This function prepares to emit a conditional comparison within a sequence
  @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
 @end deftypefn
 
+@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})
+This hook should return an rtx for scratch register in @var{mode} to
+be used by memset broadcast. The default is @code{gen_reg_rtx}.
+@end deftypefn
+
 @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
 This target hook returns a new value for the number of times @var{loop}
 should be unrolled. The parameter @var{nunroll} is the number of times
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index f881cdabe9e..a6bbf4f2667 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7958,6 +7958,8 @@  lists.
 
 @hook TARGET_GEN_CCMP_NEXT
 
+@hook TARGET_GEN_MEMSET_SCRATCH_RTX
+
 @hook TARGET_LOOP_UNROLL_ADJUST
 
 @defmac POWI_MAX_MULTS
diff --git a/gcc/target.def b/gcc/target.def
index c009671c583..5fb287db3bd 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2726,6 +2726,13 @@  DEFHOOK
  rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),
  NULL)
 
+DEFHOOK
+(gen_memset_scratch_rtx,
+ "This hook should return an rtx for scratch register in @var{mode} to\n\
+be used by memset broadcast. The default is @code{gen_reg_rtx}.",
+ rtx, (machine_mode mode),
+ gen_reg_rtx)
+
 /* Return a new value for loop unroll size.  */
 DEFHOOK
 (loop_unroll_adjust,