Message ID | CAMe9rOr85pw8witn6zymUMnyiEgGMUDEdn12SZ1A_YwsoxQ5aw@mail.gmail.com |
---|---|
State | New |
Headers | show |
Series | Add QI vector mode support to by-pieces for memset | expand |
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > On Fri, Jul 16, 2021 at 7:15 AM H.J. Lu <hjl.tools@gmail.com> wrote: >> >> On Fri, Jul 16, 2021 at 6:24 AM Richard Sandiford >> <richard.sandiford@arm.com> wrote: >> > >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: >> > > On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford >> > > <richard.sandiford@arm.com> wrote: >> > >> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: >> > >> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with >> > >> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value. >> > >> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard >> > >> > scratch register to avoid stack realignment when expanding memset. >> > >> > >> > >> > PR middle-end/90773 >> > >> > * builtins.c (gen_memset_value_from_prev): New function. >> > >> > (gen_memset_broadcast): Likewise. >> > >> > (builtin_memset_read_str): Use gen_memset_value_from_prev >> > >> > and gen_memset_broadcast. >> > >> > (builtin_memset_gen_str): Likewise. >> > >> > * target.def (gen_memset_scratch_rtx): New hook. >> > >> > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. >> > >> > * doc/tm.texi: Regenerated. >> > >> > --- >> > >> > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++-------- >> > >> > gcc/doc/tm.texi | 5 ++ >> > >> > gcc/doc/tm.texi.in | 2 + >> > >> > gcc/target.def | 7 +++ >> > >> > 4 files changed, 116 insertions(+), 21 deletions(-) >> > >> > >> > >> > diff --git a/gcc/builtins.c b/gcc/builtins.c >> > >> > index 39ab139b7e1..c1758ae2efc 100644 >> > >> > --- a/gcc/builtins.c >> > >> > +++ b/gcc/builtins.c >> > >> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target) >> > >> > return NULL_RTX; >> > >> > } >> > >> > >> > >> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) >> > >> > - bytes from constant string DATA + OFFSET and return it as target >> > >> > - constant. If PREV isn't nullptr, it has the RTL info from the >> > >> > +/* Return the RTL of a register in MODE generated from PREV in the >> > >> > previous iteration. */ >> > >> > >> > >> > -rtx >> > >> > -builtin_memset_read_str (void *data, void *prevp, >> > >> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, >> > >> > - scalar_int_mode mode) >> > >> > +static rtx >> > >> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode) >> > >> > { >> > >> > + rtx target = nullptr; >> > >> > by_pieces_prev *prev = (by_pieces_prev *) prevp; >> > >> > if (prev != nullptr && prev->data != nullptr) >> > >> > { >> > >> > /* Use the previous data in the same mode. */ >> > >> > if (prev->mode == mode) >> > >> > return prev->data; >> > >> > + >> > >> > + rtx prev_rtx = prev->data; >> > >> > + machine_mode prev_mode = prev->mode; >> > >> > + unsigned int word_size = GET_MODE_SIZE (word_mode); >> > >> > + if (word_size < GET_MODE_SIZE (prev->mode) >> > >> > + && word_size > GET_MODE_SIZE (mode)) >> > >> > + { >> > >> > + /* First generate subreg of word mode if the previous mode is >> > >> > + wider than word mode and word mode is wider than MODE. */ >> > >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, >> > >> > + prev_mode, 0); >> > >> > + prev_mode = word_mode; >> > >> > + } >> > >> > + if (prev_rtx != nullptr) >> > >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); >> > >> > } >> > >> > + return target; >> > >> > +} >> > >> > + >> > >> > +/* Return the RTL of a register in MODE broadcasted from DATA. */ >> > >> > + >> > >> > +static rtx >> > >> > +gen_memset_broadcast (rtx data, scalar_int_mode mode) >> > >> > +{ >> > >> > + /* Skip if regno_reg_rtx isn't initialized. */ >> > >> > + if (!regno_reg_rtx) >> > >> > + return nullptr; >> > >> > + >> > >> > + rtx target = nullptr; >> > >> > + >> > >> > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode); >> > >> > + machine_mode vector_mode; >> > >> > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode)) >> > >> > + gcc_unreachable (); >> > >> >> > >> Sorry, I realise it's a bit late to be raising this objection now, >> > >> but I don't think it's a good idea to use scalar integer modes as >> > >> a proxy for vector modes. In principle there's no reason why a >> > >> target has to define an integer mode for every vector mode. >> > > >> > > A target always defines the largest integer mode. >> > >> > Right. But a target shouldn't *need* to define an integer mode >> > for every vector mode. >> > >> > >> If we want the mode to be a vector then I think the by-pieces >> > >> infrastructure should be extended to support vectors directly, >> > >> rather than assuming that each piece can be represented as >> > >> a scalar_int_mode. >> > >> >> > > >> > > The current by-pieces infrastructure operates on scalar_int_mode. >> > > Only for memset, there is >> > > >> > > /* Callback routine for store_by_pieces. Return the RTL of a register >> > > containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned >> > > char value given in the RTL register data. For example, if mode is >> > > 4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't >> > > nullptr, it has the RTL info from the previous iteration. */ >> > > >> > > static rtx >> > > builtin_memset_gen_str (void *data, void *prevp, >> > > HOST_WIDE_INT offset ATTRIBUTE_UNUSED, >> > > scalar_int_mode mode) >> > > >> > > It is a broadcast. If a target can broadcast a byte to a wider integer, >> > > can you suggest a way to use it in the current by-pieces infrastructure? >> > >> > I meant that we should extend the by-pieces infrastructure so that >> > it no longer requires scalar_int_mode, rather than work within the >> > current limits. >> > >> > IMO the fact that we're (legitimately) going through vec_duplicate_optab >> >> If vec_duplicate_optab is the only blocking issue, can we add an integer >> broadcast optab? I want to avoid a major surgery just because we >> don't support integer broadcast from byte. >> >> > shows that we really are dealing with vectors here, not integers. >> > >> >> Are you suggesting we should add the optional vector mode >> support to by-pieces for memset? >> > > Like this? > > 1. Replace scalar_int_mode with machine_mode in the by-pieces > infrastructure to allow non-integer mode. I guess it might be better to use fixed_size_mode instead of machine_mode. > 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size > to return QI vector mode for memset. > 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest > integer or QI vector mode. > > Thanks. > > -- > H.J. > > From 171f0a919300a8aac58dfe5fc9904eaa8fe0a05b Mon Sep 17 00:00:00 2001 > From: "H.J. Lu" <hjl.tools@gmail.com> > Date: Sun, 6 Mar 2016 06:38:21 -0800 > Subject: [PATCH] Add QI vector mode support to by-pieces for memset > > 1. Replace scalar_int_mode with machine_mode in the by-pieces > infrastructure to allow non-integer mode. > 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size > to return QI vector mode for memset. > 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest > integer or QI vector mode. > > gcc/ > > PR middle-end/90773 > * builtins.c (builtin_memcpy_read_str): Change the mode argument > from scalar_int_mode to machine_mode. > (builtin_strncpy_read_str): Likewise. > (gen_memset_value_from_prev): New function. > (gen_memset_broadcast): Likewise. > (builtin_memset_read_str): Change the mode argument from > scalar_int_mode to machine_mode. Use gen_memset_value_from_prev > and gen_memset_broadcast. > (builtin_memset_gen_str): Likewise. > (try_store_by_multiple_pieces): Use by_pieces_constfn to declare > constfun. > * builtins.h (builtin_strncpy_read_str): Updated. > (builtin_memset_read_str): Likewise. > * expr.c (widest_int_mode_for_size): Renamed to ... > (widest_int_or_QI_vector_mode_for_size): Add a bool argument to > indicate if QI vector mode can be used. > (by_pieces_ninsns): Call widest_int_or_QI_vector_mode_for_size > instead of widest_int_mode_for_size. > (pieces_addr::adjust): Change the mode argument from > scalar_int_mode to machine_mode. > (op_by_pieces_d): Add a bool member, m_qi_vector_mode, to > indicate that QI vector mode can be used. > (op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize > m_qi_vector_mode. Call widest_int_or_QI_vector_mode_for_size > instead of widest_int_mode_for_size. > (op_by_pieces_d::get_usable_mode): Change the mode argument from > scalar_int_mode to machine_mode. Call > widest_int_or_QI_vector_mode_for_size instead of > widest_int_mode_for_size. > (op_by_pieces_d::smallest_mode_for_size): New member function > to return the smallest integer or QI vector mode. > (op_by_pieces_d::run): Call widest_int_or_QI_vector_mode_for_size > instead of widest_int_mode_for_size. Call smallest_mode_for_size > instead of smallest_int_mode_for_size. > (store_by_pieces_d::store_by_pieces_d): Add a bool argument to > indicate that QI vector mode can be used and pass it to > op_by_pieces_d::op_by_pieces_d. > (can_store_by_pieces): Call widest_int_or_QI_vector_mode_for_size > instead of widest_int_mode_for_size. > (store_by_pieces::store_by_pieces): Pass memsetp to > store_by_pieces_d::store_by_pieces_d. > (clear_by_pieces_1): Change scalar_int_mode to machine_mode. > (string_cst_read_str): Change the mode argument from > scalar_int_mode to machine_mode. > * expr.h (by_pieces_constfn): Change scalar_int_mode to > machine_mode. > (by_pieces_prev): Likewise. > * target.def (gen_memset_scratch_rtx): New hook. > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. > * doc/tm.texi: Regenerated. > > gcc/testsuite/ > > * gcc.target/i386/pr100865-3.c: Expect vmovdqu8 instead of > vmovdqu. > * gcc.target/i386/pr100865-4b.c: Likewise. > --- > gcc/builtins.c | 138 ++++++++++++++----- > gcc/builtins.h | 4 +- > gcc/doc/tm.texi | 5 + > gcc/doc/tm.texi.in | 2 + > gcc/expr.c | 145 ++++++++++++++------ > gcc/expr.h | 4 +- > gcc/target.def | 7 + > gcc/testsuite/gcc.target/i386/pr100865-3.c | 2 +- > gcc/testsuite/gcc.target/i386/pr100865-4b.c | 2 +- > 9 files changed, 229 insertions(+), 80 deletions(-) > > diff --git a/gcc/builtins.c b/gcc/builtins.c > index 39ab139b7e1..876378b467c 100644 > --- a/gcc/builtins.c > +++ b/gcc/builtins.c > @@ -3890,13 +3890,14 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode) > > static rtx > builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset, > - scalar_int_mode mode) > + machine_mode mode) > { > /* The REPresentation pointed to by DATA need not be a nul-terminated > string but the caller guarantees it's large enough for MODE. */ > const char *rep = (const char *) data; > > - return c_readstr (rep + offset, mode, /*nul_terminated=*/false); > + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode), > + /*nul_terminated=*/false); > } > > /* LEN specify length of the block of memcpy/memset operation. > @@ -6478,14 +6479,14 @@ expand_builtin_stpncpy (tree exp, rtx) > > rtx > builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset, > - scalar_int_mode mode) > + machine_mode mode) > { > const char *str = (const char *) data; > > if ((unsigned HOST_WIDE_INT) offset > strlen (str)) > return const0_rtx; > > - return c_readstr (str + offset, mode); > + return c_readstr (str + offset, as_a <scalar_int_mode> (mode)); > } > /* Helper to check the sizes of sequences and the destination of calls I think both of these as_a<scalar_int_mode>s should have a comment explaining that we don't (yet?) consider using vector modes for this operation. > @@ -6686,30 +6687,109 @@ expand_builtin_strncpy (tree exp, rtx target) > return NULL_RTX; > } > > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > - bytes from constant string DATA + OFFSET and return it as target > - constant. If PREV isn't nullptr, it has the RTL info from the > +/* Return the RTL of a register in MODE generated from PREV in the > previous iteration. */ > > -rtx > -builtin_memset_read_str (void *data, void *prevp, > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > - scalar_int_mode mode) > +static rtx > +gen_memset_value_from_prev (void *prevp, machine_mode mode) > { > + rtx target = nullptr; > by_pieces_prev *prev = (by_pieces_prev *) prevp; I think it'd be better to pass the proper type instead, since this function isn't directly implementing a callback interface. > if (prev != nullptr && prev->data != nullptr) > { > /* Use the previous data in the same mode. */ > if (prev->mode == mode) > return prev->data; > + > + rtx prev_rtx = prev->data; > + machine_mode prev_mode = prev->mode; > + unsigned int word_size = GET_MODE_SIZE (word_mode); > + if (word_size < GET_MODE_SIZE (prev->mode).to_constant () > + && word_size > GET_MODE_SIZE (mode).to_constant ()) Using fixed_size_mode would also avoid the to_constants here. UNITS_PER_WORD is more direct/canonical than GET_MODE_SIZE (word_mode). > + { > + /* First generate subreg of word mode if the previous mode is > + wider than word mode and word mode is wider than MODE. */ > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, > + prev_mode, 0); > + prev_mode = word_mode; > + } > + if (prev_rtx != nullptr) > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); This should be lowpart_subreg, since 0 isn't the right offset for big-endian targets. Using lowpart_subreg should also avoid the need for the word_size “if” above: lowpart_subreg can handle lowpart subword subregs of multiword values. > + } > + return target; > +} > + > +/* Return the RTL of a register in MODE broadcasted from DATA. */ > + > +static rtx > +gen_memset_broadcast (rtx data, machine_mode mode) > +{ > + /* Skip if regno_reg_rtx isn't initialized or not vector mode. */ Why's the regno_reg_rtx condition needed? > + if (!regno_reg_rtx || !VECTOR_MODE_P (mode)) > + return nullptr; > + > + gcc_assert (GET_MODE_INNER (mode) == QImode); > + > + enum insn_code icode = optab_handler (vec_duplicate_optab, mode); > + > + gcc_assert (icode != CODE_FOR_nothing); Might be worth a comment here saying that having vec_duplicate_optab is a precondition for picking a vector mode. > + > + rtx target = targetm.gen_memset_scratch_rtx (mode); > + if (CONST_INT_P (data)) > + { > + /* Use the move expander with CONST_VECTOR. */ > + rtx const_vec = gen_const_vec_duplicate (mode, data); > + emit_move_insn (target, const_vec); > + } > + else > + { > + Nit: excess blank line. > + class expand_operand ops[2]; > + create_output_operand (&ops[0], target, mode); > + create_input_operand (&ops[1], data, QImode); > + expand_insn (icode, 2, ops); > + if (!rtx_equal_p (target, ops[0].value)) > + emit_move_insn (target, ops[0].value); > } > > + return target; > +} > + > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > + bytes from constant string DATA + OFFSET and return it as target > + constant. If PREV isn't nullptr, it has the RTL info from the > + previous iteration. */ > + > +rtx > +builtin_memset_read_str (void *data, void *prev, > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > + machine_mode mode) > +{ > + rtx target; > const char *c = (const char *) data; > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); > + char *p; > + unsigned int size = GET_MODE_SIZE (mode).to_constant (); > + > + /* Don't use the previous value if size is 1. */ Why not though? The existing code does, and that seems like the right thing to do when operating on integers. I can see the check would be a good thing to do if MODE isn't a vector mode and the previous mode was. Is that the case you're worried about? If so, that check could go in gen_memset_value_from_prev instead. > + if (size != 1) > + { > + target = gen_memset_value_from_prev (prev, mode); > + if (target != nullptr) > + return target; > + > + p = XALLOCAVEC (char, GET_MODE_SIZE (QImode)); > + memset (p, *c, GET_MODE_SIZE (QImode)); > + rtx src = c_readstr (p, QImode); This seems unnecessarily complicated. Couldn't it just be gen_int_mode (QImode, *c); instead? There are no endianness concerns for single QI values. > + target = gen_memset_broadcast (src, mode); > + if (target != nullptr) > + return target; > + } > + > + p = XALLOCAVEC (char, size); > > - memset (p, *c, GET_MODE_SIZE (mode)); > + memset (p, *c, size); > > - return c_readstr (p, mode); > + return c_readstr (p, as_a <scalar_int_mode> (mode)); > } > > /* Callback routine for store_by_pieces. Return the RTL of a register > @@ -6719,33 +6799,29 @@ builtin_memset_read_str (void *data, void *prevp, > nullptr, it has the RTL info from the previous iteration. */ > > static rtx > -builtin_memset_gen_str (void *data, void *prevp, > +builtin_memset_gen_str (void *data, void *prev, > HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > - scalar_int_mode mode) > + machine_mode mode) > { > rtx target, coeff; > size_t size; > char *p; > > - by_pieces_prev *prev = (by_pieces_prev *) prevp; > - if (prev != nullptr && prev->data != nullptr) > - { > - /* Use the previous data in the same mode. */ > - if (prev->mode == mode) > - return prev->data; > - > - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0); > - if (target != nullptr) > - return target; > - } > - > - size = GET_MODE_SIZE (mode); > + size = GET_MODE_SIZE (mode).to_constant (); > if (size == 1) > return (rtx) data; > > + target = gen_memset_value_from_prev (prev, mode); > + if (target != nullptr) > + return target; > + > + target = gen_memset_broadcast ((rtx) data, mode); > + if (target != nullptr) > + return target; > + > p = XALLOCAVEC (char, size); > memset (p, 1, size); > - coeff = c_readstr (p, mode); > + coeff = c_readstr (p, as_a <scalar_int_mode> (mode)); > > target = convert_to_mode (mode, (rtx) data, 1); > target = expand_mult (mode, target, coeff, NULL_RTX, 1); > @@ -6849,7 +6925,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, > &valc, align, true)) > return false; > > - rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode); > + by_pieces_constfn constfun; > void *constfundata; > if (val) > { > diff --git a/gcc/builtins.h b/gcc/builtins.h > index a64ece3f1cd..31d07621750 100644 > --- a/gcc/builtins.h > +++ b/gcc/builtins.h > @@ -111,9 +111,9 @@ extern tree mathfn_built_in (tree, enum built_in_function fn); > extern tree mathfn_built_in (tree, combined_fn); > extern tree mathfn_built_in_type (combined_fn); > extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT, > - scalar_int_mode); > + machine_mode); > extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT, > - scalar_int_mode); > + machine_mode); > extern rtx expand_builtin_saveregs (void); > extern tree std_build_builtin_va_list (void); > extern tree std_fn_abi_va_list (tree); > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi > index 3ad39443eba..c1995270720 100644 > --- a/gcc/doc/tm.texi > +++ b/gcc/doc/tm.texi > @@ -12123,6 +12123,11 @@ This function prepares to emit a conditional comparison within a sequence > @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares. > @end deftypefn > > +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode}) > +This hook should return an rtx for scratch register in @var{mode} to > +be used by memset broadcast. The default is @code{gen_reg_rtx}. This should give a hint why the default might not be appropriate. > +@end deftypefn > + > @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop}) > This target hook returns a new value for the number of times @var{loop} > should be unrolled. The parameter @var{nunroll} is the number of times > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in > index f881cdabe9e..a6bbf4f2667 100644 > --- a/gcc/doc/tm.texi.in > +++ b/gcc/doc/tm.texi.in > @@ -7958,6 +7958,8 @@ lists. > > @hook TARGET_GEN_CCMP_NEXT > > +@hook TARGET_GEN_MEMSET_SCRATCH_RTX > + > @hook TARGET_LOOP_UNROLL_ADJUST > > @defmac POWI_MAX_MULTS > diff --git a/gcc/expr.c b/gcc/expr.c > index 6a4368113c4..92dbf47194e 100644 > --- a/gcc/expr.c > +++ b/gcc/expr.c > @@ -769,15 +769,36 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align) > return align; > } > > -/* Return the widest integer mode that is narrower than SIZE bytes. */ > +/* Return the widest integer or QI vector mode that is narrower than > + SIZE bytes. */ > > -static scalar_int_mode > -widest_int_mode_for_size (unsigned int size) > +static machine_mode > +widest_int_or_QI_vector_mode_for_size (unsigned int size, > + bool qi_vector) > { > - scalar_int_mode result = NARROWEST_INT_MODE; > + machine_mode result = NARROWEST_INT_MODE; > > gcc_checking_assert (size > 1); > > + /* Use QI vector only if size is wider than a WORD. */ > + if (qi_vector && size > GET_MODE_SIZE (word_mode)) > + { > + machine_mode mode; > + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) > + if (GET_MODE_INNER (mode) == QImode > + && GET_MODE_SIZE (mode).is_constant ()) > + { > + if (GET_MODE_SIZE (mode).to_constant () >= size) > + break; > + if (optab_handler (vec_duplicate_optab, mode) > + != CODE_FOR_nothing) > + result = mode; > + } > + > + if (result != NARROWEST_INT_MODE) > + return result; > + } > + > opt_scalar_int_mode tmode; > FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT) > if (GET_MODE_SIZE (tmode.require ()) < size) > @@ -815,16 +836,18 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align, > unsigned int max_size, by_pieces_operation op) > { > unsigned HOST_WIDE_INT n_insns = 0; > - scalar_int_mode mode; > + machine_mode mode; > > if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES) > { > /* NB: Round up L and ALIGN to the widest integer mode for > MAX_SIZE. */ > - mode = widest_int_mode_for_size (max_size); > + mode = widest_int_or_QI_vector_mode_for_size (max_size, > + op == SET_BY_PIECES); > if (optab_handler (mov_optab, mode) != CODE_FOR_nothing) > { > - unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode)); > + unsigned HOST_WIDE_INT up = ROUND_UP (l, > + GET_MODE_SIZE (mode).to_constant ()); > if (up > l) > l = up; > align = GET_MODE_ALIGNMENT (mode); > @@ -835,10 +858,11 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align, > > while (max_size > 1 && l > 0) > { > - mode = widest_int_mode_for_size (max_size); > + mode = widest_int_or_QI_vector_mode_for_size (max_size, > + op == SET_BY_PIECES); > enum insn_code icode; > > - unsigned int modesize = GET_MODE_SIZE (mode); > + unsigned int modesize = GET_MODE_SIZE (mode).to_constant (); > > icode = optab_handler (mov_optab, mode); > if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode)) > @@ -903,8 +927,7 @@ class pieces_addr > void *m_cfndata; > public: > pieces_addr (rtx, bool, by_pieces_constfn, void *); > - rtx adjust (scalar_int_mode, HOST_WIDE_INT, > - by_pieces_prev * = nullptr); > + rtx adjust (machine_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr); > void increment_address (HOST_WIDE_INT); > void maybe_predec (HOST_WIDE_INT); > void maybe_postinc (HOST_WIDE_INT); > @@ -1006,7 +1029,7 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse, > but we still modify the MEM's properties. */ > > rtx > -pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset, > +pieces_addr::adjust (machine_mode mode, HOST_WIDE_INT offset, > by_pieces_prev *prev) > { > if (m_constfn) > @@ -1060,7 +1083,8 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size) > class op_by_pieces_d > { > private: > - scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int); > + machine_mode get_usable_mode (machine_mode, unsigned int); > + machine_mode smallest_mode_for_size (unsigned int); > > protected: > pieces_addr m_to, m_from; > @@ -1073,6 +1097,8 @@ class op_by_pieces_d > bool m_push; > /* True if targetm.overlap_op_by_pieces_p () returns true. */ > bool m_overlap_op_by_pieces; > + /* True if QI vector mode can be used. */ > + bool m_qi_vector_mode; > > /* Virtual functions, overriden by derived classes for the specific > operation. */ > @@ -1084,7 +1110,8 @@ class op_by_pieces_d > > public: > op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *, > - unsigned HOST_WIDE_INT, unsigned int, bool); > + unsigned HOST_WIDE_INT, unsigned int, bool, > + bool = false); > void run (); > }; > > @@ -1099,11 +1126,12 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, > by_pieces_constfn from_cfn, > void *from_cfn_data, > unsigned HOST_WIDE_INT len, > - unsigned int align, bool push) > + unsigned int align, bool push, > + bool qi_vector_mode) > : m_to (to, to_load, NULL, NULL), > m_from (from, from_load, from_cfn, from_cfn_data), > m_len (len), m_max_size (MOVE_MAX_PIECES + 1), > - m_push (push) > + m_push (push), m_qi_vector_mode (qi_vector_mode) > { > int toi = m_to.get_addr_inc (); > int fromi = m_from.get_addr_inc (); > @@ -1124,7 +1152,9 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, > if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2) > { > /* Find the mode of the largest comparison. */ > - scalar_int_mode mode = widest_int_mode_for_size (m_max_size); > + machine_mode mode > + = widest_int_or_QI_vector_mode_for_size (m_max_size, > + m_qi_vector_mode); > > m_from.decide_autoinc (mode, m_reverse, len); > m_to.decide_autoinc (mode, m_reverse, len); > @@ -1139,22 +1169,42 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, > /* This function returns the largest usable integer mode for LEN bytes > whose size is no bigger than size of MODE. */ > > -scalar_int_mode > -op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len) > +machine_mode > +op_by_pieces_d::get_usable_mode (machine_mode mode, unsigned int len) > { > unsigned int size; > do > { > - size = GET_MODE_SIZE (mode); > + size = GET_MODE_SIZE (mode).to_constant (); > if (len >= size && prepare_mode (mode, m_align)) > break; > - /* NB: widest_int_mode_for_size checks SIZE > 1. */ > - mode = widest_int_mode_for_size (size); > + /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1. */ > + mode = widest_int_or_QI_vector_mode_for_size (size, > + m_qi_vector_mode); > } > while (1); > return mode; > } > > +/* Return the smallest integer or QI vector mode that is not narrower > + than SIZE bits. */ > + > +machine_mode > +op_by_pieces_d::smallest_mode_for_size (unsigned int size) > +{ > + /* Use QI vector only for > size of WORD. */ > + if (m_qi_vector_mode && size > GET_MODE_PRECISION (word_mode)) > + { > + machine_mode mode; > + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) > + if (GET_MODE_INNER (mode) == QImode > + && known_ge (GET_MODE_PRECISION (mode), size)) > + return mode; Shouldn't this also test for vec_duplicate_optab? I guess you're hoping that the previous register will be reused via a lowpart, but since that's up to the callbacks, I think we should be conservative here. Alternatively, we could bypass the callbacks when this code is used. Thanks, Richard > + } > + > + return smallest_int_mode_for_size (size); > +} > + > /* This function contains the main loop used for expanding a block > operation. First move what we can in the largest integer mode, > then go to successively smaller modes. For every access, call > @@ -1166,8 +1216,10 @@ op_by_pieces_d::run () > if (m_len == 0) > return; > > - /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1. */ > - scalar_int_mode mode = widest_int_mode_for_size (m_max_size); > + /* NB: widest_int_or_QI_vector_mode_for_size checks M_MAX_SIZE > 1. */ > + machine_mode mode > + = widest_int_or_QI_vector_mode_for_size (m_max_size, > + m_qi_vector_mode); > mode = get_usable_mode (mode, m_len); > > by_pieces_prev to_prev = { nullptr, mode }; > @@ -1175,7 +1227,7 @@ op_by_pieces_d::run () > > do > { > - unsigned int size = GET_MODE_SIZE (mode); > + unsigned int size = GET_MODE_SIZE (mode).to_constant (); > rtx to1 = NULL_RTX, from1; > > while (m_len >= size) > @@ -1214,9 +1266,10 @@ op_by_pieces_d::run () > /* NB: Generate overlapping operations if it is not a stack > push since stack push must not overlap. Get the smallest > integer mode for M_LEN bytes. */ > - mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT); > - mode = get_usable_mode (mode, GET_MODE_SIZE (mode)); > - int gap = GET_MODE_SIZE (mode) - m_len; > + mode = smallest_mode_for_size (m_len * BITS_PER_UNIT); > + unsigned int mode_size = GET_MODE_SIZE (mode).to_constant (); > + mode = get_usable_mode (mode, mode_size); > + int gap = mode_size - m_len; > if (gap > 0) > { > /* If size of MODE > M_LEN, generate the last operation > @@ -1231,8 +1284,9 @@ op_by_pieces_d::run () > } > else > { > - /* NB: widest_int_mode_for_size checks SIZE > 1. */ > - mode = widest_int_mode_for_size (size); > + /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1. */ > + mode = widest_int_or_QI_vector_mode_for_size (size, > + m_qi_vector_mode); > mode = get_usable_mode (mode, m_len); > } > } > @@ -1355,9 +1409,10 @@ class store_by_pieces_d : public op_by_pieces_d > > public: > store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data, > - unsigned HOST_WIDE_INT len, unsigned int align) > + unsigned HOST_WIDE_INT len, unsigned int align, > + bool qi_vector_mode = false) > : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len, > - align, false) > + align, false, qi_vector_mode) > { > } > rtx finish_retmode (memop_ret); > @@ -1446,13 +1501,14 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len, > max_size = STORE_MAX_PIECES + 1; > while (max_size > 1 && l > 0) > { > - scalar_int_mode mode = widest_int_mode_for_size (max_size); > + machine_mode mode > + = widest_int_or_QI_vector_mode_for_size (max_size, memsetp); > > icode = optab_handler (mov_optab, mode); > if (icode != CODE_FOR_nothing > && align >= GET_MODE_ALIGNMENT (mode)) > { > - unsigned int size = GET_MODE_SIZE (mode); > + unsigned int size = GET_MODE_SIZE (mode).to_constant (); > > while (l >= size) > { > @@ -1470,7 +1526,7 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len, > } > } > > - max_size = GET_MODE_SIZE (mode); > + max_size = GET_MODE_SIZE (mode).to_constant (); > } > > /* The code above should have handled everything. */ > @@ -1504,7 +1560,8 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len, > memsetp ? SET_BY_PIECES : STORE_BY_PIECES, > optimize_insn_for_speed_p ())); > > - store_by_pieces_d data (to, constfun, constfundata, len, align); > + store_by_pieces_d data (to, constfun, constfundata, len, align, > + memsetp); > data.run (); > > if (retmode != RETURN_BEGIN) > @@ -1517,7 +1574,7 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len, > Return const0_rtx unconditionally. */ > > static rtx > -clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode) > +clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, machine_mode) > { > return const0_rtx; > } > @@ -5754,7 +5811,7 @@ emit_storent_insn (rtx to, rtx from) > > static rtx > string_cst_read_str (void *data, void *, HOST_WIDE_INT offset, > - scalar_int_mode mode) > + machine_mode mode) > { > tree str = (tree) data; > > @@ -5762,17 +5819,19 @@ string_cst_read_str (void *data, void *, HOST_WIDE_INT offset, > if (offset >= TREE_STRING_LENGTH (str)) > return const0_rtx; > > - if ((unsigned HOST_WIDE_INT) offset + GET_MODE_SIZE (mode) > + unsigned int size = GET_MODE_SIZE (mode).to_constant (); > + if ((unsigned HOST_WIDE_INT) offset + size > > (unsigned HOST_WIDE_INT) TREE_STRING_LENGTH (str)) > { > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); > + char *p = XALLOCAVEC (char, size); > size_t l = TREE_STRING_LENGTH (str) - offset; > memcpy (p, TREE_STRING_POINTER (str) + offset, l); > - memset (p + l, '\0', GET_MODE_SIZE (mode) - l); > - return c_readstr (p, mode, false); > + memset (p + l, '\0', size - l); > + return c_readstr (p, as_a <scalar_int_mode> (mode), false); > } > > - return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false); > + return c_readstr (TREE_STRING_POINTER (str) + offset, > + as_a <scalar_int_mode> (mode), false); > } > > /* Generate code for computing expression EXP, > diff --git a/gcc/expr.h b/gcc/expr.h > index a4f44265759..552645e4403 100644 > --- a/gcc/expr.h > +++ b/gcc/expr.h > @@ -108,13 +108,13 @@ enum block_op_methods > }; > > typedef rtx (*by_pieces_constfn) (void *, void *, HOST_WIDE_INT, > - scalar_int_mode); > + machine_mode); > > /* The second pointer passed to by_pieces_constfn. */ > struct by_pieces_prev > { > rtx data; > - scalar_int_mode mode; > + machine_mode mode; > }; > > extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods); > diff --git a/gcc/target.def b/gcc/target.def > index 2e40448e6c5..cc4aa3a4212 100644 > --- a/gcc/target.def > +++ b/gcc/target.def > @@ -2726,6 +2726,13 @@ DEFHOOK > rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code), > NULL) > > +DEFHOOK > +(gen_memset_scratch_rtx, > + "This hook should return an rtx for scratch register in @var{mode} to\n\ > +be used by memset broadcast. The default is @code{gen_reg_rtx}.", > + rtx, (machine_mode mode), > + gen_reg_rtx) > + > /* Return a new value for loop unroll size. */ > DEFHOOK > (loop_unroll_adjust, > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c > index b6dbcf7809b..007e79f91b0 100644 > --- a/gcc/testsuite/gcc.target/i386/pr100865-3.c > +++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c > @@ -10,6 +10,6 @@ foo (void) > } > > /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */ > +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */ > /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */ > /* { dg-final { scan-assembler-not "vmovdqa" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c > index f41e6147b4c..1e50dc842bc 100644 > --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c > +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c > @@ -4,6 +4,6 @@ > #include "pr100865-4a.c" > > /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */ > +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */ > /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */ > /* { dg-final { scan-assembler-not "vmovdqa" } } */
On Mon, Jul 19, 2021 at 7:41 AM Richard Sandiford <richard.sandiford@arm.com> wrote: > > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > > On Fri, Jul 16, 2021 at 7:15 AM H.J. Lu <hjl.tools@gmail.com> wrote: > >> > >> On Fri, Jul 16, 2021 at 6:24 AM Richard Sandiford > >> <richard.sandiford@arm.com> wrote: > >> > > >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > >> > > On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford > >> > > <richard.sandiford@arm.com> wrote: > >> > >> > >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > >> > >> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with > >> > >> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value. > >> > >> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard > >> > >> > scratch register to avoid stack realignment when expanding memset. > >> > >> > > >> > >> > PR middle-end/90773 > >> > >> > * builtins.c (gen_memset_value_from_prev): New function. > >> > >> > (gen_memset_broadcast): Likewise. > >> > >> > (builtin_memset_read_str): Use gen_memset_value_from_prev > >> > >> > and gen_memset_broadcast. > >> > >> > (builtin_memset_gen_str): Likewise. > >> > >> > * target.def (gen_memset_scratch_rtx): New hook. > >> > >> > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. > >> > >> > * doc/tm.texi: Regenerated. > >> > >> > --- > >> > >> > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++-------- > >> > >> > gcc/doc/tm.texi | 5 ++ > >> > >> > gcc/doc/tm.texi.in | 2 + > >> > >> > gcc/target.def | 7 +++ > >> > >> > 4 files changed, 116 insertions(+), 21 deletions(-) > >> > >> > > >> > >> > diff --git a/gcc/builtins.c b/gcc/builtins.c > >> > >> > index 39ab139b7e1..c1758ae2efc 100644 > >> > >> > --- a/gcc/builtins.c > >> > >> > +++ b/gcc/builtins.c > >> > >> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target) > >> > >> > return NULL_RTX; > >> > >> > } > >> > >> > > >> > >> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > >> > >> > - bytes from constant string DATA + OFFSET and return it as target > >> > >> > - constant. If PREV isn't nullptr, it has the RTL info from the > >> > >> > +/* Return the RTL of a register in MODE generated from PREV in the > >> > >> > previous iteration. */ > >> > >> > > >> > >> > -rtx > >> > >> > -builtin_memset_read_str (void *data, void *prevp, > >> > >> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > >> > >> > - scalar_int_mode mode) > >> > >> > +static rtx > >> > >> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode) > >> > >> > { > >> > >> > + rtx target = nullptr; > >> > >> > by_pieces_prev *prev = (by_pieces_prev *) prevp; > >> > >> > if (prev != nullptr && prev->data != nullptr) > >> > >> > { > >> > >> > /* Use the previous data in the same mode. */ > >> > >> > if (prev->mode == mode) > >> > >> > return prev->data; > >> > >> > + > >> > >> > + rtx prev_rtx = prev->data; > >> > >> > + machine_mode prev_mode = prev->mode; > >> > >> > + unsigned int word_size = GET_MODE_SIZE (word_mode); > >> > >> > + if (word_size < GET_MODE_SIZE (prev->mode) > >> > >> > + && word_size > GET_MODE_SIZE (mode)) > >> > >> > + { > >> > >> > + /* First generate subreg of word mode if the previous mode is > >> > >> > + wider than word mode and word mode is wider than MODE. */ > >> > >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, > >> > >> > + prev_mode, 0); > >> > >> > + prev_mode = word_mode; > >> > >> > + } > >> > >> > + if (prev_rtx != nullptr) > >> > >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); > >> > >> > } > >> > >> > + return target; > >> > >> > +} > >> > >> > + > >> > >> > +/* Return the RTL of a register in MODE broadcasted from DATA. */ > >> > >> > + > >> > >> > +static rtx > >> > >> > +gen_memset_broadcast (rtx data, scalar_int_mode mode) > >> > >> > +{ > >> > >> > + /* Skip if regno_reg_rtx isn't initialized. */ > >> > >> > + if (!regno_reg_rtx) > >> > >> > + return nullptr; > >> > >> > + > >> > >> > + rtx target = nullptr; > >> > >> > + > >> > >> > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode); > >> > >> > + machine_mode vector_mode; > >> > >> > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode)) > >> > >> > + gcc_unreachable (); > >> > >> > >> > >> Sorry, I realise it's a bit late to be raising this objection now, > >> > >> but I don't think it's a good idea to use scalar integer modes as > >> > >> a proxy for vector modes. In principle there's no reason why a > >> > >> target has to define an integer mode for every vector mode. > >> > > > >> > > A target always defines the largest integer mode. > >> > > >> > Right. But a target shouldn't *need* to define an integer mode > >> > for every vector mode. > >> > > >> > >> If we want the mode to be a vector then I think the by-pieces > >> > >> infrastructure should be extended to support vectors directly, > >> > >> rather than assuming that each piece can be represented as > >> > >> a scalar_int_mode. > >> > >> > >> > > > >> > > The current by-pieces infrastructure operates on scalar_int_mode. > >> > > Only for memset, there is > >> > > > >> > > /* Callback routine for store_by_pieces. Return the RTL of a register > >> > > containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned > >> > > char value given in the RTL register data. For example, if mode is > >> > > 4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't > >> > > nullptr, it has the RTL info from the previous iteration. */ > >> > > > >> > > static rtx > >> > > builtin_memset_gen_str (void *data, void *prevp, > >> > > HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > >> > > scalar_int_mode mode) > >> > > > >> > > It is a broadcast. If a target can broadcast a byte to a wider integer, > >> > > can you suggest a way to use it in the current by-pieces infrastructure? > >> > > >> > I meant that we should extend the by-pieces infrastructure so that > >> > it no longer requires scalar_int_mode, rather than work within the > >> > current limits. > >> > > >> > IMO the fact that we're (legitimately) going through vec_duplicate_optab > >> > >> If vec_duplicate_optab is the only blocking issue, can we add an integer > >> broadcast optab? I want to avoid a major surgery just because we > >> don't support integer broadcast from byte. > >> > >> > shows that we really are dealing with vectors here, not integers. > >> > > >> > >> Are you suggesting we should add the optional vector mode > >> support to by-pieces for memset? > >> > > > > Like this? > > > > 1. Replace scalar_int_mode with machine_mode in the by-pieces > > infrastructure to allow non-integer mode. > > I guess it might be better to use fixed_size_mode instead of > machine_mode. Fixed. > > 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size > > to return QI vector mode for memset. > > 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest > > integer or QI vector mode. > > > > Thanks. > > > > -- > > H.J. > > > > From 171f0a919300a8aac58dfe5fc9904eaa8fe0a05b Mon Sep 17 00:00:00 2001 > > From: "H.J. Lu" <hjl.tools@gmail.com> > > Date: Sun, 6 Mar 2016 06:38:21 -0800 > > Subject: [PATCH] Add QI vector mode support to by-pieces for memset > > > > 1. Replace scalar_int_mode with machine_mode in the by-pieces > > infrastructure to allow non-integer mode. > > 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size > > to return QI vector mode for memset. > > 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest > > integer or QI vector mode. > > > > gcc/ > > > > PR middle-end/90773 > > * builtins.c (builtin_memcpy_read_str): Change the mode argument > > from scalar_int_mode to machine_mode. > > (builtin_strncpy_read_str): Likewise. > > (gen_memset_value_from_prev): New function. > > (gen_memset_broadcast): Likewise. > > (builtin_memset_read_str): Change the mode argument from > > scalar_int_mode to machine_mode. Use gen_memset_value_from_prev > > and gen_memset_broadcast. > > (builtin_memset_gen_str): Likewise. > > (try_store_by_multiple_pieces): Use by_pieces_constfn to declare > > constfun. > > * builtins.h (builtin_strncpy_read_str): Updated. > > (builtin_memset_read_str): Likewise. > > * expr.c (widest_int_mode_for_size): Renamed to ... > > (widest_int_or_QI_vector_mode_for_size): Add a bool argument to > > indicate if QI vector mode can be used. > > (by_pieces_ninsns): Call widest_int_or_QI_vector_mode_for_size > > instead of widest_int_mode_for_size. > > (pieces_addr::adjust): Change the mode argument from > > scalar_int_mode to machine_mode. > > (op_by_pieces_d): Add a bool member, m_qi_vector_mode, to > > indicate that QI vector mode can be used. > > (op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize > > m_qi_vector_mode. Call widest_int_or_QI_vector_mode_for_size > > instead of widest_int_mode_for_size. > > (op_by_pieces_d::get_usable_mode): Change the mode argument from > > scalar_int_mode to machine_mode. Call > > widest_int_or_QI_vector_mode_for_size instead of > > widest_int_mode_for_size. > > (op_by_pieces_d::smallest_mode_for_size): New member function > > to return the smallest integer or QI vector mode. > > (op_by_pieces_d::run): Call widest_int_or_QI_vector_mode_for_size > > instead of widest_int_mode_for_size. Call smallest_mode_for_size > > instead of smallest_int_mode_for_size. > > (store_by_pieces_d::store_by_pieces_d): Add a bool argument to > > indicate that QI vector mode can be used and pass it to > > op_by_pieces_d::op_by_pieces_d. > > (can_store_by_pieces): Call widest_int_or_QI_vector_mode_for_size > > instead of widest_int_mode_for_size. > > (store_by_pieces::store_by_pieces): Pass memsetp to > > store_by_pieces_d::store_by_pieces_d. > > (clear_by_pieces_1): Change scalar_int_mode to machine_mode. > > (string_cst_read_str): Change the mode argument from > > scalar_int_mode to machine_mode. > > * expr.h (by_pieces_constfn): Change scalar_int_mode to > > machine_mode. > > (by_pieces_prev): Likewise. > > * target.def (gen_memset_scratch_rtx): New hook. > > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. > > * doc/tm.texi: Regenerated. > > > > gcc/testsuite/ > > > > * gcc.target/i386/pr100865-3.c: Expect vmovdqu8 instead of > > vmovdqu. > > * gcc.target/i386/pr100865-4b.c: Likewise. > > --- > > gcc/builtins.c | 138 ++++++++++++++----- > > gcc/builtins.h | 4 +- > > gcc/doc/tm.texi | 5 + > > gcc/doc/tm.texi.in | 2 + > > gcc/expr.c | 145 ++++++++++++++------ > > gcc/expr.h | 4 +- > > gcc/target.def | 7 + > > gcc/testsuite/gcc.target/i386/pr100865-3.c | 2 +- > > gcc/testsuite/gcc.target/i386/pr100865-4b.c | 2 +- > > 9 files changed, 229 insertions(+), 80 deletions(-) > > > > diff --git a/gcc/builtins.c b/gcc/builtins.c > > index 39ab139b7e1..876378b467c 100644 > > --- a/gcc/builtins.c > > +++ b/gcc/builtins.c > > @@ -3890,13 +3890,14 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode) > > > > static rtx > > builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset, > > - scalar_int_mode mode) > > + machine_mode mode) > > { > > /* The REPresentation pointed to by DATA need not be a nul-terminated > > string but the caller guarantees it's large enough for MODE. */ > > const char *rep = (const char *) data; > > > > - return c_readstr (rep + offset, mode, /*nul_terminated=*/false); > > + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode), > > + /*nul_terminated=*/false); > > } > > > > /* LEN specify length of the block of memcpy/memset operation. > > @@ -6478,14 +6479,14 @@ expand_builtin_stpncpy (tree exp, rtx) > > > > rtx > > builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset, > > - scalar_int_mode mode) > > + machine_mode mode) > > { > > const char *str = (const char *) data; > > > > if ((unsigned HOST_WIDE_INT) offset > strlen (str)) > > return const0_rtx; > > > > - return c_readstr (str + offset, mode); > > + return c_readstr (str + offset, as_a <scalar_int_mode> (mode)); > > } > > /* Helper to check the sizes of sequences and the destination of calls > > I think both of these as_a<scalar_int_mode>s should have a comment > explaining that we don't (yet?) consider using vector modes for this > operation. Fixed. > > @@ -6686,30 +6687,109 @@ expand_builtin_strncpy (tree exp, rtx target) > > return NULL_RTX; > > } > > > > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > > - bytes from constant string DATA + OFFSET and return it as target > > - constant. If PREV isn't nullptr, it has the RTL info from the > > +/* Return the RTL of a register in MODE generated from PREV in the > > previous iteration. */ > > > > -rtx > > -builtin_memset_read_str (void *data, void *prevp, > > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > > - scalar_int_mode mode) > > +static rtx > > +gen_memset_value_from_prev (void *prevp, machine_mode mode) > > { > > + rtx target = nullptr; > > by_pieces_prev *prev = (by_pieces_prev *) prevp; > > I think it'd be better to pass the proper type instead, since this > function isn't directly implementing a callback interface. Fixed. > > if (prev != nullptr && prev->data != nullptr) > > { > > /* Use the previous data in the same mode. */ > > if (prev->mode == mode) > > return prev->data; > > + > > + rtx prev_rtx = prev->data; > > + machine_mode prev_mode = prev->mode; > > + unsigned int word_size = GET_MODE_SIZE (word_mode); > > + if (word_size < GET_MODE_SIZE (prev->mode).to_constant () > > + && word_size > GET_MODE_SIZE (mode).to_constant ()) > > Using fixed_size_mode would also avoid the to_constants here. Fixed. > UNITS_PER_WORD is more direct/canonical than GET_MODE_SIZE (word_mode). Fixed. > > + { > > + /* First generate subreg of word mode if the previous mode is > > + wider than word mode and word mode is wider than MODE. */ > > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, > > + prev_mode, 0); > > + prev_mode = word_mode; > > + } > > + if (prev_rtx != nullptr) > > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); > > This should be lowpart_subreg, since 0 isn't the right offset for > big-endian targets. Using lowpart_subreg should also avoid the need > for the word_size “if” above: lowpart_subreg can handle lowpart subword > subregs of multiword values. I tried it. It didn't work since it caused the LRA failure. I replaced simplify_gen_subreg with lowpart_subreg instead. > > + } > > + return target; > > +} > > + > > +/* Return the RTL of a register in MODE broadcasted from DATA. */ > > + > > +static rtx > > +gen_memset_broadcast (rtx data, machine_mode mode) > > +{ > > + /* Skip if regno_reg_rtx isn't initialized or not vector mode. */ > > Why's the regno_reg_rtx condition needed? Removed. > > + if (!regno_reg_rtx || !VECTOR_MODE_P (mode)) > > + return nullptr; > > + > > + gcc_assert (GET_MODE_INNER (mode) == QImode); > > + > > + enum insn_code icode = optab_handler (vec_duplicate_optab, mode); > > + > > + gcc_assert (icode != CODE_FOR_nothing); > > Might be worth a comment here saying that having vec_duplicate_optab is > a precondition for picking a vector mode. I added a comment. > > + > > + rtx target = targetm.gen_memset_scratch_rtx (mode); > > + if (CONST_INT_P (data)) > > + { > > + /* Use the move expander with CONST_VECTOR. */ > > + rtx const_vec = gen_const_vec_duplicate (mode, data); > > + emit_move_insn (target, const_vec); > > + } > > + else > > + { > > + > > Nit: excess blank line. Fixed. > > + class expand_operand ops[2]; > > + create_output_operand (&ops[0], target, mode); > > + create_input_operand (&ops[1], data, QImode); > > + expand_insn (icode, 2, ops); > > + if (!rtx_equal_p (target, ops[0].value)) > > + emit_move_insn (target, ops[0].value); > > } > > > > + return target; > > +} > > + > > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > > + bytes from constant string DATA + OFFSET and return it as target > > + constant. If PREV isn't nullptr, it has the RTL info from the > > + previous iteration. */ > > + > > +rtx > > +builtin_memset_read_str (void *data, void *prev, > > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > > + machine_mode mode) > > +{ > > + rtx target; > > const char *c = (const char *) data; > > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); > > + char *p; > > + unsigned int size = GET_MODE_SIZE (mode).to_constant (); > > + > > + /* Don't use the previous value if size is 1. */ > > Why not though? The existing code does, and that seems like the right > thing to do when operating on integers. > > I can see the check would be a good thing to do if MODE isn't a vector > mode and the previous mode was. Is that the case you're worried about? > If so, that check could go in gen_memset_value_from_prev instead. We are storing one byte. Doing it directly is faster. > > + if (size != 1) > > + { > > + target = gen_memset_value_from_prev (prev, mode); > > + if (target != nullptr) > > + return target; > > + > > + p = XALLOCAVEC (char, GET_MODE_SIZE (QImode)); > > + memset (p, *c, GET_MODE_SIZE (QImode)); > > + rtx src = c_readstr (p, QImode); > > This seems unnecessarily complicated. Couldn't it just be > > gen_int_mode (QImode, *c); > > instead? There are no endianness concerns for single QI values. Fixed. > > + target = gen_memset_broadcast (src, mode); > > + if (target != nullptr) > > + return target; > > + } > > + > > + p = XALLOCAVEC (char, size); > > > > - memset (p, *c, GET_MODE_SIZE (mode)); > > + memset (p, *c, size); > > > > - return c_readstr (p, mode); > > + return c_readstr (p, as_a <scalar_int_mode> (mode)); > > } > > > > /* Callback routine for store_by_pieces. Return the RTL of a register > > @@ -6719,33 +6799,29 @@ builtin_memset_read_str (void *data, void *prevp, > > nullptr, it has the RTL info from the previous iteration. */ > > > > static rtx > > -builtin_memset_gen_str (void *data, void *prevp, > > +builtin_memset_gen_str (void *data, void *prev, > > HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > > - scalar_int_mode mode) > > + machine_mode mode) > > { > > rtx target, coeff; > > size_t size; > > char *p; > > > > - by_pieces_prev *prev = (by_pieces_prev *) prevp; > > - if (prev != nullptr && prev->data != nullptr) > > - { > > - /* Use the previous data in the same mode. */ > > - if (prev->mode == mode) > > - return prev->data; > > - > > - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0); > > - if (target != nullptr) > > - return target; > > - } > > - > > - size = GET_MODE_SIZE (mode); > > + size = GET_MODE_SIZE (mode).to_constant (); > > if (size == 1) > > return (rtx) data; > > > > + target = gen_memset_value_from_prev (prev, mode); > > + if (target != nullptr) > > + return target; > > + > > + target = gen_memset_broadcast ((rtx) data, mode); > > + if (target != nullptr) > > + return target; > > + > > p = XALLOCAVEC (char, size); > > memset (p, 1, size); > > - coeff = c_readstr (p, mode); > > + coeff = c_readstr (p, as_a <scalar_int_mode> (mode)); > > > > target = convert_to_mode (mode, (rtx) data, 1); > > target = expand_mult (mode, target, coeff, NULL_RTX, 1); > > @@ -6849,7 +6925,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, > > &valc, align, true)) > > return false; > > > > - rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode); > > + by_pieces_constfn constfun; > > void *constfundata; > > if (val) > > { > > diff --git a/gcc/builtins.h b/gcc/builtins.h > > index a64ece3f1cd..31d07621750 100644 > > --- a/gcc/builtins.h > > +++ b/gcc/builtins.h > > @@ -111,9 +111,9 @@ extern tree mathfn_built_in (tree, enum built_in_function fn); > > extern tree mathfn_built_in (tree, combined_fn); > > extern tree mathfn_built_in_type (combined_fn); > > extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT, > > - scalar_int_mode); > > + machine_mode); > > extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT, > > - scalar_int_mode); > > + machine_mode); > > extern rtx expand_builtin_saveregs (void); > > extern tree std_build_builtin_va_list (void); > > extern tree std_fn_abi_va_list (tree); > > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi > > index 3ad39443eba..c1995270720 100644 > > --- a/gcc/doc/tm.texi > > +++ b/gcc/doc/tm.texi > > @@ -12123,6 +12123,11 @@ This function prepares to emit a conditional comparison within a sequence > > @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares. > > @end deftypefn > > > > +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode}) > > +This hook should return an rtx for scratch register in @var{mode} to > > +be used by memset broadcast. The default is @code{gen_reg_rtx}. > > This should give a hint why the default might not be appropriate. Updated. > > +@end deftypefn > > + > > @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop}) > > This target hook returns a new value for the number of times @var{loop} > > should be unrolled. The parameter @var{nunroll} is the number of times > > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in > > index f881cdabe9e..a6bbf4f2667 100644 > > --- a/gcc/doc/tm.texi.in > > +++ b/gcc/doc/tm.texi.in > > @@ -7958,6 +7958,8 @@ lists. > > > > @hook TARGET_GEN_CCMP_NEXT > > > > +@hook TARGET_GEN_MEMSET_SCRATCH_RTX > > + > > @hook TARGET_LOOP_UNROLL_ADJUST > > > > @defmac POWI_MAX_MULTS > > diff --git a/gcc/expr.c b/gcc/expr.c > > index 6a4368113c4..92dbf47194e 100644 > > --- a/gcc/expr.c > > +++ b/gcc/expr.c > > @@ -769,15 +769,36 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align) > > return align; > > } > > > > -/* Return the widest integer mode that is narrower than SIZE bytes. */ > > +/* Return the widest integer or QI vector mode that is narrower than > > + SIZE bytes. */ > > > > -static scalar_int_mode > > -widest_int_mode_for_size (unsigned int size) > > +static machine_mode > > +widest_int_or_QI_vector_mode_for_size (unsigned int size, > > + bool qi_vector) > > { > > - scalar_int_mode result = NARROWEST_INT_MODE; > > + machine_mode result = NARROWEST_INT_MODE; > > > > gcc_checking_assert (size > 1); > > > > + /* Use QI vector only if size is wider than a WORD. */ > > + if (qi_vector && size > GET_MODE_SIZE (word_mode)) > > + { > > + machine_mode mode; > > + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) > > + if (GET_MODE_INNER (mode) == QImode > > + && GET_MODE_SIZE (mode).is_constant ()) > > + { > > + if (GET_MODE_SIZE (mode).to_constant () >= size) > > + break; > > + if (optab_handler (vec_duplicate_optab, mode) > > + != CODE_FOR_nothing) > > + result = mode; > > + } > > + > > + if (result != NARROWEST_INT_MODE) > > + return result; > > + } > > + > > opt_scalar_int_mode tmode; > > FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT) > > if (GET_MODE_SIZE (tmode.require ()) < size) > > @@ -815,16 +836,18 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align, > > unsigned int max_size, by_pieces_operation op) > > { > > unsigned HOST_WIDE_INT n_insns = 0; > > - scalar_int_mode mode; > > + machine_mode mode; > > > > if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES) > > { > > /* NB: Round up L and ALIGN to the widest integer mode for > > MAX_SIZE. */ > > - mode = widest_int_mode_for_size (max_size); > > + mode = widest_int_or_QI_vector_mode_for_size (max_size, > > + op == SET_BY_PIECES); > > if (optab_handler (mov_optab, mode) != CODE_FOR_nothing) > > { > > - unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode)); > > + unsigned HOST_WIDE_INT up = ROUND_UP (l, > > + GET_MODE_SIZE (mode).to_constant ()); > > if (up > l) > > l = up; > > align = GET_MODE_ALIGNMENT (mode); > > @@ -835,10 +858,11 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align, > > > > while (max_size > 1 && l > 0) > > { > > - mode = widest_int_mode_for_size (max_size); > > + mode = widest_int_or_QI_vector_mode_for_size (max_size, > > + op == SET_BY_PIECES); > > enum insn_code icode; > > > > - unsigned int modesize = GET_MODE_SIZE (mode); > > + unsigned int modesize = GET_MODE_SIZE (mode).to_constant (); > > > > icode = optab_handler (mov_optab, mode); > > if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode)) > > @@ -903,8 +927,7 @@ class pieces_addr > > void *m_cfndata; > > public: > > pieces_addr (rtx, bool, by_pieces_constfn, void *); > > - rtx adjust (scalar_int_mode, HOST_WIDE_INT, > > - by_pieces_prev * = nullptr); > > + rtx adjust (machine_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr); > > void increment_address (HOST_WIDE_INT); > > void maybe_predec (HOST_WIDE_INT); > > void maybe_postinc (HOST_WIDE_INT); > > @@ -1006,7 +1029,7 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse, > > but we still modify the MEM's properties. */ > > > > rtx > > -pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset, > > +pieces_addr::adjust (machine_mode mode, HOST_WIDE_INT offset, > > by_pieces_prev *prev) > > { > > if (m_constfn) > > @@ -1060,7 +1083,8 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size) > > class op_by_pieces_d > > { > > private: > > - scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int); > > + machine_mode get_usable_mode (machine_mode, unsigned int); > > + machine_mode smallest_mode_for_size (unsigned int); > > > > protected: > > pieces_addr m_to, m_from; > > @@ -1073,6 +1097,8 @@ class op_by_pieces_d > > bool m_push; > > /* True if targetm.overlap_op_by_pieces_p () returns true. */ > > bool m_overlap_op_by_pieces; > > + /* True if QI vector mode can be used. */ > > + bool m_qi_vector_mode; > > > > /* Virtual functions, overriden by derived classes for the specific > > operation. */ > > @@ -1084,7 +1110,8 @@ class op_by_pieces_d > > > > public: > > op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *, > > - unsigned HOST_WIDE_INT, unsigned int, bool); > > + unsigned HOST_WIDE_INT, unsigned int, bool, > > + bool = false); > > void run (); > > }; > > > > @@ -1099,11 +1126,12 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, > > by_pieces_constfn from_cfn, > > void *from_cfn_data, > > unsigned HOST_WIDE_INT len, > > - unsigned int align, bool push) > > + unsigned int align, bool push, > > + bool qi_vector_mode) > > : m_to (to, to_load, NULL, NULL), > > m_from (from, from_load, from_cfn, from_cfn_data), > > m_len (len), m_max_size (MOVE_MAX_PIECES + 1), > > - m_push (push) > > + m_push (push), m_qi_vector_mode (qi_vector_mode) > > { > > int toi = m_to.get_addr_inc (); > > int fromi = m_from.get_addr_inc (); > > @@ -1124,7 +1152,9 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, > > if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2) > > { > > /* Find the mode of the largest comparison. */ > > - scalar_int_mode mode = widest_int_mode_for_size (m_max_size); > > + machine_mode mode > > + = widest_int_or_QI_vector_mode_for_size (m_max_size, > > + m_qi_vector_mode); > > > > m_from.decide_autoinc (mode, m_reverse, len); > > m_to.decide_autoinc (mode, m_reverse, len); > > @@ -1139,22 +1169,42 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, > > /* This function returns the largest usable integer mode for LEN bytes > > whose size is no bigger than size of MODE. */ > > > > -scalar_int_mode > > -op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len) > > +machine_mode > > +op_by_pieces_d::get_usable_mode (machine_mode mode, unsigned int len) > > { > > unsigned int size; > > do > > { > > - size = GET_MODE_SIZE (mode); > > + size = GET_MODE_SIZE (mode).to_constant (); > > if (len >= size && prepare_mode (mode, m_align)) > > break; > > - /* NB: widest_int_mode_for_size checks SIZE > 1. */ > > - mode = widest_int_mode_for_size (size); > > + /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1. */ > > + mode = widest_int_or_QI_vector_mode_for_size (size, > > + m_qi_vector_mode); > > } > > while (1); > > return mode; > > } > > > > +/* Return the smallest integer or QI vector mode that is not narrower > > + than SIZE bits. */ > > + > > +machine_mode > > +op_by_pieces_d::smallest_mode_for_size (unsigned int size) > > +{ > > + /* Use QI vector only for > size of WORD. */ > > + if (m_qi_vector_mode && size > GET_MODE_PRECISION (word_mode)) > > + { > > + machine_mode mode; > > + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) > > + if (GET_MODE_INNER (mode) == QImode > > + && known_ge (GET_MODE_PRECISION (mode), size)) > > + return mode; > > Shouldn't this also test for vec_duplicate_optab? I guess you're > hoping that the previous register will be reused via a lowpart, > but since that's up to the callbacks, I think we should be > conservative here. Fixed. I will submit the v2 patch. Thanks. > Alternatively, we could bypass the callbacks when this code is used. > > Thanks, > Richard > > > + } > > + > > + return smallest_int_mode_for_size (size); > > +} > > + > > /* This function contains the main loop used for expanding a block > > operation. First move what we can in the largest integer mode, > > then go to successively smaller modes. For every access, call > > @@ -1166,8 +1216,10 @@ op_by_pieces_d::run () > > if (m_len == 0) > > return; > > > > - /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1. */ > > - scalar_int_mode mode = widest_int_mode_for_size (m_max_size); > > + /* NB: widest_int_or_QI_vector_mode_for_size checks M_MAX_SIZE > 1. */ > > + machine_mode mode > > + = widest_int_or_QI_vector_mode_for_size (m_max_size, > > + m_qi_vector_mode); > > mode = get_usable_mode (mode, m_len); > > > > by_pieces_prev to_prev = { nullptr, mode }; > > @@ -1175,7 +1227,7 @@ op_by_pieces_d::run () > > > > do > > { > > - unsigned int size = GET_MODE_SIZE (mode); > > + unsigned int size = GET_MODE_SIZE (mode).to_constant (); > > rtx to1 = NULL_RTX, from1; > > > > while (m_len >= size) > > @@ -1214,9 +1266,10 @@ op_by_pieces_d::run () > > /* NB: Generate overlapping operations if it is not a stack > > push since stack push must not overlap. Get the smallest > > integer mode for M_LEN bytes. */ > > - mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT); > > - mode = get_usable_mode (mode, GET_MODE_SIZE (mode)); > > - int gap = GET_MODE_SIZE (mode) - m_len; > > + mode = smallest_mode_for_size (m_len * BITS_PER_UNIT); > > + unsigned int mode_size = GET_MODE_SIZE (mode).to_constant (); > > + mode = get_usable_mode (mode, mode_size); > > + int gap = mode_size - m_len; > > if (gap > 0) > > { > > /* If size of MODE > M_LEN, generate the last operation > > @@ -1231,8 +1284,9 @@ op_by_pieces_d::run () > > } > > else > > { > > - /* NB: widest_int_mode_for_size checks SIZE > 1. */ > > - mode = widest_int_mode_for_size (size); > > + /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1. */ > > + mode = widest_int_or_QI_vector_mode_for_size (size, > > + m_qi_vector_mode); > > mode = get_usable_mode (mode, m_len); > > } > > } > > @@ -1355,9 +1409,10 @@ class store_by_pieces_d : public op_by_pieces_d > > > > public: > > store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data, > > - unsigned HOST_WIDE_INT len, unsigned int align) > > + unsigned HOST_WIDE_INT len, unsigned int align, > > + bool qi_vector_mode = false) > > : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len, > > - align, false) > > + align, false, qi_vector_mode) > > { > > } > > rtx finish_retmode (memop_ret); > > @@ -1446,13 +1501,14 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len, > > max_size = STORE_MAX_PIECES + 1; > > while (max_size > 1 && l > 0) > > { > > - scalar_int_mode mode = widest_int_mode_for_size (max_size); > > + machine_mode mode > > + = widest_int_or_QI_vector_mode_for_size (max_size, memsetp); > > > > icode = optab_handler (mov_optab, mode); > > if (icode != CODE_FOR_nothing > > && align >= GET_MODE_ALIGNMENT (mode)) > > { > > - unsigned int size = GET_MODE_SIZE (mode); > > + unsigned int size = GET_MODE_SIZE (mode).to_constant (); > > > > while (l >= size) > > { > > @@ -1470,7 +1526,7 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len, > > } > > } > > > > - max_size = GET_MODE_SIZE (mode); > > + max_size = GET_MODE_SIZE (mode).to_constant (); > > } > > > > /* The code above should have handled everything. */ > > @@ -1504,7 +1560,8 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len, > > memsetp ? SET_BY_PIECES : STORE_BY_PIECES, > > optimize_insn_for_speed_p ())); > > > > - store_by_pieces_d data (to, constfun, constfundata, len, align); > > + store_by_pieces_d data (to, constfun, constfundata, len, align, > > + memsetp); > > data.run (); > > > > if (retmode != RETURN_BEGIN) > > @@ -1517,7 +1574,7 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len, > > Return const0_rtx unconditionally. */ > > > > static rtx > > -clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode) > > +clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, machine_mode) > > { > > return const0_rtx; > > } > > @@ -5754,7 +5811,7 @@ emit_storent_insn (rtx to, rtx from) > > > > static rtx > > string_cst_read_str (void *data, void *, HOST_WIDE_INT offset, > > - scalar_int_mode mode) > > + machine_mode mode) > > { > > tree str = (tree) data; > > > > @@ -5762,17 +5819,19 @@ string_cst_read_str (void *data, void *, HOST_WIDE_INT offset, > > if (offset >= TREE_STRING_LENGTH (str)) > > return const0_rtx; > > > > - if ((unsigned HOST_WIDE_INT) offset + GET_MODE_SIZE (mode) > > + unsigned int size = GET_MODE_SIZE (mode).to_constant (); > > + if ((unsigned HOST_WIDE_INT) offset + size > > > (unsigned HOST_WIDE_INT) TREE_STRING_LENGTH (str)) > > { > > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); > > + char *p = XALLOCAVEC (char, size); > > size_t l = TREE_STRING_LENGTH (str) - offset; > > memcpy (p, TREE_STRING_POINTER (str) + offset, l); > > - memset (p + l, '\0', GET_MODE_SIZE (mode) - l); > > - return c_readstr (p, mode, false); > > + memset (p + l, '\0', size - l); > > + return c_readstr (p, as_a <scalar_int_mode> (mode), false); > > } > > > > - return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false); > > + return c_readstr (TREE_STRING_POINTER (str) + offset, > > + as_a <scalar_int_mode> (mode), false); > > } > > > > /* Generate code for computing expression EXP, > > diff --git a/gcc/expr.h b/gcc/expr.h > > index a4f44265759..552645e4403 100644 > > --- a/gcc/expr.h > > +++ b/gcc/expr.h > > @@ -108,13 +108,13 @@ enum block_op_methods > > }; > > > > typedef rtx (*by_pieces_constfn) (void *, void *, HOST_WIDE_INT, > > - scalar_int_mode); > > + machine_mode); > > > > /* The second pointer passed to by_pieces_constfn. */ > > struct by_pieces_prev > > { > > rtx data; > > - scalar_int_mode mode; > > + machine_mode mode; > > }; > > > > extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods); > > diff --git a/gcc/target.def b/gcc/target.def > > index 2e40448e6c5..cc4aa3a4212 100644 > > --- a/gcc/target.def > > +++ b/gcc/target.def > > @@ -2726,6 +2726,13 @@ DEFHOOK > > rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code), > > NULL) > > > > +DEFHOOK > > +(gen_memset_scratch_rtx, > > + "This hook should return an rtx for scratch register in @var{mode} to\n\ > > +be used by memset broadcast. The default is @code{gen_reg_rtx}.", > > + rtx, (machine_mode mode), > > + gen_reg_rtx) > > + > > /* Return a new value for loop unroll size. */ > > DEFHOOK > > (loop_unroll_adjust, > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c > > index b6dbcf7809b..007e79f91b0 100644 > > --- a/gcc/testsuite/gcc.target/i386/pr100865-3.c > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c > > @@ -10,6 +10,6 @@ foo (void) > > } > > > > /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ > > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */ > > +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */ > > /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */ > > /* { dg-final { scan-assembler-not "vmovdqa" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c > > index f41e6147b4c..1e50dc842bc 100644 > > --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c > > @@ -4,6 +4,6 @@ > > #include "pr100865-4a.c" > > > > /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ > > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */ > > +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */ > > /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */ > > /* { dg-final { scan-assembler-not "vmovdqa" } } */
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: >> > + { >> > + /* First generate subreg of word mode if the previous mode is >> > + wider than word mode and word mode is wider than MODE. */ >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, >> > + prev_mode, 0); >> > + prev_mode = word_mode; >> > + } >> > + if (prev_rtx != nullptr) >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); >> >> This should be lowpart_subreg, since 0 isn't the right offset for >> big-endian targets. Using lowpart_subreg should also avoid the need >> for the word_size “if” above: lowpart_subreg can handle lowpart subword >> subregs of multiword values. > > I tried it. It didn't work since it caused the LRA failure. I replaced > simplify_gen_subreg with lowpart_subreg instead. What specifically went wrong? >> > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) >> > + bytes from constant string DATA + OFFSET and return it as target >> > + constant. If PREV isn't nullptr, it has the RTL info from the >> > + previous iteration. */ >> > + >> > +rtx >> > +builtin_memset_read_str (void *data, void *prev, >> > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED, >> > + machine_mode mode) >> > +{ >> > + rtx target; >> > const char *c = (const char *) data; >> > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); >> > + char *p; >> > + unsigned int size = GET_MODE_SIZE (mode).to_constant (); >> > + >> > + /* Don't use the previous value if size is 1. */ >> >> Why not though? The existing code does, and that seems like the right >> thing to do when operating on integers. >> >> I can see the check would be a good thing to do if MODE isn't a vector >> mode and the previous mode was. Is that the case you're worried about? >> If so, that check could go in gen_memset_value_from_prev instead. > > We are storing one byte. Doing it directly is faster. But the first thing being protected here is… >> > + if (size != 1) >> > + { >> > + target = gen_memset_value_from_prev (prev, mode); >> > + if (target != nullptr) >> > + return target; …this attempt to use the previous value. If the target uses, say, SImode for the first piece and QImode for a final byte, using the QImode lowpart of the SImode register would avoid having to move the byte value into a separate QImode register. Why's that a bad thing to do? AFAICT it's what the current code would do, so if we want to change it even for integer modes, I think it should be a separate patch with a separate justification. Like I say, I can understand that using the QImode lowpart of a vector wouldn't be a good idea. But if that's specifically what you're trying to prevent, I think we should test for it. Thanks, Richard
On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford <richard.sandiford@arm.com> wrote: > > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > >> > + { > >> > + /* First generate subreg of word mode if the previous mode is > >> > + wider than word mode and word mode is wider than MODE. */ > >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, > >> > + prev_mode, 0); > >> > + prev_mode = word_mode; > >> > + } > >> > + if (prev_rtx != nullptr) > >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); > >> > >> This should be lowpart_subreg, since 0 isn't the right offset for > >> big-endian targets. Using lowpart_subreg should also avoid the need > >> for the word_size “if” above: lowpart_subreg can handle lowpart subword > >> subregs of multiword values. > > > > I tried it. It didn't work since it caused the LRA failure. I replaced > > simplify_gen_subreg with lowpart_subreg instead. > > What specifically went wrong? With vector broadcast, for --- extern void *ops; void foo (int c) { __builtin_memset (ops, c, 18); } --- we generate HI from V16QI. With a single lowpart_subreg, I get (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void *)ops.0_1]+16 S2 A8]) (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} (nil)) instead of (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void *)ops.0_1]+16 S2 A8]) (subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} (nil)) IRA and LRA fail to reload: (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void *)ops.0_1]+16 S2 A8]) (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} (nil)) since ix86_can_change_mode_class has if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass)) { /* Vector registers do not support QI or HImode loads. If we don't disallow a change to these modes, reload will assume it's ok to drop the subreg from (subreg:SI (reg:HI 100) 0). This affects the vec_dupv4hi pattern. */ if (GET_MODE_SIZE (from) < 4) return false; } If we don't use a hard scratch register, (subreg:HI (reg:V16QI)) compiles. But codegen is worse: vmovd %edi, %xmm0 vpbroadcastb %xmm0, %xmm0 vmovdqa %xmm0, -24(%rsp) movq ops(%rip), %rax movzwl -24(%rsp), %edx vmovdqu %xmm0, (%rax) movw %dx, 16(%rax) vs vmovd %edi, %xmm15 movq ops(%rip), %rax vpbroadcastb %xmm15, %xmm15 vmovq %xmm15, %rdx movw %dx, 16(%rax) vmovdqu %xmm15, (%rax) > >> > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > >> > + bytes from constant string DATA + OFFSET and return it as target > >> > + constant. If PREV isn't nullptr, it has the RTL info from the > >> > + previous iteration. */ > >> > + > >> > +rtx > >> > +builtin_memset_read_str (void *data, void *prev, > >> > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > >> > + machine_mode mode) > >> > +{ > >> > + rtx target; > >> > const char *c = (const char *) data; > >> > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); > >> > + char *p; > >> > + unsigned int size = GET_MODE_SIZE (mode).to_constant (); > >> > + > >> > + /* Don't use the previous value if size is 1. */ > >> > >> Why not though? The existing code does, and that seems like the right > >> thing to do when operating on integers. > >> > >> I can see the check would be a good thing to do if MODE isn't a vector > >> mode and the previous mode was. Is that the case you're worried about? > >> If so, that check could go in gen_memset_value_from_prev instead. > > > > We are storing one byte. Doing it directly is faster. > > But the first thing being protected here is… > > >> > + if (size != 1) > >> > + { > >> > + target = gen_memset_value_from_prev (prev, mode); > >> > + if (target != nullptr) > >> > + return target; > > …this attempt to use the previous value. If the target uses, say, > SImode for the first piece and QImode for a final byte, using the QImode > lowpart of the SImode register would avoid having to move the byte value > into a separate QImode register. Why's that a bad thing to do? AFAICT > it's what the current code would do, so if we want to change it even for > integer modes, I think it should be a separate patch with a separate > justification. I removed the size == 1 check. I didn't notice any issues. > Like I say, I can understand that using the QImode lowpart of a vector > wouldn't be a good idea. But if that's specifically what you're trying > to prevent, I think we should test for it. > > Thanks, > Richard I will submit the v3 patch. Thanks.
On Tue, Jul 20, 2021 at 5:48 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford > <richard.sandiford@arm.com> wrote: > > > > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > > >> > + { > > >> > + /* First generate subreg of word mode if the previous mode is > > >> > + wider than word mode and word mode is wider than MODE. */ > > >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, > > >> > + prev_mode, 0); > > >> > + prev_mode = word_mode; > > >> > + } > > >> > + if (prev_rtx != nullptr) > > >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); > > >> > > >> This should be lowpart_subreg, since 0 isn't the right offset for > > >> big-endian targets. Using lowpart_subreg should also avoid the need > > >> for the word_size “if” above: lowpart_subreg can handle lowpart subword > > >> subregs of multiword values. > > > > > > I tried it. It didn't work since it caused the LRA failure. I replaced > > > simplify_gen_subreg with lowpart_subreg instead. > > > > What specifically went wrong? > > With vector broadcast, for > --- > extern void *ops; > > void > foo (int c) > { > __builtin_memset (ops, c, 18); > } > --- > we generate HI from V16QI. With a single lowpart_subreg, I get > > (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) > (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void > *)ops.0_1]+16 S2 A8]) > (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} > (nil)) > > instead of > > (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) > (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void > *)ops.0_1]+16 S2 A8]) > (subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} > (nil)) > > IRA and LRA fail to reload: > > (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) > (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void > *)ops.0_1]+16 S2 A8]) > (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} > (nil)) > > since ix86_can_change_mode_class has > > if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass)) > { > /* Vector registers do not support QI or HImode loads. If we don't > disallow a change to these modes, reload will assume it's ok to > drop the subreg from (subreg:SI (reg:HI 100) 0). This affects > the vec_dupv4hi pattern. */ > if (GET_MODE_SIZE (from) < 4) > return false; > } Correction. It is ix86_hard_regno_mode_ok which doesn't allow HImode in XMM registers. > If we don't use a hard scratch register, (subreg:HI (reg:V16QI)) compiles. But > codegen is worse: > > vmovd %edi, %xmm0 > vpbroadcastb %xmm0, %xmm0 > vmovdqa %xmm0, -24(%rsp) > movq ops(%rip), %rax > movzwl -24(%rsp), %edx > vmovdqu %xmm0, (%rax) > movw %dx, 16(%rax) > > vs > > vmovd %edi, %xmm15 > movq ops(%rip), %rax > vpbroadcastb %xmm15, %xmm15 > vmovq %xmm15, %rdx > movw %dx, 16(%rax) > vmovdqu %xmm15, (%rax) > > > >> > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > > >> > + bytes from constant string DATA + OFFSET and return it as target > > >> > + constant. If PREV isn't nullptr, it has the RTL info from the > > >> > + previous iteration. */ > > >> > + > > >> > +rtx > > >> > +builtin_memset_read_str (void *data, void *prev, > > >> > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > > >> > + machine_mode mode) > > >> > +{ > > >> > + rtx target; > > >> > const char *c = (const char *) data; > > >> > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); > > >> > + char *p; > > >> > + unsigned int size = GET_MODE_SIZE (mode).to_constant (); > > >> > + > > >> > + /* Don't use the previous value if size is 1. */ > > >> > > >> Why not though? The existing code does, and that seems like the right > > >> thing to do when operating on integers. > > >> > > >> I can see the check would be a good thing to do if MODE isn't a vector > > >> mode and the previous mode was. Is that the case you're worried about? > > >> If so, that check could go in gen_memset_value_from_prev instead. > > > > > > We are storing one byte. Doing it directly is faster. > > > > But the first thing being protected here is… > > > > >> > + if (size != 1) > > >> > + { > > >> > + target = gen_memset_value_from_prev (prev, mode); > > >> > + if (target != nullptr) > > >> > + return target; > > > > …this attempt to use the previous value. If the target uses, say, > > SImode for the first piece and QImode for a final byte, using the QImode > > lowpart of the SImode register would avoid having to move the byte value > > into a separate QImode register. Why's that a bad thing to do? AFAICT > > it's what the current code would do, so if we want to change it even for > > integer modes, I think it should be a separate patch with a separate > > justification. > > I removed the size == 1 check. I didn't notice any issues. > > > Like I say, I can understand that using the QImode lowpart of a vector > > wouldn't be a good idea. But if that's specifically what you're trying > > to prevent, I think we should test for it. > > > > Thanks, > > Richard > > I will submit the v3 patch. > > Thanks. > > -- > H.J.
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford > <richard.sandiford@arm.com> wrote: >> >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: >> >> > + { >> >> > + /* First generate subreg of word mode if the previous mode is >> >> > + wider than word mode and word mode is wider than MODE. */ >> >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, >> >> > + prev_mode, 0); >> >> > + prev_mode = word_mode; >> >> > + } >> >> > + if (prev_rtx != nullptr) >> >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); >> >> >> >> This should be lowpart_subreg, since 0 isn't the right offset for >> >> big-endian targets. Using lowpart_subreg should also avoid the need >> >> for the word_size “if” above: lowpart_subreg can handle lowpart subword >> >> subregs of multiword values. >> > >> > I tried it. It didn't work since it caused the LRA failure. I replaced >> > simplify_gen_subreg with lowpart_subreg instead. >> >> What specifically went wrong? > > With vector broadcast, for > --- > extern void *ops; > > void > foo (int c) > { > __builtin_memset (ops, c, 18); > } > --- > we generate HI from V16QI. With a single lowpart_subreg, I get > > (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) > (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void > *)ops.0_1]+16 S2 A8]) > (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} > (nil)) > > instead of > > (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) > (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void > *)ops.0_1]+16 S2 A8]) > (subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} > (nil)) > > IRA and LRA fail to reload: > > (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) > (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void > *)ops.0_1]+16 S2 A8]) > (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} > (nil)) > > since ix86_can_change_mode_class has > > if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass)) > { > /* Vector registers do not support QI or HImode loads. If we don't > disallow a change to these modes, reload will assume it's ok to > drop the subreg from (subreg:SI (reg:HI 100) 0). This affects > the vec_dupv4hi pattern. */ > if (GET_MODE_SIZE (from) < 4) > return false; > } Ah! OK. In that case, maybe we should have something like: if (REG_P (prev_rtx) && HARD_REGISTER_P (prev_rtx) && REG_CAN_CHANGE_MODE_P (REGNO (prev_rtx), prev->mode, mode)) prev_rtx = copy_to_reg (prev_rtx); and then just have the single lowpart_subreg after that. Thanks, Richard
Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes: > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: >> On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford >> <richard.sandiford@arm.com> wrote: >>> >>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: >>> >> > + { >>> >> > + /* First generate subreg of word mode if the previous mode is >>> >> > + wider than word mode and word mode is wider than MODE. */ >>> >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, >>> >> > + prev_mode, 0); >>> >> > + prev_mode = word_mode; >>> >> > + } >>> >> > + if (prev_rtx != nullptr) >>> >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); >>> >> >>> >> This should be lowpart_subreg, since 0 isn't the right offset for >>> >> big-endian targets. Using lowpart_subreg should also avoid the need >>> >> for the word_size “if” above: lowpart_subreg can handle lowpart subword >>> >> subregs of multiword values. >>> > >>> > I tried it. It didn't work since it caused the LRA failure. I replaced >>> > simplify_gen_subreg with lowpart_subreg instead. >>> >>> What specifically went wrong? >> >> With vector broadcast, for >> --- >> extern void *ops; >> >> void >> foo (int c) >> { >> __builtin_memset (ops, c, 18); >> } >> --- >> we generate HI from V16QI. With a single lowpart_subreg, I get >> >> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) >> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void >> *)ops.0_1]+16 S2 A8]) >> (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} >> (nil)) >> >> instead of >> >> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) >> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void >> *)ops.0_1]+16 S2 A8]) >> (subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} >> (nil)) >> >> IRA and LRA fail to reload: >> >> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) >> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void >> *)ops.0_1]+16 S2 A8]) >> (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} >> (nil)) >> >> since ix86_can_change_mode_class has >> >> if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass)) >> { >> /* Vector registers do not support QI or HImode loads. If we don't >> disallow a change to these modes, reload will assume it's ok to >> drop the subreg from (subreg:SI (reg:HI 100) 0). This affects >> the vec_dupv4hi pattern. */ >> if (GET_MODE_SIZE (from) < 4) >> return false; >> } > > Ah! OK. In that case, maybe we should have something like: > > if (REG_P (prev_rtx) > && HARD_REGISTER_P (prev_rtx) > && REG_CAN_CHANGE_MODE_P (REGNO (prev_rtx), prev->mode, mode)) Sorry, make that last line: && lowpart_subreg_regno (REGNO (prev_rtx), prev->mode, mode) < 0 where lowpart_subreg_regno is a new wrapper around simplify_subreg_regno that uses subreg_lowpart_offset (mode, prev->mode) as the offset. Thanks, Richard > prev_rtx = copy_to_reg (prev_rtx); > > and then just have the single lowpart_subreg after that. > > Thanks, > Richard
On Tue, Jul 20, 2021 at 8:12 AM Richard Sandiford <richard.sandiford@arm.com> wrote: > > Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes: > > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > >> On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford > >> <richard.sandiford@arm.com> wrote: > >>> > >>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > >>> >> > + { > >>> >> > + /* First generate subreg of word mode if the previous mode is > >>> >> > + wider than word mode and word mode is wider than MODE. */ > >>> >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, > >>> >> > + prev_mode, 0); > >>> >> > + prev_mode = word_mode; > >>> >> > + } > >>> >> > + if (prev_rtx != nullptr) > >>> >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); > >>> >> > >>> >> This should be lowpart_subreg, since 0 isn't the right offset for > >>> >> big-endian targets. Using lowpart_subreg should also avoid the need > >>> >> for the word_size “if” above: lowpart_subreg can handle lowpart subword > >>> >> subregs of multiword values. > >>> > > >>> > I tried it. It didn't work since it caused the LRA failure. I replaced > >>> > simplify_gen_subreg with lowpart_subreg instead. > >>> > >>> What specifically went wrong? > >> > >> With vector broadcast, for > >> --- > >> extern void *ops; > >> > >> void > >> foo (int c) > >> { > >> __builtin_memset (ops, c, 18); > >> } > >> --- > >> we generate HI from V16QI. With a single lowpart_subreg, I get > >> > >> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) > >> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void > >> *)ops.0_1]+16 S2 A8]) > >> (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} > >> (nil)) > >> > >> instead of > >> > >> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) > >> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void > >> *)ops.0_1]+16 S2 A8]) > >> (subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} > >> (nil)) > >> > >> IRA and LRA fail to reload: > >> > >> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) > >> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void > >> *)ops.0_1]+16 S2 A8]) > >> (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal} > >> (nil)) > >> > >> since ix86_can_change_mode_class has > >> > >> if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass)) > >> { > >> /* Vector registers do not support QI or HImode loads. If we don't > >> disallow a change to these modes, reload will assume it's ok to > >> drop the subreg from (subreg:SI (reg:HI 100) 0). This affects > >> the vec_dupv4hi pattern. */ > >> if (GET_MODE_SIZE (from) < 4) > >> return false; > >> } > > > > Ah! OK. In that case, maybe we should have something like: > > > > if (REG_P (prev_rtx) > > && HARD_REGISTER_P (prev_rtx) > > && REG_CAN_CHANGE_MODE_P (REGNO (prev_rtx), prev->mode, mode)) > > Sorry, make that last line: > > && lowpart_subreg_regno (REGNO (prev_rtx), prev->mode, mode) < 0 > > where lowpart_subreg_regno is a new wrapper around simplify_subreg_regno > that uses subreg_lowpart_offset (mode, prev->mode) as the offset. Fixed. I submitted the v3 patch: https://gcc.gnu.org/pipermail/gcc-patches/2021-July/575670.html Thanks. > Thanks, > Richard > > > prev_rtx = copy_to_reg (prev_rtx); > > > > and then just have the single lowpart_subreg after that. > > > > Thanks, > > Richard
From 171f0a919300a8aac58dfe5fc9904eaa8fe0a05b Mon Sep 17 00:00:00 2001 From: "H.J. Lu" <hjl.tools@gmail.com> Date: Sun, 6 Mar 2016 06:38:21 -0800 Subject: [PATCH] Add QI vector mode support to by-pieces for memset 1. Replace scalar_int_mode with machine_mode in the by-pieces infrastructure to allow non-integer mode. 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size to return QI vector mode for memset. 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest integer or QI vector mode. gcc/ PR middle-end/90773 * builtins.c (builtin_memcpy_read_str): Change the mode argument from scalar_int_mode to machine_mode. (builtin_strncpy_read_str): Likewise. (gen_memset_value_from_prev): New function. (gen_memset_broadcast): Likewise. (builtin_memset_read_str): Change the mode argument from scalar_int_mode to machine_mode. Use gen_memset_value_from_prev and gen_memset_broadcast. (builtin_memset_gen_str): Likewise. (try_store_by_multiple_pieces): Use by_pieces_constfn to declare constfun. * builtins.h (builtin_strncpy_read_str): Updated. (builtin_memset_read_str): Likewise. * expr.c (widest_int_mode_for_size): Renamed to ... (widest_int_or_QI_vector_mode_for_size): Add a bool argument to indicate if QI vector mode can be used. (by_pieces_ninsns): Call widest_int_or_QI_vector_mode_for_size instead of widest_int_mode_for_size. (pieces_addr::adjust): Change the mode argument from scalar_int_mode to machine_mode. (op_by_pieces_d): Add a bool member, m_qi_vector_mode, to indicate that QI vector mode can be used. (op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize m_qi_vector_mode. Call widest_int_or_QI_vector_mode_for_size instead of widest_int_mode_for_size. (op_by_pieces_d::get_usable_mode): Change the mode argument from scalar_int_mode to machine_mode. Call widest_int_or_QI_vector_mode_for_size instead of widest_int_mode_for_size. (op_by_pieces_d::smallest_mode_for_size): New member function to return the smallest integer or QI vector mode. (op_by_pieces_d::run): Call widest_int_or_QI_vector_mode_for_size instead of widest_int_mode_for_size. Call smallest_mode_for_size instead of smallest_int_mode_for_size. (store_by_pieces_d::store_by_pieces_d): Add a bool argument to indicate that QI vector mode can be used and pass it to op_by_pieces_d::op_by_pieces_d. (can_store_by_pieces): Call widest_int_or_QI_vector_mode_for_size instead of widest_int_mode_for_size. (store_by_pieces::store_by_pieces): Pass memsetp to store_by_pieces_d::store_by_pieces_d. (clear_by_pieces_1): Change scalar_int_mode to machine_mode. (string_cst_read_str): Change the mode argument from scalar_int_mode to machine_mode. * expr.h (by_pieces_constfn): Change scalar_int_mode to machine_mode. (by_pieces_prev): Likewise. * target.def (gen_memset_scratch_rtx): New hook. * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. * doc/tm.texi: Regenerated. gcc/testsuite/ * gcc.target/i386/pr100865-3.c: Expect vmovdqu8 instead of vmovdqu. * gcc.target/i386/pr100865-4b.c: Likewise. --- gcc/builtins.c | 138 ++++++++++++++----- gcc/builtins.h | 4 +- gcc/doc/tm.texi | 5 + gcc/doc/tm.texi.in | 2 + gcc/expr.c | 145 ++++++++++++++------ gcc/expr.h | 4 +- gcc/target.def | 7 + gcc/testsuite/gcc.target/i386/pr100865-3.c | 2 +- gcc/testsuite/gcc.target/i386/pr100865-4b.c | 2 +- 9 files changed, 229 insertions(+), 80 deletions(-) diff --git a/gcc/builtins.c b/gcc/builtins.c index 39ab139b7e1..876378b467c 100644 --- a/gcc/builtins.c +++ b/gcc/builtins.c @@ -3890,13 +3890,14 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode) static rtx builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset, - scalar_int_mode mode) + machine_mode mode) { /* The REPresentation pointed to by DATA need not be a nul-terminated string but the caller guarantees it's large enough for MODE. */ const char *rep = (const char *) data; - return c_readstr (rep + offset, mode, /*nul_terminated=*/false); + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode), + /*nul_terminated=*/false); } /* LEN specify length of the block of memcpy/memset operation. @@ -6478,14 +6479,14 @@ expand_builtin_stpncpy (tree exp, rtx) rtx builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset, - scalar_int_mode mode) + machine_mode mode) { const char *str = (const char *) data; if ((unsigned HOST_WIDE_INT) offset > strlen (str)) return const0_rtx; - return c_readstr (str + offset, mode); + return c_readstr (str + offset, as_a <scalar_int_mode> (mode)); } /* Helper to check the sizes of sequences and the destination of calls @@ -6686,30 +6687,109 @@ expand_builtin_strncpy (tree exp, rtx target) return NULL_RTX; } -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) - bytes from constant string DATA + OFFSET and return it as target - constant. If PREV isn't nullptr, it has the RTL info from the +/* Return the RTL of a register in MODE generated from PREV in the previous iteration. */ -rtx -builtin_memset_read_str (void *data, void *prevp, - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, - scalar_int_mode mode) +static rtx +gen_memset_value_from_prev (void *prevp, machine_mode mode) { + rtx target = nullptr; by_pieces_prev *prev = (by_pieces_prev *) prevp; if (prev != nullptr && prev->data != nullptr) { /* Use the previous data in the same mode. */ if (prev->mode == mode) return prev->data; + + rtx prev_rtx = prev->data; + machine_mode prev_mode = prev->mode; + unsigned int word_size = GET_MODE_SIZE (word_mode); + if (word_size < GET_MODE_SIZE (prev->mode).to_constant () + && word_size > GET_MODE_SIZE (mode).to_constant ()) + { + /* First generate subreg of word mode if the previous mode is + wider than word mode and word mode is wider than MODE. */ + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, + prev_mode, 0); + prev_mode = word_mode; + } + if (prev_rtx != nullptr) + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); + } + return target; +} + +/* Return the RTL of a register in MODE broadcasted from DATA. */ + +static rtx +gen_memset_broadcast (rtx data, machine_mode mode) +{ + /* Skip if regno_reg_rtx isn't initialized or not vector mode. */ + if (!regno_reg_rtx || !VECTOR_MODE_P (mode)) + return nullptr; + + gcc_assert (GET_MODE_INNER (mode) == QImode); + + enum insn_code icode = optab_handler (vec_duplicate_optab, mode); + + gcc_assert (icode != CODE_FOR_nothing); + + rtx target = targetm.gen_memset_scratch_rtx (mode); + if (CONST_INT_P (data)) + { + /* Use the move expander with CONST_VECTOR. */ + rtx const_vec = gen_const_vec_duplicate (mode, data); + emit_move_insn (target, const_vec); + } + else + { + + class expand_operand ops[2]; + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], data, QImode); + expand_insn (icode, 2, ops); + if (!rtx_equal_p (target, ops[0].value)) + emit_move_insn (target, ops[0].value); } + return target; +} + +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) + bytes from constant string DATA + OFFSET and return it as target + constant. If PREV isn't nullptr, it has the RTL info from the + previous iteration. */ + +rtx +builtin_memset_read_str (void *data, void *prev, + HOST_WIDE_INT offset ATTRIBUTE_UNUSED, + machine_mode mode) +{ + rtx target; const char *c = (const char *) data; - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); + char *p; + unsigned int size = GET_MODE_SIZE (mode).to_constant (); + + /* Don't use the previous value if size is 1. */ + if (size != 1) + { + target = gen_memset_value_from_prev (prev, mode); + if (target != nullptr) + return target; + + p = XALLOCAVEC (char, GET_MODE_SIZE (QImode)); + memset (p, *c, GET_MODE_SIZE (QImode)); + rtx src = c_readstr (p, QImode); + target = gen_memset_broadcast (src, mode); + if (target != nullptr) + return target; + } + + p = XALLOCAVEC (char, size); - memset (p, *c, GET_MODE_SIZE (mode)); + memset (p, *c, size); - return c_readstr (p, mode); + return c_readstr (p, as_a <scalar_int_mode> (mode)); } /* Callback routine for store_by_pieces. Return the RTL of a register @@ -6719,33 +6799,29 @@ builtin_memset_read_str (void *data, void *prevp, nullptr, it has the RTL info from the previous iteration. */ static rtx -builtin_memset_gen_str (void *data, void *prevp, +builtin_memset_gen_str (void *data, void *prev, HOST_WIDE_INT offset ATTRIBUTE_UNUSED, - scalar_int_mode mode) + machine_mode mode) { rtx target, coeff; size_t size; char *p; - by_pieces_prev *prev = (by_pieces_prev *) prevp; - if (prev != nullptr && prev->data != nullptr) - { - /* Use the previous data in the same mode. */ - if (prev->mode == mode) - return prev->data; - - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0); - if (target != nullptr) - return target; - } - - size = GET_MODE_SIZE (mode); + size = GET_MODE_SIZE (mode).to_constant (); if (size == 1) return (rtx) data; + target = gen_memset_value_from_prev (prev, mode); + if (target != nullptr) + return target; + + target = gen_memset_broadcast ((rtx) data, mode); + if (target != nullptr) + return target; + p = XALLOCAVEC (char, size); memset (p, 1, size); - coeff = c_readstr (p, mode); + coeff = c_readstr (p, as_a <scalar_int_mode> (mode)); target = convert_to_mode (mode, (rtx) data, 1); target = expand_mult (mode, target, coeff, NULL_RTX, 1); @@ -6849,7 +6925,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, &valc, align, true)) return false; - rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode); + by_pieces_constfn constfun; void *constfundata; if (val) { diff --git a/gcc/builtins.h b/gcc/builtins.h index a64ece3f1cd..31d07621750 100644 --- a/gcc/builtins.h +++ b/gcc/builtins.h @@ -111,9 +111,9 @@ extern tree mathfn_built_in (tree, enum built_in_function fn); extern tree mathfn_built_in (tree, combined_fn); extern tree mathfn_built_in_type (combined_fn); extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT, - scalar_int_mode); + machine_mode); extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT, - scalar_int_mode); + machine_mode); extern rtx expand_builtin_saveregs (void); extern tree std_build_builtin_va_list (void); extern tree std_fn_abi_va_list (tree); diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 3ad39443eba..c1995270720 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -12123,6 +12123,11 @@ This function prepares to emit a conditional comparison within a sequence @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares. @end deftypefn +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode}) +This hook should return an rtx for scratch register in @var{mode} to +be used by memset broadcast. The default is @code{gen_reg_rtx}. +@end deftypefn + @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop}) This target hook returns a new value for the number of times @var{loop} should be unrolled. The parameter @var{nunroll} is the number of times diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index f881cdabe9e..a6bbf4f2667 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -7958,6 +7958,8 @@ lists. @hook TARGET_GEN_CCMP_NEXT +@hook TARGET_GEN_MEMSET_SCRATCH_RTX + @hook TARGET_LOOP_UNROLL_ADJUST @defmac POWI_MAX_MULTS diff --git a/gcc/expr.c b/gcc/expr.c index 6a4368113c4..92dbf47194e 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -769,15 +769,36 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align) return align; } -/* Return the widest integer mode that is narrower than SIZE bytes. */ +/* Return the widest integer or QI vector mode that is narrower than + SIZE bytes. */ -static scalar_int_mode -widest_int_mode_for_size (unsigned int size) +static machine_mode +widest_int_or_QI_vector_mode_for_size (unsigned int size, + bool qi_vector) { - scalar_int_mode result = NARROWEST_INT_MODE; + machine_mode result = NARROWEST_INT_MODE; gcc_checking_assert (size > 1); + /* Use QI vector only if size is wider than a WORD. */ + if (qi_vector && size > GET_MODE_SIZE (word_mode)) + { + machine_mode mode; + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) + if (GET_MODE_INNER (mode) == QImode + && GET_MODE_SIZE (mode).is_constant ()) + { + if (GET_MODE_SIZE (mode).to_constant () >= size) + break; + if (optab_handler (vec_duplicate_optab, mode) + != CODE_FOR_nothing) + result = mode; + } + + if (result != NARROWEST_INT_MODE) + return result; + } + opt_scalar_int_mode tmode; FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT) if (GET_MODE_SIZE (tmode.require ()) < size) @@ -815,16 +836,18 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align, unsigned int max_size, by_pieces_operation op) { unsigned HOST_WIDE_INT n_insns = 0; - scalar_int_mode mode; + machine_mode mode; if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES) { /* NB: Round up L and ALIGN to the widest integer mode for MAX_SIZE. */ - mode = widest_int_mode_for_size (max_size); + mode = widest_int_or_QI_vector_mode_for_size (max_size, + op == SET_BY_PIECES); if (optab_handler (mov_optab, mode) != CODE_FOR_nothing) { - unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode)); + unsigned HOST_WIDE_INT up = ROUND_UP (l, + GET_MODE_SIZE (mode).to_constant ()); if (up > l) l = up; align = GET_MODE_ALIGNMENT (mode); @@ -835,10 +858,11 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align, while (max_size > 1 && l > 0) { - mode = widest_int_mode_for_size (max_size); + mode = widest_int_or_QI_vector_mode_for_size (max_size, + op == SET_BY_PIECES); enum insn_code icode; - unsigned int modesize = GET_MODE_SIZE (mode); + unsigned int modesize = GET_MODE_SIZE (mode).to_constant (); icode = optab_handler (mov_optab, mode); if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode)) @@ -903,8 +927,7 @@ class pieces_addr void *m_cfndata; public: pieces_addr (rtx, bool, by_pieces_constfn, void *); - rtx adjust (scalar_int_mode, HOST_WIDE_INT, - by_pieces_prev * = nullptr); + rtx adjust (machine_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr); void increment_address (HOST_WIDE_INT); void maybe_predec (HOST_WIDE_INT); void maybe_postinc (HOST_WIDE_INT); @@ -1006,7 +1029,7 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse, but we still modify the MEM's properties. */ rtx -pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset, +pieces_addr::adjust (machine_mode mode, HOST_WIDE_INT offset, by_pieces_prev *prev) { if (m_constfn) @@ -1060,7 +1083,8 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size) class op_by_pieces_d { private: - scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int); + machine_mode get_usable_mode (machine_mode, unsigned int); + machine_mode smallest_mode_for_size (unsigned int); protected: pieces_addr m_to, m_from; @@ -1073,6 +1097,8 @@ class op_by_pieces_d bool m_push; /* True if targetm.overlap_op_by_pieces_p () returns true. */ bool m_overlap_op_by_pieces; + /* True if QI vector mode can be used. */ + bool m_qi_vector_mode; /* Virtual functions, overriden by derived classes for the specific operation. */ @@ -1084,7 +1110,8 @@ class op_by_pieces_d public: op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *, - unsigned HOST_WIDE_INT, unsigned int, bool); + unsigned HOST_WIDE_INT, unsigned int, bool, + bool = false); void run (); }; @@ -1099,11 +1126,12 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, by_pieces_constfn from_cfn, void *from_cfn_data, unsigned HOST_WIDE_INT len, - unsigned int align, bool push) + unsigned int align, bool push, + bool qi_vector_mode) : m_to (to, to_load, NULL, NULL), m_from (from, from_load, from_cfn, from_cfn_data), m_len (len), m_max_size (MOVE_MAX_PIECES + 1), - m_push (push) + m_push (push), m_qi_vector_mode (qi_vector_mode) { int toi = m_to.get_addr_inc (); int fromi = m_from.get_addr_inc (); @@ -1124,7 +1152,9 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2) { /* Find the mode of the largest comparison. */ - scalar_int_mode mode = widest_int_mode_for_size (m_max_size); + machine_mode mode + = widest_int_or_QI_vector_mode_for_size (m_max_size, + m_qi_vector_mode); m_from.decide_autoinc (mode, m_reverse, len); m_to.decide_autoinc (mode, m_reverse, len); @@ -1139,22 +1169,42 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, /* This function returns the largest usable integer mode for LEN bytes whose size is no bigger than size of MODE. */ -scalar_int_mode -op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len) +machine_mode +op_by_pieces_d::get_usable_mode (machine_mode mode, unsigned int len) { unsigned int size; do { - size = GET_MODE_SIZE (mode); + size = GET_MODE_SIZE (mode).to_constant (); if (len >= size && prepare_mode (mode, m_align)) break; - /* NB: widest_int_mode_for_size checks SIZE > 1. */ - mode = widest_int_mode_for_size (size); + /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1. */ + mode = widest_int_or_QI_vector_mode_for_size (size, + m_qi_vector_mode); } while (1); return mode; } +/* Return the smallest integer or QI vector mode that is not narrower + than SIZE bits. */ + +machine_mode +op_by_pieces_d::smallest_mode_for_size (unsigned int size) +{ + /* Use QI vector only for > size of WORD. */ + if (m_qi_vector_mode && size > GET_MODE_PRECISION (word_mode)) + { + machine_mode mode; + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) + if (GET_MODE_INNER (mode) == QImode + && known_ge (GET_MODE_PRECISION (mode), size)) + return mode; + } + + return smallest_int_mode_for_size (size); +} + /* This function contains the main loop used for expanding a block operation. First move what we can in the largest integer mode, then go to successively smaller modes. For every access, call @@ -1166,8 +1216,10 @@ op_by_pieces_d::run () if (m_len == 0) return; - /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1. */ - scalar_int_mode mode = widest_int_mode_for_size (m_max_size); + /* NB: widest_int_or_QI_vector_mode_for_size checks M_MAX_SIZE > 1. */ + machine_mode mode + = widest_int_or_QI_vector_mode_for_size (m_max_size, + m_qi_vector_mode); mode = get_usable_mode (mode, m_len); by_pieces_prev to_prev = { nullptr, mode }; @@ -1175,7 +1227,7 @@ op_by_pieces_d::run () do { - unsigned int size = GET_MODE_SIZE (mode); + unsigned int size = GET_MODE_SIZE (mode).to_constant (); rtx to1 = NULL_RTX, from1; while (m_len >= size) @@ -1214,9 +1266,10 @@ op_by_pieces_d::run () /* NB: Generate overlapping operations if it is not a stack push since stack push must not overlap. Get the smallest integer mode for M_LEN bytes. */ - mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT); - mode = get_usable_mode (mode, GET_MODE_SIZE (mode)); - int gap = GET_MODE_SIZE (mode) - m_len; + mode = smallest_mode_for_size (m_len * BITS_PER_UNIT); + unsigned int mode_size = GET_MODE_SIZE (mode).to_constant (); + mode = get_usable_mode (mode, mode_size); + int gap = mode_size - m_len; if (gap > 0) { /* If size of MODE > M_LEN, generate the last operation @@ -1231,8 +1284,9 @@ op_by_pieces_d::run () } else { - /* NB: widest_int_mode_for_size checks SIZE > 1. */ - mode = widest_int_mode_for_size (size); + /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1. */ + mode = widest_int_or_QI_vector_mode_for_size (size, + m_qi_vector_mode); mode = get_usable_mode (mode, m_len); } } @@ -1355,9 +1409,10 @@ class store_by_pieces_d : public op_by_pieces_d public: store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data, - unsigned HOST_WIDE_INT len, unsigned int align) + unsigned HOST_WIDE_INT len, unsigned int align, + bool qi_vector_mode = false) : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len, - align, false) + align, false, qi_vector_mode) { } rtx finish_retmode (memop_ret); @@ -1446,13 +1501,14 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len, max_size = STORE_MAX_PIECES + 1; while (max_size > 1 && l > 0) { - scalar_int_mode mode = widest_int_mode_for_size (max_size); + machine_mode mode + = widest_int_or_QI_vector_mode_for_size (max_size, memsetp); icode = optab_handler (mov_optab, mode); if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode)) { - unsigned int size = GET_MODE_SIZE (mode); + unsigned int size = GET_MODE_SIZE (mode).to_constant (); while (l >= size) { @@ -1470,7 +1526,7 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len, } } - max_size = GET_MODE_SIZE (mode); + max_size = GET_MODE_SIZE (mode).to_constant (); } /* The code above should have handled everything. */ @@ -1504,7 +1560,8 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len, memsetp ? SET_BY_PIECES : STORE_BY_PIECES, optimize_insn_for_speed_p ())); - store_by_pieces_d data (to, constfun, constfundata, len, align); + store_by_pieces_d data (to, constfun, constfundata, len, align, + memsetp); data.run (); if (retmode != RETURN_BEGIN) @@ -1517,7 +1574,7 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len, Return const0_rtx unconditionally. */ static rtx -clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode) +clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, machine_mode) { return const0_rtx; } @@ -5754,7 +5811,7 @@ emit_storent_insn (rtx to, rtx from) static rtx string_cst_read_str (void *data, void *, HOST_WIDE_INT offset, - scalar_int_mode mode) + machine_mode mode) { tree str = (tree) data; @@ -5762,17 +5819,19 @@ string_cst_read_str (void *data, void *, HOST_WIDE_INT offset, if (offset >= TREE_STRING_LENGTH (str)) return const0_rtx; - if ((unsigned HOST_WIDE_INT) offset + GET_MODE_SIZE (mode) + unsigned int size = GET_MODE_SIZE (mode).to_constant (); + if ((unsigned HOST_WIDE_INT) offset + size > (unsigned HOST_WIDE_INT) TREE_STRING_LENGTH (str)) { - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); + char *p = XALLOCAVEC (char, size); size_t l = TREE_STRING_LENGTH (str) - offset; memcpy (p, TREE_STRING_POINTER (str) + offset, l); - memset (p + l, '\0', GET_MODE_SIZE (mode) - l); - return c_readstr (p, mode, false); + memset (p + l, '\0', size - l); + return c_readstr (p, as_a <scalar_int_mode> (mode), false); } - return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false); + return c_readstr (TREE_STRING_POINTER (str) + offset, + as_a <scalar_int_mode> (mode), false); } /* Generate code for computing expression EXP, diff --git a/gcc/expr.h b/gcc/expr.h index a4f44265759..552645e4403 100644 --- a/gcc/expr.h +++ b/gcc/expr.h @@ -108,13 +108,13 @@ enum block_op_methods }; typedef rtx (*by_pieces_constfn) (void *, void *, HOST_WIDE_INT, - scalar_int_mode); + machine_mode); /* The second pointer passed to by_pieces_constfn. */ struct by_pieces_prev { rtx data; - scalar_int_mode mode; + machine_mode mode; }; extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods); diff --git a/gcc/target.def b/gcc/target.def index 2e40448e6c5..cc4aa3a4212 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -2726,6 +2726,13 @@ DEFHOOK rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code), NULL) +DEFHOOK +(gen_memset_scratch_rtx, + "This hook should return an rtx for scratch register in @var{mode} to\n\ +be used by memset broadcast. The default is @code{gen_reg_rtx}.", + rtx, (machine_mode mode), + gen_reg_rtx) + /* Return a new value for loop unroll size. */ DEFHOOK (loop_unroll_adjust, diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c index b6dbcf7809b..007e79f91b0 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-3.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c @@ -10,6 +10,6 @@ foo (void) } /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */ /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c index f41e6147b4c..1e50dc842bc 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c @@ -4,6 +4,6 @@ #include "pr100865-4a.c" /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */ +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */ /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ -- 2.31.1