diff mbox series

[11a/n] Avoid retrying with the same vector modes

Message ID mpt5zjyvbaz.fsf@arm.com
State New
Headers show
Series [11a/n] Avoid retrying with the same vector modes | expand

Commit Message

Richard Sandiford Nov. 5, 2019, 10:59 a.m. UTC
Patch 12/n makes the AArch64 port add four entries to
autovectorize_vector_modes.  Each entry describes a different
vector mode assignment for vector code that mixes 8-bit, 16-bit,
32-bit and 64-bit elements.  But if (as usual) the vector code has
fewer element sizes than that, we could end up trying the same
combination of vector modes multiple times.  This patch adds a
check to prevent that.

As before: each patch tested individually on aarch64-linux-gnu and the
series as a whole on x86_64-linux-gnu.


2019-11-04  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* tree-vectorizer.h (vec_info::mode_set): New typedef.
	(vec_info::used_vector_mode): New member variable.
	(vect_chooses_same_modes_p): Declare.
	* tree-vect-stmts.c (get_vectype_for_scalar_type): Record each
	chosen vector mode in vec_info::used_vector_mode.
	(vect_chooses_same_modes_p): New function.
	* tree-vect-loop.c (vect_analyze_loop): Use it to avoid trying
	the same vector statements multiple times.
	* tree-vect-slp.c (vect_slp_bb_region): Likewise.

Comments

Richard Biener Nov. 6, 2019, 9:49 a.m. UTC | #1
On Tue, Nov 5, 2019 at 9:25 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Patch 12/n makes the AArch64 port add four entries to
> autovectorize_vector_modes.  Each entry describes a different
> vector mode assignment for vector code that mixes 8-bit, 16-bit,
> 32-bit and 64-bit elements.  But if (as usual) the vector code has
> fewer element sizes than that, we could end up trying the same
> combination of vector modes multiple times.  This patch adds a
> check to prevent that.
>
> As before: each patch tested individually on aarch64-linux-gnu and the
> series as a whole on x86_64-linux-gnu.
>
>
> 2019-11-04  Richard Sandiford  <richard.sandiford@arm.com>
>
> gcc/
>         * tree-vectorizer.h (vec_info::mode_set): New typedef.
>         (vec_info::used_vector_mode): New member variable.
>         (vect_chooses_same_modes_p): Declare.
>         * tree-vect-stmts.c (get_vectype_for_scalar_type): Record each
>         chosen vector mode in vec_info::used_vector_mode.
>         (vect_chooses_same_modes_p): New function.
>         * tree-vect-loop.c (vect_analyze_loop): Use it to avoid trying
>         the same vector statements multiple times.
>         * tree-vect-slp.c (vect_slp_bb_region): Likewise.
>
> Index: gcc/tree-vectorizer.h
> ===================================================================
> --- gcc/tree-vectorizer.h       2019-11-05 10:48:11.246092351 +0000
> +++ gcc/tree-vectorizer.h       2019-11-05 10:57:41.662071145 +0000
> @@ -298,6 +298,7 @@ typedef std::pair<tree, tree> vec_object
>  /* Vectorizer state common between loop and basic-block vectorization.  */
>  class vec_info {
>  public:
> +  typedef hash_set<int_hash<machine_mode, E_VOIDmode, E_BLKmode> > mode_set;
>    enum vec_kind { bb, loop };
>
>    vec_info (vec_kind, void *, vec_info_shared *);
> @@ -335,6 +336,9 @@ typedef std::pair<tree, tree> vec_object
>    /* Cost data used by the target cost model.  */
>    void *target_cost_data;
>
> +  /* The set of vector modes used in the vectorized region.  */
> +  mode_set used_vector_modes;
> +
>    /* The argument we should pass to related_vector_mode when looking up
>       the vector mode for a scalar mode, or VOIDmode if we haven't yet
>       made any decisions about which vector modes to use.  */
> @@ -1615,6 +1619,7 @@ extern tree get_related_vectype_for_scal
>  extern tree get_vectype_for_scalar_type (vec_info *, tree);
>  extern tree get_mask_type_for_scalar_type (vec_info *, tree);
>  extern tree get_same_sized_vectype (tree, tree);
> +extern bool vect_chooses_same_modes_p (vec_info *, machine_mode);
>  extern bool vect_get_loop_mask_type (loop_vec_info);
>  extern bool vect_is_simple_use (tree, vec_info *, enum vect_def_type *,
>                                 stmt_vec_info * = NULL, gimple ** = NULL);
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c       2019-11-05 10:48:11.242092379 +0000
> +++ gcc/tree-vect-stmts.c       2019-11-05 10:57:41.662071145 +0000
> @@ -11235,6 +11235,10 @@ get_vectype_for_scalar_type (vec_info *v
>                                                       scalar_type);
>    if (vectype && vinfo->vector_mode == VOIDmode)
>      vinfo->vector_mode = TYPE_MODE (vectype);
> +
> +  if (vectype)
> +    vinfo->used_vector_modes.add (TYPE_MODE (vectype));
> +

Do we actually end up _using_ all types returned by this function?

Otherwise OK.

Richard.

>    return vectype;
>  }
>
> @@ -11274,6 +11278,20 @@ get_same_sized_vectype (tree scalar_type
>                                               scalar_type, nunits);
>  }
>
> +/* Return true if replacing LOOP_VINFO->vector_mode with VECTOR_MODE
> +   would not change the chosen vector modes.  */
> +
> +bool
> +vect_chooses_same_modes_p (vec_info *vinfo, machine_mode vector_mode)
> +{
> +  for (vec_info::mode_set::iterator i = vinfo->used_vector_modes.begin ();
> +       i != vinfo->used_vector_modes.end (); ++i)
> +    if (!VECTOR_MODE_P (*i)
> +       || related_vector_mode (vector_mode, GET_MODE_INNER (*i), 0) != *i)
> +      return false;
> +  return true;
> +}
> +
>  /* Function vect_is_simple_use.
>
>     Input:
> Index: gcc/tree-vect-loop.c
> ===================================================================
> --- gcc/tree-vect-loop.c        2019-11-05 10:48:11.238092407 +0000
> +++ gcc/tree-vect-loop.c        2019-11-05 10:57:41.658071173 +0000
> @@ -2430,6 +2430,19 @@ vect_analyze_loop (class loop *loop, vec
>         }
>
>        loop->aux = NULL;
> +
> +      if (!fatal)
> +       while (mode_i < vector_modes.length ()
> +              && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
> +         {
> +           if (dump_enabled_p ())
> +             dump_printf_loc (MSG_NOTE, vect_location,
> +                              "***** The result for vector mode %s would"
> +                              " be the same\n",
> +                              GET_MODE_NAME (vector_modes[mode_i]));
> +           mode_i += 1;
> +         }
> +
>        if (res)
>         {
>           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
> Index: gcc/tree-vect-slp.c
> ===================================================================
> --- gcc/tree-vect-slp.c 2019-11-05 10:48:11.242092379 +0000
> +++ gcc/tree-vect-slp.c 2019-11-05 10:57:41.662071145 +0000
> @@ -3238,6 +3238,18 @@ vect_slp_bb_region (gimple_stmt_iterator
>        if (mode_i == 0)
>         autodetected_vector_mode = bb_vinfo->vector_mode;
>
> +      if (!fatal)
> +       while (mode_i < vector_modes.length ()
> +              && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
> +         {
> +           if (dump_enabled_p ())
> +             dump_printf_loc (MSG_NOTE, vect_location,
> +                              "***** The result for vector mode %s would"
> +                              " be the same\n",
> +                              GET_MODE_NAME (vector_modes[mode_i]));
> +           mode_i += 1;
> +         }
> +
>        delete bb_vinfo;
>
>        if (mode_i < vector_modes.length ()
Richard Sandiford Nov. 6, 2019, 10:21 a.m. UTC | #2
Richard Biener <richard.guenther@gmail.com> writes:
> On Tue, Nov 5, 2019 at 9:25 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Patch 12/n makes the AArch64 port add four entries to
>> autovectorize_vector_modes.  Each entry describes a different
>> vector mode assignment for vector code that mixes 8-bit, 16-bit,
>> 32-bit and 64-bit elements.  But if (as usual) the vector code has
>> fewer element sizes than that, we could end up trying the same
>> combination of vector modes multiple times.  This patch adds a
>> check to prevent that.
>>
>> As before: each patch tested individually on aarch64-linux-gnu and the
>> series as a whole on x86_64-linux-gnu.
>>
>>
>> 2019-11-04  Richard Sandiford  <richard.sandiford@arm.com>
>>
>> gcc/
>>         * tree-vectorizer.h (vec_info::mode_set): New typedef.
>>         (vec_info::used_vector_mode): New member variable.
>>         (vect_chooses_same_modes_p): Declare.
>>         * tree-vect-stmts.c (get_vectype_for_scalar_type): Record each
>>         chosen vector mode in vec_info::used_vector_mode.
>>         (vect_chooses_same_modes_p): New function.
>>         * tree-vect-loop.c (vect_analyze_loop): Use it to avoid trying
>>         the same vector statements multiple times.
>>         * tree-vect-slp.c (vect_slp_bb_region): Likewise.
>>
>> Index: gcc/tree-vectorizer.h
>> ===================================================================
>> --- gcc/tree-vectorizer.h       2019-11-05 10:48:11.246092351 +0000
>> +++ gcc/tree-vectorizer.h       2019-11-05 10:57:41.662071145 +0000
>> @@ -298,6 +298,7 @@ typedef std::pair<tree, tree> vec_object
>>  /* Vectorizer state common between loop and basic-block vectorization.  */
>>  class vec_info {
>>  public:
>> +  typedef hash_set<int_hash<machine_mode, E_VOIDmode, E_BLKmode> > mode_set;
>>    enum vec_kind { bb, loop };
>>
>>    vec_info (vec_kind, void *, vec_info_shared *);
>> @@ -335,6 +336,9 @@ typedef std::pair<tree, tree> vec_object
>>    /* Cost data used by the target cost model.  */
>>    void *target_cost_data;
>>
>> +  /* The set of vector modes used in the vectorized region.  */
>> +  mode_set used_vector_modes;
>> +
>>    /* The argument we should pass to related_vector_mode when looking up
>>       the vector mode for a scalar mode, or VOIDmode if we haven't yet
>>       made any decisions about which vector modes to use.  */
>> @@ -1615,6 +1619,7 @@ extern tree get_related_vectype_for_scal
>>  extern tree get_vectype_for_scalar_type (vec_info *, tree);
>>  extern tree get_mask_type_for_scalar_type (vec_info *, tree);
>>  extern tree get_same_sized_vectype (tree, tree);
>> +extern bool vect_chooses_same_modes_p (vec_info *, machine_mode);
>>  extern bool vect_get_loop_mask_type (loop_vec_info);
>>  extern bool vect_is_simple_use (tree, vec_info *, enum vect_def_type *,
>>                                 stmt_vec_info * = NULL, gimple ** = NULL);
>> Index: gcc/tree-vect-stmts.c
>> ===================================================================
>> --- gcc/tree-vect-stmts.c       2019-11-05 10:48:11.242092379 +0000
>> +++ gcc/tree-vect-stmts.c       2019-11-05 10:57:41.662071145 +0000
>> @@ -11235,6 +11235,10 @@ get_vectype_for_scalar_type (vec_info *v
>>                                                       scalar_type);
>>    if (vectype && vinfo->vector_mode == VOIDmode)
>>      vinfo->vector_mode = TYPE_MODE (vectype);
>> +
>> +  if (vectype)
>> +    vinfo->used_vector_modes.add (TYPE_MODE (vectype));
>> +
>
> Do we actually end up _using_ all types returned by this function?

No, not all of them, so it's a bit crude.  E.g. some types might end up
not being relevant after pattern recognition, or after we've made a
final decision about which parts of an address calculation to include
in a gather or scatter op.  So we can still end up retrying the same
thing even after the patch.

The problem is that we're trying to avoid pointless retries on failure
as well as success, so we could end up stopping at arbitrary points.
I wasn't sure where else to handle this.

Thanks,
Richard
Richard Biener Nov. 6, 2019, 10:26 a.m. UTC | #3
On Wed, Nov 6, 2019 at 11:21 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener <richard.guenther@gmail.com> writes:
> > On Tue, Nov 5, 2019 at 9:25 PM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Patch 12/n makes the AArch64 port add four entries to
> >> autovectorize_vector_modes.  Each entry describes a different
> >> vector mode assignment for vector code that mixes 8-bit, 16-bit,
> >> 32-bit and 64-bit elements.  But if (as usual) the vector code has
> >> fewer element sizes than that, we could end up trying the same
> >> combination of vector modes multiple times.  This patch adds a
> >> check to prevent that.
> >>
> >> As before: each patch tested individually on aarch64-linux-gnu and the
> >> series as a whole on x86_64-linux-gnu.
> >>
> >>
> >> 2019-11-04  Richard Sandiford  <richard.sandiford@arm.com>
> >>
> >> gcc/
> >>         * tree-vectorizer.h (vec_info::mode_set): New typedef.
> >>         (vec_info::used_vector_mode): New member variable.
> >>         (vect_chooses_same_modes_p): Declare.
> >>         * tree-vect-stmts.c (get_vectype_for_scalar_type): Record each
> >>         chosen vector mode in vec_info::used_vector_mode.
> >>         (vect_chooses_same_modes_p): New function.
> >>         * tree-vect-loop.c (vect_analyze_loop): Use it to avoid trying
> >>         the same vector statements multiple times.
> >>         * tree-vect-slp.c (vect_slp_bb_region): Likewise.
> >>
> >> Index: gcc/tree-vectorizer.h
> >> ===================================================================
> >> --- gcc/tree-vectorizer.h       2019-11-05 10:48:11.246092351 +0000
> >> +++ gcc/tree-vectorizer.h       2019-11-05 10:57:41.662071145 +0000
> >> @@ -298,6 +298,7 @@ typedef std::pair<tree, tree> vec_object
> >>  /* Vectorizer state common between loop and basic-block vectorization.  */
> >>  class vec_info {
> >>  public:
> >> +  typedef hash_set<int_hash<machine_mode, E_VOIDmode, E_BLKmode> > mode_set;
> >>    enum vec_kind { bb, loop };
> >>
> >>    vec_info (vec_kind, void *, vec_info_shared *);
> >> @@ -335,6 +336,9 @@ typedef std::pair<tree, tree> vec_object
> >>    /* Cost data used by the target cost model.  */
> >>    void *target_cost_data;
> >>
> >> +  /* The set of vector modes used in the vectorized region.  */
> >> +  mode_set used_vector_modes;
> >> +
> >>    /* The argument we should pass to related_vector_mode when looking up
> >>       the vector mode for a scalar mode, or VOIDmode if we haven't yet
> >>       made any decisions about which vector modes to use.  */
> >> @@ -1615,6 +1619,7 @@ extern tree get_related_vectype_for_scal
> >>  extern tree get_vectype_for_scalar_type (vec_info *, tree);
> >>  extern tree get_mask_type_for_scalar_type (vec_info *, tree);
> >>  extern tree get_same_sized_vectype (tree, tree);
> >> +extern bool vect_chooses_same_modes_p (vec_info *, machine_mode);
> >>  extern bool vect_get_loop_mask_type (loop_vec_info);
> >>  extern bool vect_is_simple_use (tree, vec_info *, enum vect_def_type *,
> >>                                 stmt_vec_info * = NULL, gimple ** = NULL);
> >> Index: gcc/tree-vect-stmts.c
> >> ===================================================================
> >> --- gcc/tree-vect-stmts.c       2019-11-05 10:48:11.242092379 +0000
> >> +++ gcc/tree-vect-stmts.c       2019-11-05 10:57:41.662071145 +0000
> >> @@ -11235,6 +11235,10 @@ get_vectype_for_scalar_type (vec_info *v
> >>                                                       scalar_type);
> >>    if (vectype && vinfo->vector_mode == VOIDmode)
> >>      vinfo->vector_mode = TYPE_MODE (vectype);
> >> +
> >> +  if (vectype)
> >> +    vinfo->used_vector_modes.add (TYPE_MODE (vectype));
> >> +
> >
> > Do we actually end up _using_ all types returned by this function?
>
> No, not all of them, so it's a bit crude.  E.g. some types might end up
> not being relevant after pattern recognition, or after we've made a
> final decision about which parts of an address calculation to include
> in a gather or scatter op.  So we can still end up retrying the same
> thing even after the patch.
>
> The problem is that we're trying to avoid pointless retries on failure
> as well as success, so we could end up stopping at arbitrary points.
> I wasn't sure where else to handle this.

Yeah, I think this "iterating" is somewhat bogus (crude) now.  What we'd
like to collect is for all defs the vector types we could use and then
vectorizable_ defines constraints between input and output vector types.
From that we'd arrive at a (possibly quite large) set of "SLP graphs
with vector types"
we'd choose from.  I believe we'll never want to truly explore the whole space
but guess we want to greedily compute those "SLP graphs with vector types"
starting from what (grouped) datarefs tells us is possible (which is
kind of what
we do now).

Richard.

> Thanks,
> Richard
Richard Sandiford Nov. 6, 2019, 11:02 a.m. UTC | #4
Richard Biener <richard.guenther@gmail.com> writes:
> On Wed, Nov 6, 2019 at 11:21 AM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Richard Biener <richard.guenther@gmail.com> writes:
>> > On Tue, Nov 5, 2019 at 9:25 PM Richard Sandiford
>> > <richard.sandiford@arm.com> wrote:
>> >>
>> >> Patch 12/n makes the AArch64 port add four entries to
>> >> autovectorize_vector_modes.  Each entry describes a different
>> >> vector mode assignment for vector code that mixes 8-bit, 16-bit,
>> >> 32-bit and 64-bit elements.  But if (as usual) the vector code has
>> >> fewer element sizes than that, we could end up trying the same
>> >> combination of vector modes multiple times.  This patch adds a
>> >> check to prevent that.
>> >>
>> >> As before: each patch tested individually on aarch64-linux-gnu and the
>> >> series as a whole on x86_64-linux-gnu.
>> >>
>> >>
>> >> 2019-11-04  Richard Sandiford  <richard.sandiford@arm.com>
>> >>
>> >> gcc/
>> >>         * tree-vectorizer.h (vec_info::mode_set): New typedef.
>> >>         (vec_info::used_vector_mode): New member variable.
>> >>         (vect_chooses_same_modes_p): Declare.
>> >>         * tree-vect-stmts.c (get_vectype_for_scalar_type): Record each
>> >>         chosen vector mode in vec_info::used_vector_mode.
>> >>         (vect_chooses_same_modes_p): New function.
>> >>         * tree-vect-loop.c (vect_analyze_loop): Use it to avoid trying
>> >>         the same vector statements multiple times.
>> >>         * tree-vect-slp.c (vect_slp_bb_region): Likewise.
>> >>
>> >> Index: gcc/tree-vectorizer.h
>> >> ===================================================================
>> >> --- gcc/tree-vectorizer.h       2019-11-05 10:48:11.246092351 +0000
>> >> +++ gcc/tree-vectorizer.h       2019-11-05 10:57:41.662071145 +0000
>> >> @@ -298,6 +298,7 @@ typedef std::pair<tree, tree> vec_object
>> >>  /* Vectorizer state common between loop and basic-block vectorization.  */
>> >>  class vec_info {
>> >>  public:
>> >> +  typedef hash_set<int_hash<machine_mode, E_VOIDmode, E_BLKmode> > mode_set;
>> >>    enum vec_kind { bb, loop };
>> >>
>> >>    vec_info (vec_kind, void *, vec_info_shared *);
>> >> @@ -335,6 +336,9 @@ typedef std::pair<tree, tree> vec_object
>> >>    /* Cost data used by the target cost model.  */
>> >>    void *target_cost_data;
>> >>
>> >> +  /* The set of vector modes used in the vectorized region.  */
>> >> +  mode_set used_vector_modes;
>> >> +
>> >>    /* The argument we should pass to related_vector_mode when looking up
>> >>       the vector mode for a scalar mode, or VOIDmode if we haven't yet
>> >>       made any decisions about which vector modes to use.  */
>> >> @@ -1615,6 +1619,7 @@ extern tree get_related_vectype_for_scal
>> >>  extern tree get_vectype_for_scalar_type (vec_info *, tree);
>> >>  extern tree get_mask_type_for_scalar_type (vec_info *, tree);
>> >>  extern tree get_same_sized_vectype (tree, tree);
>> >> +extern bool vect_chooses_same_modes_p (vec_info *, machine_mode);
>> >>  extern bool vect_get_loop_mask_type (loop_vec_info);
>> >>  extern bool vect_is_simple_use (tree, vec_info *, enum vect_def_type *,
>> >>                                 stmt_vec_info * = NULL, gimple ** = NULL);
>> >> Index: gcc/tree-vect-stmts.c
>> >> ===================================================================
>> >> --- gcc/tree-vect-stmts.c       2019-11-05 10:48:11.242092379 +0000
>> >> +++ gcc/tree-vect-stmts.c       2019-11-05 10:57:41.662071145 +0000
>> >> @@ -11235,6 +11235,10 @@ get_vectype_for_scalar_type (vec_info *v
>> >>                                                       scalar_type);
>> >>    if (vectype && vinfo->vector_mode == VOIDmode)
>> >>      vinfo->vector_mode = TYPE_MODE (vectype);
>> >> +
>> >> +  if (vectype)
>> >> +    vinfo->used_vector_modes.add (TYPE_MODE (vectype));
>> >> +
>> >
>> > Do we actually end up _using_ all types returned by this function?
>>
>> No, not all of them, so it's a bit crude.  E.g. some types might end up
>> not being relevant after pattern recognition, or after we've made a
>> final decision about which parts of an address calculation to include
>> in a gather or scatter op.  So we can still end up retrying the same
>> thing even after the patch.
>>
>> The problem is that we're trying to avoid pointless retries on failure
>> as well as success, so we could end up stopping at arbitrary points.
>> I wasn't sure where else to handle this.
>
> Yeah, I think this "iterating" is somewhat bogus (crude) now.

I think it was crude even before the series though. :-)  Not sure the
series is making things worse.

The problem is that there's a chicken-and-egg problem between how
we decide to vectorise and which vector subarchitecture and VF we use.
E.g. if we have:

  unsigned char *x, *y;
  ...
  x[i] = (unsigned short) (x[i] + y[i] + 1) >> 1;

do we build the SLP graph on the assumption that we need to use short
elements, or on the assumption that we can use IFN_AVG_CEIL?  This
affects the VF we get out: using IFN_AVG_CEIL gives double the VF
relative to doing unsigned short arithmetic.

And we need to know which vector subarchitecture we're targetting when
making that decision: e.g. Advanced SIMD and SVE2 have IFN_AVG_CEIL,
but SVE doesn't.  On the other hand, SVE supports things that Advanced
SIMD doesn't.  It's similar story of course for the x86 vector subarchs.

For one pattern like this, we could simply try both ways.
But that becomes untenable if there are multiple potential patterns.
Iterating over the vector subarchs gives us a sensible way of reducing
the search space by only applying patterns that the subarch supports.

So...

> What we'd like to collect is for all defs the vector types we could
> use and then vectorizable_ defines constraints between input and
> output vector types.  From that we'd arrive at a (possibly quite
> large) set of "SLP graphs with vector types" we'd choose from.  I
> believe we'll never want to truly explore the whole space but guess we
> want to greedily compute those "SLP graphs with vector types" starting
> from what (grouped) datarefs tells us is possible (which is kind of
> what we do now).

...I don't think we can/should use the same SLP graph to pick vector
types for all subarchs, since the ideal graph depends on the subarch.
I'm also not sure the vectorizable_* routines could say anything that
isn't obvious from the input and output scalar types.  Won't it still be
the case that within an SLP instance, all scalars of type T will become
the same vector(N) T?

Thanks,
Richard
Richard Biener Nov. 6, 2019, 11:22 a.m. UTC | #5
On Wed, Nov 6, 2019 at 12:02 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener <richard.guenther@gmail.com> writes:
> > On Wed, Nov 6, 2019 at 11:21 AM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Richard Biener <richard.guenther@gmail.com> writes:
> >> > On Tue, Nov 5, 2019 at 9:25 PM Richard Sandiford
> >> > <richard.sandiford@arm.com> wrote:
> >> >>
> >> >> Patch 12/n makes the AArch64 port add four entries to
> >> >> autovectorize_vector_modes.  Each entry describes a different
> >> >> vector mode assignment for vector code that mixes 8-bit, 16-bit,
> >> >> 32-bit and 64-bit elements.  But if (as usual) the vector code has
> >> >> fewer element sizes than that, we could end up trying the same
> >> >> combination of vector modes multiple times.  This patch adds a
> >> >> check to prevent that.
> >> >>
> >> >> As before: each patch tested individually on aarch64-linux-gnu and the
> >> >> series as a whole on x86_64-linux-gnu.
> >> >>
> >> >>
> >> >> 2019-11-04  Richard Sandiford  <richard.sandiford@arm.com>
> >> >>
> >> >> gcc/
> >> >>         * tree-vectorizer.h (vec_info::mode_set): New typedef.
> >> >>         (vec_info::used_vector_mode): New member variable.
> >> >>         (vect_chooses_same_modes_p): Declare.
> >> >>         * tree-vect-stmts.c (get_vectype_for_scalar_type): Record each
> >> >>         chosen vector mode in vec_info::used_vector_mode.
> >> >>         (vect_chooses_same_modes_p): New function.
> >> >>         * tree-vect-loop.c (vect_analyze_loop): Use it to avoid trying
> >> >>         the same vector statements multiple times.
> >> >>         * tree-vect-slp.c (vect_slp_bb_region): Likewise.
> >> >>
> >> >> Index: gcc/tree-vectorizer.h
> >> >> ===================================================================
> >> >> --- gcc/tree-vectorizer.h       2019-11-05 10:48:11.246092351 +0000
> >> >> +++ gcc/tree-vectorizer.h       2019-11-05 10:57:41.662071145 +0000
> >> >> @@ -298,6 +298,7 @@ typedef std::pair<tree, tree> vec_object
> >> >>  /* Vectorizer state common between loop and basic-block vectorization.  */
> >> >>  class vec_info {
> >> >>  public:
> >> >> +  typedef hash_set<int_hash<machine_mode, E_VOIDmode, E_BLKmode> > mode_set;
> >> >>    enum vec_kind { bb, loop };
> >> >>
> >> >>    vec_info (vec_kind, void *, vec_info_shared *);
> >> >> @@ -335,6 +336,9 @@ typedef std::pair<tree, tree> vec_object
> >> >>    /* Cost data used by the target cost model.  */
> >> >>    void *target_cost_data;
> >> >>
> >> >> +  /* The set of vector modes used in the vectorized region.  */
> >> >> +  mode_set used_vector_modes;
> >> >> +
> >> >>    /* The argument we should pass to related_vector_mode when looking up
> >> >>       the vector mode for a scalar mode, or VOIDmode if we haven't yet
> >> >>       made any decisions about which vector modes to use.  */
> >> >> @@ -1615,6 +1619,7 @@ extern tree get_related_vectype_for_scal
> >> >>  extern tree get_vectype_for_scalar_type (vec_info *, tree);
> >> >>  extern tree get_mask_type_for_scalar_type (vec_info *, tree);
> >> >>  extern tree get_same_sized_vectype (tree, tree);
> >> >> +extern bool vect_chooses_same_modes_p (vec_info *, machine_mode);
> >> >>  extern bool vect_get_loop_mask_type (loop_vec_info);
> >> >>  extern bool vect_is_simple_use (tree, vec_info *, enum vect_def_type *,
> >> >>                                 stmt_vec_info * = NULL, gimple ** = NULL);
> >> >> Index: gcc/tree-vect-stmts.c
> >> >> ===================================================================
> >> >> --- gcc/tree-vect-stmts.c       2019-11-05 10:48:11.242092379 +0000
> >> >> +++ gcc/tree-vect-stmts.c       2019-11-05 10:57:41.662071145 +0000
> >> >> @@ -11235,6 +11235,10 @@ get_vectype_for_scalar_type (vec_info *v
> >> >>                                                       scalar_type);
> >> >>    if (vectype && vinfo->vector_mode == VOIDmode)
> >> >>      vinfo->vector_mode = TYPE_MODE (vectype);
> >> >> +
> >> >> +  if (vectype)
> >> >> +    vinfo->used_vector_modes.add (TYPE_MODE (vectype));
> >> >> +
> >> >
> >> > Do we actually end up _using_ all types returned by this function?
> >>
> >> No, not all of them, so it's a bit crude.  E.g. some types might end up
> >> not being relevant after pattern recognition, or after we've made a
> >> final decision about which parts of an address calculation to include
> >> in a gather or scatter op.  So we can still end up retrying the same
> >> thing even after the patch.
> >>
> >> The problem is that we're trying to avoid pointless retries on failure
> >> as well as success, so we could end up stopping at arbitrary points.
> >> I wasn't sure where else to handle this.
> >
> > Yeah, I think this "iterating" is somewhat bogus (crude) now.
>
> I think it was crude even before the series though. :-)  Not sure the
> series is making things worse.
>
> The problem is that there's a chicken-and-egg problem between how
> we decide to vectorise and which vector subarchitecture and VF we use.
> E.g. if we have:
>
>   unsigned char *x, *y;
>   ...
>   x[i] = (unsigned short) (x[i] + y[i] + 1) >> 1;
>
> do we build the SLP graph on the assumption that we need to use short
> elements, or on the assumption that we can use IFN_AVG_CEIL?  This
> affects the VF we get out: using IFN_AVG_CEIL gives double the VF
> relative to doing unsigned short arithmetic.
>
> And we need to know which vector subarchitecture we're targetting when
> making that decision: e.g. Advanced SIMD and SVE2 have IFN_AVG_CEIL,
> but SVE doesn't.  On the other hand, SVE supports things that Advanced
> SIMD doesn't.  It's similar story of course for the x86 vector subarchs.
>
> For one pattern like this, we could simply try both ways.
> But that becomes untenable if there are multiple potential patterns.
> Iterating over the vector subarchs gives us a sensible way of reducing
> the search space by only applying patterns that the subarch supports.
>
> So...
>
> > What we'd like to collect is for all defs the vector types we could
> > use and then vectorizable_ defines constraints between input and
> > output vector types.  From that we'd arrive at a (possibly quite
> > large) set of "SLP graphs with vector types" we'd choose from.  I
> > believe we'll never want to truly explore the whole space but guess we
> > want to greedily compute those "SLP graphs with vector types" starting
> > from what (grouped) datarefs tells us is possible (which is kind of
> > what we do now).
>
> ...I don't think we can/should use the same SLP graph to pick vector
> types for all subarchs, since the ideal graph depends on the subarch.
> I'm also not sure the vectorizable_* routines could say anything that
> isn't obvious from the input and output scalar types.  Won't it still be
> the case that within an SLP instance, all scalars of type T will become
> the same vector(N) T?

Not necessarily.  I can see the SLP graph containing "reductions"
(like those complex patterns proposed).  But yes, at the moment
there's a single group-size per SLP instance.  Now, for the SLP _graph_
we may have one instance with 4 and one with group size of 8 both
for example sharing the same grouped load.  It may make sense to
vectorize the load with v8si and for the group-size 4 SLP instance
have a "reduction operation" that selects the appropriate part of the
loaded vector.

Now, vectorizable_* for say a conversion from int to double
may be able to vectorize for a v4si directly to v4df or to two times v2df.
With the size restriction relaxed vectorizable_* could opt to choose
smaller/larger vectors specifically and thus also have different vector types
for the same scalar type in the SLP graph.  I do expect this to be
profitable on x86 for some loops due to some asymmetries in the ISA
(and extra cost of lane-crossing operations for say AVX where using SSE
is cheaper for some ops even though you now have 2 or more instructions).

Richard.

>
> Thanks,
> Richard
Richard Sandiford Nov. 6, 2019, 12:47 p.m. UTC | #6
Richard Biener <richard.guenther@gmail.com> writes:
> On Wed, Nov 6, 2019 at 12:02 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Richard Biener <richard.guenther@gmail.com> writes:
>> > On Wed, Nov 6, 2019 at 11:21 AM Richard Sandiford
>> > <richard.sandiford@arm.com> wrote:
>> >>
>> >> Richard Biener <richard.guenther@gmail.com> writes:
>> >> > On Tue, Nov 5, 2019 at 9:25 PM Richard Sandiford
>> >> > <richard.sandiford@arm.com> wrote:
>> >> >>
>> >> >> Patch 12/n makes the AArch64 port add four entries to
>> >> >> autovectorize_vector_modes.  Each entry describes a different
>> >> >> vector mode assignment for vector code that mixes 8-bit, 16-bit,
>> >> >> 32-bit and 64-bit elements.  But if (as usual) the vector code has
>> >> >> fewer element sizes than that, we could end up trying the same
>> >> >> combination of vector modes multiple times.  This patch adds a
>> >> >> check to prevent that.
>> >> >>
>> >> >> As before: each patch tested individually on aarch64-linux-gnu and the
>> >> >> series as a whole on x86_64-linux-gnu.
>> >> >>
>> >> >>
>> >> >> 2019-11-04  Richard Sandiford  <richard.sandiford@arm.com>
>> >> >>
>> >> >> gcc/
>> >> >>         * tree-vectorizer.h (vec_info::mode_set): New typedef.
>> >> >>         (vec_info::used_vector_mode): New member variable.
>> >> >>         (vect_chooses_same_modes_p): Declare.
>> >> >>         * tree-vect-stmts.c (get_vectype_for_scalar_type): Record each
>> >> >>         chosen vector mode in vec_info::used_vector_mode.
>> >> >>         (vect_chooses_same_modes_p): New function.
>> >> >>         * tree-vect-loop.c (vect_analyze_loop): Use it to avoid trying
>> >> >>         the same vector statements multiple times.
>> >> >>         * tree-vect-slp.c (vect_slp_bb_region): Likewise.
>> >> >>
>> >> >> Index: gcc/tree-vectorizer.h
>> >> >> ===================================================================
>> >> >> --- gcc/tree-vectorizer.h       2019-11-05 10:48:11.246092351 +0000
>> >> >> +++ gcc/tree-vectorizer.h       2019-11-05 10:57:41.662071145 +0000
>> >> >> @@ -298,6 +298,7 @@ typedef std::pair<tree, tree> vec_object
>> >> >>  /* Vectorizer state common between loop and basic-block vectorization.  */
>> >> >>  class vec_info {
>> >> >>  public:
>> >> >> +  typedef hash_set<int_hash<machine_mode, E_VOIDmode, E_BLKmode> > mode_set;
>> >> >>    enum vec_kind { bb, loop };
>> >> >>
>> >> >>    vec_info (vec_kind, void *, vec_info_shared *);
>> >> >> @@ -335,6 +336,9 @@ typedef std::pair<tree, tree> vec_object
>> >> >>    /* Cost data used by the target cost model.  */
>> >> >>    void *target_cost_data;
>> >> >>
>> >> >> +  /* The set of vector modes used in the vectorized region.  */
>> >> >> +  mode_set used_vector_modes;
>> >> >> +
>> >> >>    /* The argument we should pass to related_vector_mode when looking up
>> >> >>       the vector mode for a scalar mode, or VOIDmode if we haven't yet
>> >> >>       made any decisions about which vector modes to use.  */
>> >> >> @@ -1615,6 +1619,7 @@ extern tree get_related_vectype_for_scal
>> >> >>  extern tree get_vectype_for_scalar_type (vec_info *, tree);
>> >> >>  extern tree get_mask_type_for_scalar_type (vec_info *, tree);
>> >> >>  extern tree get_same_sized_vectype (tree, tree);
>> >> >> +extern bool vect_chooses_same_modes_p (vec_info *, machine_mode);
>> >> >>  extern bool vect_get_loop_mask_type (loop_vec_info);
>> >> >>  extern bool vect_is_simple_use (tree, vec_info *, enum vect_def_type *,
>> >> >>                                 stmt_vec_info * = NULL, gimple ** = NULL);
>> >> >> Index: gcc/tree-vect-stmts.c
>> >> >> ===================================================================
>> >> >> --- gcc/tree-vect-stmts.c       2019-11-05 10:48:11.242092379 +0000
>> >> >> +++ gcc/tree-vect-stmts.c       2019-11-05 10:57:41.662071145 +0000
>> >> >> @@ -11235,6 +11235,10 @@ get_vectype_for_scalar_type (vec_info *v
>> >> >>                                                       scalar_type);
>> >> >>    if (vectype && vinfo->vector_mode == VOIDmode)
>> >> >>      vinfo->vector_mode = TYPE_MODE (vectype);
>> >> >> +
>> >> >> +  if (vectype)
>> >> >> +    vinfo->used_vector_modes.add (TYPE_MODE (vectype));
>> >> >> +
>> >> >
>> >> > Do we actually end up _using_ all types returned by this function?
>> >>
>> >> No, not all of them, so it's a bit crude.  E.g. some types might end up
>> >> not being relevant after pattern recognition, or after we've made a
>> >> final decision about which parts of an address calculation to include
>> >> in a gather or scatter op.  So we can still end up retrying the same
>> >> thing even after the patch.
>> >>
>> >> The problem is that we're trying to avoid pointless retries on failure
>> >> as well as success, so we could end up stopping at arbitrary points.
>> >> I wasn't sure where else to handle this.
>> >
>> > Yeah, I think this "iterating" is somewhat bogus (crude) now.
>>
>> I think it was crude even before the series though. :-)  Not sure the
>> series is making things worse.
>>
>> The problem is that there's a chicken-and-egg problem between how
>> we decide to vectorise and which vector subarchitecture and VF we use.
>> E.g. if we have:
>>
>>   unsigned char *x, *y;
>>   ...
>>   x[i] = (unsigned short) (x[i] + y[i] + 1) >> 1;
>>
>> do we build the SLP graph on the assumption that we need to use short
>> elements, or on the assumption that we can use IFN_AVG_CEIL?  This
>> affects the VF we get out: using IFN_AVG_CEIL gives double the VF
>> relative to doing unsigned short arithmetic.
>>
>> And we need to know which vector subarchitecture we're targetting when
>> making that decision: e.g. Advanced SIMD and SVE2 have IFN_AVG_CEIL,
>> but SVE doesn't.  On the other hand, SVE supports things that Advanced
>> SIMD doesn't.  It's similar story of course for the x86 vector subarchs.
>>
>> For one pattern like this, we could simply try both ways.
>> But that becomes untenable if there are multiple potential patterns.
>> Iterating over the vector subarchs gives us a sensible way of reducing
>> the search space by only applying patterns that the subarch supports.
>>
>> So...
>>
>> > What we'd like to collect is for all defs the vector types we could
>> > use and then vectorizable_ defines constraints between input and
>> > output vector types.  From that we'd arrive at a (possibly quite
>> > large) set of "SLP graphs with vector types" we'd choose from.  I
>> > believe we'll never want to truly explore the whole space but guess we
>> > want to greedily compute those "SLP graphs with vector types" starting
>> > from what (grouped) datarefs tells us is possible (which is kind of
>> > what we do now).
>>
>> ...I don't think we can/should use the same SLP graph to pick vector
>> types for all subarchs, since the ideal graph depends on the subarch.
>> I'm also not sure the vectorizable_* routines could say anything that
>> isn't obvious from the input and output scalar types.  Won't it still be
>> the case that within an SLP instance, all scalars of type T will become
>> the same vector(N) T?
>
> Not necessarily.  I can see the SLP graph containing "reductions"
> (like those complex patterns proposed).  But yes, at the moment
> there's a single group-size per SLP instance.  Now, for the SLP _graph_
> we may have one instance with 4 and one with group size of 8 both
> for example sharing the same grouped load.  It may make sense to
> vectorize the load with v8si and for the group-size 4 SLP instance
> have a "reduction operation" that selects the appropriate part of the
> loaded vector.
>
> Now, vectorizable_* for say a conversion from int to double
> may be able to vectorize for a v4si directly to v4df or to two times v2df.
> With the size restriction relaxed vectorizable_* could opt to choose
> smaller/larger vectors specifically and thus also have different vector types
> for the same scalar type in the SLP graph.  I do expect this to be
> profitable on x86 for some loops due to some asymmetries in the ISA
> (and extra cost of lane-crossing operations for say AVX where using SSE
> is cheaper for some ops even though you now have 2 or more instructions).

Making this dependent on vectorizable_* seems to require a natural
conversion point in the original scalar code.  In your shared data-ref
example, the SLP instance with group size 4 could use 2 v2sis or a
single v4si.  The fact that the data ref is shared with a group size
of 8 shouldn't necessarily affect that choice and e.g. force v4si over
v2si.  So when taking the graph as a whole, it seems like we should be
able to combine 2 v2sis into a single v4si or split a v4si into 2 v2sis
at any point where that's worthwhile, independently of the surrounding
operations.

The same goes for the conversions.  If the target only supports
v4si->v2df conversions, it should still be possible to combine 2 v2dfs
into a single v4df as a separate, independent step if there's a benefit
to operating on v4df.  Same idea in reverse if the target only supports
v4si->v4df and an SLP instance wants to operate on v2df.  It would be
good if this splitting and combining wasn't dependent on having a
conversion.

So it seems like the natural point for inserting group size adjustments
is at sharing boundaries between SLP instances, with each SLP instance
using consistent choices internally (i.e. with the scalar type
determining the vector type).  And we should be able to insert those
group size adjustments based only on the types involved.  Whether
vectorizable_conversion can do a particular group size adjustment on
the fly seems more like a costing/pattern-matching decision rather than
something that should restrict the choice of types.

Thanks,
Richard
Richard Biener Nov. 12, 2019, 9:24 a.m. UTC | #7
On Wed, Nov 6, 2019 at 1:47 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener <richard.guenther@gmail.com> writes:
> > On Wed, Nov 6, 2019 at 12:02 PM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Richard Biener <richard.guenther@gmail.com> writes:
> >> > On Wed, Nov 6, 2019 at 11:21 AM Richard Sandiford
> >> > <richard.sandiford@arm.com> wrote:
> >> >>
> >> >> Richard Biener <richard.guenther@gmail.com> writes:
> >> >> > On Tue, Nov 5, 2019 at 9:25 PM Richard Sandiford
> >> >> > <richard.sandiford@arm.com> wrote:
> >> >> >>
> >> >> >> Patch 12/n makes the AArch64 port add four entries to
> >> >> >> autovectorize_vector_modes.  Each entry describes a different
> >> >> >> vector mode assignment for vector code that mixes 8-bit, 16-bit,
> >> >> >> 32-bit and 64-bit elements.  But if (as usual) the vector code has
> >> >> >> fewer element sizes than that, we could end up trying the same
> >> >> >> combination of vector modes multiple times.  This patch adds a
> >> >> >> check to prevent that.
> >> >> >>
> >> >> >> As before: each patch tested individually on aarch64-linux-gnu and the
> >> >> >> series as a whole on x86_64-linux-gnu.
> >> >> >>
> >> >> >>
> >> >> >> 2019-11-04  Richard Sandiford  <richard.sandiford@arm.com>
> >> >> >>
> >> >> >> gcc/
> >> >> >>         * tree-vectorizer.h (vec_info::mode_set): New typedef.
> >> >> >>         (vec_info::used_vector_mode): New member variable.
> >> >> >>         (vect_chooses_same_modes_p): Declare.
> >> >> >>         * tree-vect-stmts.c (get_vectype_for_scalar_type): Record each
> >> >> >>         chosen vector mode in vec_info::used_vector_mode.
> >> >> >>         (vect_chooses_same_modes_p): New function.
> >> >> >>         * tree-vect-loop.c (vect_analyze_loop): Use it to avoid trying
> >> >> >>         the same vector statements multiple times.
> >> >> >>         * tree-vect-slp.c (vect_slp_bb_region): Likewise.
> >> >> >>
> >> >> >> Index: gcc/tree-vectorizer.h
> >> >> >> ===================================================================
> >> >> >> --- gcc/tree-vectorizer.h       2019-11-05 10:48:11.246092351 +0000
> >> >> >> +++ gcc/tree-vectorizer.h       2019-11-05 10:57:41.662071145 +0000
> >> >> >> @@ -298,6 +298,7 @@ typedef std::pair<tree, tree> vec_object
> >> >> >>  /* Vectorizer state common between loop and basic-block vectorization.  */
> >> >> >>  class vec_info {
> >> >> >>  public:
> >> >> >> +  typedef hash_set<int_hash<machine_mode, E_VOIDmode, E_BLKmode> > mode_set;
> >> >> >>    enum vec_kind { bb, loop };
> >> >> >>
> >> >> >>    vec_info (vec_kind, void *, vec_info_shared *);
> >> >> >> @@ -335,6 +336,9 @@ typedef std::pair<tree, tree> vec_object
> >> >> >>    /* Cost data used by the target cost model.  */
> >> >> >>    void *target_cost_data;
> >> >> >>
> >> >> >> +  /* The set of vector modes used in the vectorized region.  */
> >> >> >> +  mode_set used_vector_modes;
> >> >> >> +
> >> >> >>    /* The argument we should pass to related_vector_mode when looking up
> >> >> >>       the vector mode for a scalar mode, or VOIDmode if we haven't yet
> >> >> >>       made any decisions about which vector modes to use.  */
> >> >> >> @@ -1615,6 +1619,7 @@ extern tree get_related_vectype_for_scal
> >> >> >>  extern tree get_vectype_for_scalar_type (vec_info *, tree);
> >> >> >>  extern tree get_mask_type_for_scalar_type (vec_info *, tree);
> >> >> >>  extern tree get_same_sized_vectype (tree, tree);
> >> >> >> +extern bool vect_chooses_same_modes_p (vec_info *, machine_mode);
> >> >> >>  extern bool vect_get_loop_mask_type (loop_vec_info);
> >> >> >>  extern bool vect_is_simple_use (tree, vec_info *, enum vect_def_type *,
> >> >> >>                                 stmt_vec_info * = NULL, gimple ** = NULL);
> >> >> >> Index: gcc/tree-vect-stmts.c
> >> >> >> ===================================================================
> >> >> >> --- gcc/tree-vect-stmts.c       2019-11-05 10:48:11.242092379 +0000
> >> >> >> +++ gcc/tree-vect-stmts.c       2019-11-05 10:57:41.662071145 +0000
> >> >> >> @@ -11235,6 +11235,10 @@ get_vectype_for_scalar_type (vec_info *v
> >> >> >>                                                       scalar_type);
> >> >> >>    if (vectype && vinfo->vector_mode == VOIDmode)
> >> >> >>      vinfo->vector_mode = TYPE_MODE (vectype);
> >> >> >> +
> >> >> >> +  if (vectype)
> >> >> >> +    vinfo->used_vector_modes.add (TYPE_MODE (vectype));
> >> >> >> +
> >> >> >
> >> >> > Do we actually end up _using_ all types returned by this function?
> >> >>
> >> >> No, not all of them, so it's a bit crude.  E.g. some types might end up
> >> >> not being relevant after pattern recognition, or after we've made a
> >> >> final decision about which parts of an address calculation to include
> >> >> in a gather or scatter op.  So we can still end up retrying the same
> >> >> thing even after the patch.
> >> >>
> >> >> The problem is that we're trying to avoid pointless retries on failure
> >> >> as well as success, so we could end up stopping at arbitrary points.
> >> >> I wasn't sure where else to handle this.
> >> >
> >> > Yeah, I think this "iterating" is somewhat bogus (crude) now.
> >>
> >> I think it was crude even before the series though. :-)  Not sure the
> >> series is making things worse.
> >>
> >> The problem is that there's a chicken-and-egg problem between how
> >> we decide to vectorise and which vector subarchitecture and VF we use.
> >> E.g. if we have:
> >>
> >>   unsigned char *x, *y;
> >>   ...
> >>   x[i] = (unsigned short) (x[i] + y[i] + 1) >> 1;
> >>
> >> do we build the SLP graph on the assumption that we need to use short
> >> elements, or on the assumption that we can use IFN_AVG_CEIL?  This
> >> affects the VF we get out: using IFN_AVG_CEIL gives double the VF
> >> relative to doing unsigned short arithmetic.
> >>
> >> And we need to know which vector subarchitecture we're targetting when
> >> making that decision: e.g. Advanced SIMD and SVE2 have IFN_AVG_CEIL,
> >> but SVE doesn't.  On the other hand, SVE supports things that Advanced
> >> SIMD doesn't.  It's similar story of course for the x86 vector subarchs.
> >>
> >> For one pattern like this, we could simply try both ways.
> >> But that becomes untenable if there are multiple potential patterns.
> >> Iterating over the vector subarchs gives us a sensible way of reducing
> >> the search space by only applying patterns that the subarch supports.
> >>
> >> So...
> >>
> >> > What we'd like to collect is for all defs the vector types we could
> >> > use and then vectorizable_ defines constraints between input and
> >> > output vector types.  From that we'd arrive at a (possibly quite
> >> > large) set of "SLP graphs with vector types" we'd choose from.  I
> >> > believe we'll never want to truly explore the whole space but guess we
> >> > want to greedily compute those "SLP graphs with vector types" starting
> >> > from what (grouped) datarefs tells us is possible (which is kind of
> >> > what we do now).
> >>
> >> ...I don't think we can/should use the same SLP graph to pick vector
> >> types for all subarchs, since the ideal graph depends on the subarch.
> >> I'm also not sure the vectorizable_* routines could say anything that
> >> isn't obvious from the input and output scalar types.  Won't it still be
> >> the case that within an SLP instance, all scalars of type T will become
> >> the same vector(N) T?
> >
> > Not necessarily.  I can see the SLP graph containing "reductions"
> > (like those complex patterns proposed).  But yes, at the moment
> > there's a single group-size per SLP instance.  Now, for the SLP _graph_
> > we may have one instance with 4 and one with group size of 8 both
> > for example sharing the same grouped load.  It may make sense to
> > vectorize the load with v8si and for the group-size 4 SLP instance
> > have a "reduction operation" that selects the appropriate part of the
> > loaded vector.
> >
> > Now, vectorizable_* for say a conversion from int to double
> > may be able to vectorize for a v4si directly to v4df or to two times v2df.
> > With the size restriction relaxed vectorizable_* could opt to choose
> > smaller/larger vectors specifically and thus also have different vector types
> > for the same scalar type in the SLP graph.  I do expect this to be
> > profitable on x86 for some loops due to some asymmetries in the ISA
> > (and extra cost of lane-crossing operations for say AVX where using SSE
> > is cheaper for some ops even though you now have 2 or more instructions).
>
> Making this dependent on vectorizable_* seems to require a natural
> conversion point in the original scalar code.  In your shared data-ref
> example, the SLP instance with group size 4 could use 2 v2sis or a
> single v4si.  The fact that the data ref is shared with a group size
> of 8 shouldn't necessarily affect that choice and e.g. force v4si over
> v2si.  So when taking the graph as a whole, it seems like we should be
> able to combine 2 v2sis into a single v4si or split a v4si into 2 v2sis
> at any point where that's worthwhile, independently of the surrounding
> operations.
>
> The same goes for the conversions.  If the target only supports
> v4si->v2df conversions, it should still be possible to combine 2 v2dfs
> into a single v4df as a separate, independent step if there's a benefit
> to operating on v4df.  Same idea in reverse if the target only supports
> v4si->v4df and an SLP instance wants to operate on v2df.  It would be
> good if this splitting and combining wasn't dependent on having a
> conversion.
>
> So it seems like the natural point for inserting group size adjustments
> is at sharing boundaries between SLP instances, with each SLP instance
> using consistent choices internally (i.e. with the scalar type
> determining the vector type).  And we should be able to insert those
> group size adjustments based only on the types involved.  Whether
> vectorizable_conversion can do a particular group size adjustment on
> the fly seems more like a costing/pattern-matching decision rather than
> something that should restrict the choice of types.

Yes, this sounds correct.  So let's go with the patch.

Thanks,
Richard.

> Thanks,
> Richard
diff mbox series

Patch

Index: gcc/tree-vectorizer.h
===================================================================
--- gcc/tree-vectorizer.h	2019-11-05 10:48:11.246092351 +0000
+++ gcc/tree-vectorizer.h	2019-11-05 10:57:41.662071145 +0000
@@ -298,6 +298,7 @@  typedef std::pair<tree, tree> vec_object
 /* Vectorizer state common between loop and basic-block vectorization.  */
 class vec_info {
 public:
+  typedef hash_set<int_hash<machine_mode, E_VOIDmode, E_BLKmode> > mode_set;
   enum vec_kind { bb, loop };
 
   vec_info (vec_kind, void *, vec_info_shared *);
@@ -335,6 +336,9 @@  typedef std::pair<tree, tree> vec_object
   /* Cost data used by the target cost model.  */
   void *target_cost_data;
 
+  /* The set of vector modes used in the vectorized region.  */
+  mode_set used_vector_modes;
+
   /* The argument we should pass to related_vector_mode when looking up
      the vector mode for a scalar mode, or VOIDmode if we haven't yet
      made any decisions about which vector modes to use.  */
@@ -1615,6 +1619,7 @@  extern tree get_related_vectype_for_scal
 extern tree get_vectype_for_scalar_type (vec_info *, tree);
 extern tree get_mask_type_for_scalar_type (vec_info *, tree);
 extern tree get_same_sized_vectype (tree, tree);
+extern bool vect_chooses_same_modes_p (vec_info *, machine_mode);
 extern bool vect_get_loop_mask_type (loop_vec_info);
 extern bool vect_is_simple_use (tree, vec_info *, enum vect_def_type *,
 				stmt_vec_info * = NULL, gimple ** = NULL);
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	2019-11-05 10:48:11.242092379 +0000
+++ gcc/tree-vect-stmts.c	2019-11-05 10:57:41.662071145 +0000
@@ -11235,6 +11235,10 @@  get_vectype_for_scalar_type (vec_info *v
 						      scalar_type);
   if (vectype && vinfo->vector_mode == VOIDmode)
     vinfo->vector_mode = TYPE_MODE (vectype);
+
+  if (vectype)
+    vinfo->used_vector_modes.add (TYPE_MODE (vectype));
+
   return vectype;
 }
 
@@ -11274,6 +11278,20 @@  get_same_sized_vectype (tree scalar_type
 					      scalar_type, nunits);
 }
 
+/* Return true if replacing LOOP_VINFO->vector_mode with VECTOR_MODE
+   would not change the chosen vector modes.  */
+
+bool
+vect_chooses_same_modes_p (vec_info *vinfo, machine_mode vector_mode)
+{
+  for (vec_info::mode_set::iterator i = vinfo->used_vector_modes.begin ();
+       i != vinfo->used_vector_modes.end (); ++i)
+    if (!VECTOR_MODE_P (*i)
+	|| related_vector_mode (vector_mode, GET_MODE_INNER (*i), 0) != *i)
+      return false;
+  return true;
+}
+
 /* Function vect_is_simple_use.
 
    Input:
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c	2019-11-05 10:48:11.238092407 +0000
+++ gcc/tree-vect-loop.c	2019-11-05 10:57:41.658071173 +0000
@@ -2430,6 +2430,19 @@  vect_analyze_loop (class loop *loop, vec
 	}
 
       loop->aux = NULL;
+
+      if (!fatal)
+	while (mode_i < vector_modes.length ()
+	       && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
+	  {
+	    if (dump_enabled_p ())
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "***** The result for vector mode %s would"
+			       " be the same\n",
+			       GET_MODE_NAME (vector_modes[mode_i]));
+	    mode_i += 1;
+	  }
+
       if (res)
 	{
 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
Index: gcc/tree-vect-slp.c
===================================================================
--- gcc/tree-vect-slp.c	2019-11-05 10:48:11.242092379 +0000
+++ gcc/tree-vect-slp.c	2019-11-05 10:57:41.662071145 +0000
@@ -3238,6 +3238,18 @@  vect_slp_bb_region (gimple_stmt_iterator
       if (mode_i == 0)
 	autodetected_vector_mode = bb_vinfo->vector_mode;
 
+      if (!fatal)
+	while (mode_i < vector_modes.length ()
+	       && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
+	  {
+	    if (dump_enabled_p ())
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "***** The result for vector mode %s would"
+			       " be the same\n",
+			       GET_MODE_NAME (vector_modes[mode_i]));
+	    mode_i += 1;
+	  }
+
       delete bb_vinfo;
 
       if (mode_i < vector_modes.length ()