diff mbox series

[4/8] vect: Determine input vectype for multiple lane-reducing

Message ID LV2PR01MB78393B9BE03402556710D12CF7CC2@LV2PR01MB7839.prod.exchangelabs.com
State New
Headers show
Series [PATH,1/8] vect: Add a function to check lane-reducing stmt | expand

Commit Message

Feng Xue OS June 16, 2024, 7:25 a.m. UTC
The input vectype of reduction PHI statement must be determined before
vect cost computation for the reduction. Since lance-reducing operation has
different input vectype from normal one, so we need to traverse all reduction
statements to find out the input vectype with the least lanes, and set that to
the PHI statement.

Thanks,
Feng

---
gcc/
	* tree-vect-loop.cc (vectorizable_reduction): Determine input vectype
	during traversal of reduction statements.
---
 gcc/tree-vect-loop.cc | 72 +++++++++++++++++++++++++++++--------------
 1 file changed, 49 insertions(+), 23 deletions(-)

Comments

Richard Biener June 19, 2024, 1:01 p.m. UTC | #1
On Sun, Jun 16, 2024 at 9:25 AM Feng Xue OS <fxue@os.amperecomputing.com> wrote:
>
> The input vectype of reduction PHI statement must be determined before
> vect cost computation for the reduction. Since lance-reducing operation has
> different input vectype from normal one, so we need to traverse all reduction
> statements to find out the input vectype with the least lanes, and set that to
> the PHI statement.
>
> Thanks,
> Feng
>
> ---
> gcc/
>         * tree-vect-loop.cc (vectorizable_reduction): Determine input vectype
>         during traversal of reduction statements.
> ---
>  gcc/tree-vect-loop.cc | 72 +++++++++++++++++++++++++++++--------------
>  1 file changed, 49 insertions(+), 23 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 0f7b125e72d..39aa5cb1197 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7643,7 +7643,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>      {
>        stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
>        stmt_vec_info vdef = vect_stmt_to_vectorize (def);
> -      if (STMT_VINFO_REDUC_IDX (vdef) == -1)
> +      int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
> +
> +      if (reduc_idx == -1)
>         {
>           if (dump_enabled_p ())
>             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -7686,10 +7688,50 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>               return false;
>             }
>         }
> -      else if (!stmt_info)
> -       /* First non-conversion stmt.  */
> -       stmt_info = vdef;
> -      reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
> +      else
> +       {
> +         /* First non-conversion stmt.  */
> +         if (!stmt_info)
> +           stmt_info = vdef;
> +
> +         if (lane_reducing_op_p (op.code))
> +           {
> +             unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 0;
> +             tree op_type = TREE_TYPE (op.ops[0]);
> +             tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo,
> +                                                                op_type,
> +                                                                group_size);

I think doing it this way does not adhere to the vector type size constraint
with loop vectorization.  You should use vect_is_simple_use like the
original code did as the actual vector definition determines the vector type
used.

You are always using op.ops[0] here - I think that works because
reduc_idx is the last operand of all lane-reducing ops.  But then
we should assert reduc_idx != 0 here and add a comment.

> +
> +             /* The last operand of lane-reducing operation is for
> +                reduction.  */
> +             gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
> +
> +             /* For lane-reducing operation vectorizable analysis needs the
> +                reduction PHI information */
> +             STMT_VINFO_REDUC_DEF (def) = phi_info;
> +
> +             if (!new_vectype_in)
> +               return false;
> +
> +             /* Each lane-reducing operation has its own input vectype, while
> +                reduction PHI will record the input vectype with the least
> +                lanes.  */
> +             STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
> +
> +             /* To accommodate lane-reducing operations of mixed input
> +                vectypes, choose input vectype with the least lanes for the
> +                reduction PHI statement, which would result in the most
> +                ncopies for vectorized reduction results.  */
> +             if (!vectype_in
> +                 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> +                      < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type))))
> +               vectype_in = new_vectype_in;

I know this is a fragile area but I always wonder since the accumulating operand
is the largest (all lane-reducing ops are widening), and that will be
equal to the
type of the PHI node, how this condition can be ever true.

ncopies is determined by the VF, so the comment is at least misleading.

> +           }
> +         else
> +           vectype_in = STMT_VINFO_VECTYPE (phi_info);

Please initialize vectype_in from phi_info before the loop (that
should never be NULL).

I'll note that with your patch it seems we'd initialize vectype_in to
the biggest
non-accumulation vector type involved in lane-reducing ops but the accumulating
type might still be larger.   Why, when we have multiple lane-reducing
ops, would
we chose the largest input here?  I see we eventually do

  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype_in);

but then IIRC we always force a single cycle def for lane-reducing ops(?).
In particular for vect_transform_reduction and SLP we rely on
SLP_TREE_NUMBER_OF_VEC_STMTS while non-SLP uses
STMT_VINFO_REDUC_VECTYPE_IN.

So I wonder what breaks when we set vectype_in = vector type of PHI?

Richard.

> +       }
> +
> +      reduc_def = op.ops[reduc_idx];
>        reduc_chain_length++;
>        if (!stmt_info && slp_node)
>         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
> @@ -7747,6 +7789,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>
>    tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
>    STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
> +  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> +
>    gimple_match_op op;
>    if (!gimple_extract_op (stmt_info->stmt, &op))
>      gcc_unreachable ();
> @@ -7831,16 +7875,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>           = get_vectype_for_scalar_type (loop_vinfo,
>                                          TREE_TYPE (op.ops[i]), slp_op[i]);
>
> -      /* To properly compute ncopies we are interested in the widest
> -        non-reduction input type in case we're looking at a widening
> -        accumulation that we later handle in vect_transform_reduction.  */
> -      if (lane_reducing
> -         && vectype_op[i]
> -         && (!vectype_in
> -             || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> -                 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
> -       vectype_in = vectype_op[i];
> -
>        /* Record how the non-reduction-def value of COND_EXPR is defined.
>          ???  For a chain of multiple CONDs we'd have to match them up all.  */
>        if (op.code == COND_EXPR && reduc_chain_length == 1)
> @@ -7859,14 +7893,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>             }
>         }
>      }
> -  if (!vectype_in)
> -    vectype_in = STMT_VINFO_VECTYPE (phi_info);
> -  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> -
> -  /* Each lane-reducing operation has its own input vectype, while reduction
> -     PHI records the input vectype with least lanes.  */
> -  if (lane_reducing)
> -    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
>
>    enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
>    STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
> --
> 2.17.1
Feng Xue OS June 20, 2024, 5:47 a.m. UTC | #2
>> +         if (lane_reducing_op_p (op.code))
>> +           {
>> +             unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 0;
>> +             tree op_type = TREE_TYPE (op.ops[0]);
>> +             tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo,
>> +                                                                op_type,
>> +                                                                group_size);
> 
> I think doing it this way does not adhere to the vector type size constraint
> with loop vectorization.  You should use vect_is_simple_use like the
> original code did as the actual vector definition determines the vector type
> used.

OK, though this might be wordy. 

Actually, STMT_VINFO_REDUC_VECTYPE_IN is logically equivalent to nunits_vectype
that is determined in vect_determine_vf_for_stmt_1(). So how about setting the type
in this function?

> 
> You are always using op.ops[0] here - I think that works because
> reduc_idx is the last operand of all lane-reducing ops.  But then
> we should assert reduc_idx != 0 here and add a comment.

Already added in the following assertion.

>> +
>> +             /* The last operand of lane-reducing operation is for
>> +                reduction.  */
>> +             gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);

                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>> +
>> +             /* For lane-reducing operation vectorizable analysis needs the
>> +                reduction PHI information */
>> +             STMT_VINFO_REDUC_DEF (def) = phi_info;
>> +
>> +             if (!new_vectype_in)
>> +               return false;
>> +
>> +             /* Each lane-reducing operation has its own input vectype, while
>> +                reduction PHI will record the input vectype with the least
>> +                lanes.  */
>> +             STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
>> +
>> +             /* To accommodate lane-reducing operations of mixed input
>> +                vectypes, choose input vectype with the least lanes for the
>> +                reduction PHI statement, which would result in the most
>> +                ncopies for vectorized reduction results.  */
>> +             if (!vectype_in
>> +                 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
>> +                      < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type))))
>> +               vectype_in = new_vectype_in;
> 
> I know this is a fragile area but I always wonder since the accumulating operand
> is the largest (all lane-reducing ops are widening), and that will be
> equal to the
> type of the PHI node, how this condition can be ever true.

In the original code, accumulating operand is skipped! While it is correctly, we
should not count the operand, this is why we call operation lane-reducing.

> 
> ncopies is determined by the VF, so the comment is at least misleading.
> 
>> +           }
>> +         else
>> +           vectype_in = STMT_VINFO_VECTYPE (phi_info);
> 
> Please initialize vectype_in from phi_info before the loop (that
> should never be NULL).
> 

May not, as the below explanation. 

> I'll note that with your patch it seems we'd initialize vectype_in to
> the biggest
> non-accumulation vector type involved in lane-reducing ops but the accumulating
> type might still be larger.   Why, when we have multiple lane-reducing
> ops, would
> we chose the largest input here?  I see we eventually do
> 
>   if (slp_node)
>     ncopies = 1;
>   else
>     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
> 
> but then IIRC we always force a single cycle def for lane-reducing ops(?).


> In particular for vect_transform_reduction and SLP we rely on
> SLP_TREE_NUMBER_OF_VEC_STMTS while non-SLP uses
> STMT_VINFO_REDUC_VECTYPE_IN.
> 
> So I wonder what breaks when we set vectype_in = vector type of PHI?
> 

Yes. It is right, nothing is broken. Suppose that a loop contains three dot_prods,
two are <16 * char>, one is <8 * short>, and choose <4 * int> as vectype_in:

With the patch #7, we get:

      vector<4> int sum_v0 = { 0, 0, 0, 0 };
      vector<4> int sum_v1 = { 0, 0, 0, 0 };
      vector<4> int sum_v2 = { 0, 0, 0, 0 };
      vector<4> int sum_v3 = { 0, 0, 0, 0 };

      loop () {
         sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0);
 
         sum_v0 = dot_prod<16 * char>(char_b0, char_b1, sum_v0);

         sum_v0 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v0);
         sum_v1 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v1);

         sum_v2 = sum_v2;
         sum_v3 = sum_v3;
      }

The def/use cycles (sum_v2 and sum_v3> would be optimized away finally.
Then this gets same result as setting vectype_in to <8 * short>.

With the patch #8, we get:

      vector<4> int sum_v0 = { 0, 0, 0, 0 };
      vector<4> int sum_v1 = { 0, 0, 0, 0 };
      vector<4> int sum_v2 = { 0, 0, 0, 0 };
      vector<4> int sum_v3 = { 0, 0, 0, 0 };

      loop () {
         sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0); 

         sum_v1 = dot_prod<16 * char>(char_b0, char_b1, sum_v1);

         sum_v2 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v2);
         sum_v3 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v3);
      }

All dot_prods are assigned to separate def/use cycles, and no
dependency. More def/use cycles, higher instruction parallelism,
but there need extra cost in epilogue to combine the result.

So we consider a somewhat compact def/use layout similar to
single-defuse-cycle, in which two <16 * char> dot_prods are independent,
and cycle 2 and 3 are not used, and this is better than the 1st scheme.

      vector<4> int sum_v0 = { 0, 0, 0, 0 };
      vector<4> int sum_v1 = { 0, 0, 0, 0 };

      loop () {
         sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0); 

         sum_v1 = dot_prod<16 * char>(char_b0, char_b1, sum_v1);

         sum_v0 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v0);
         sum_v1 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v1);
      }

For this purpose, we need to track the vectype_in that results in
the most ncopies, for this case, the type is <8 * short>.

BTW: would you please also take a look at patch #7 and #8?

Thanks,
Feng
Richard Biener June 20, 2024, 11:52 a.m. UTC | #3
On Thu, Jun 20, 2024 at 7:47 AM Feng Xue OS <fxue@os.amperecomputing.com> wrote:
>
> >> +         if (lane_reducing_op_p (op.code))
> >> +           {
> >> +             unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 0;
> >> +             tree op_type = TREE_TYPE (op.ops[0]);
> >> +             tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo,
> >> +                                                                op_type,
> >> +                                                                group_size);
> >
> > I think doing it this way does not adhere to the vector type size constraint
> > with loop vectorization.  You should use vect_is_simple_use like the
> > original code did as the actual vector definition determines the vector type
> > used.
>
> OK, though this might be wordy.
>
> Actually, STMT_VINFO_REDUC_VECTYPE_IN is logically equivalent to nunits_vectype
> that is determined in vect_determine_vf_for_stmt_1(). So how about setting the type
> in this function?

But why would we need it here?  Isn't the info indirectly available through
the loops vectorization factor?

> >
> > You are always using op.ops[0] here - I think that works because
> > reduc_idx is the last operand of all lane-reducing ops.  But then
> > we should assert reduc_idx != 0 here and add a comment.
>
> Already added in the following assertion.
>
> >> +
> >> +             /* The last operand of lane-reducing operation is for
> >> +                reduction.  */
> >> +             gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
>
>                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> >> +
> >> +             /* For lane-reducing operation vectorizable analysis needs the
> >> +                reduction PHI information */
> >> +             STMT_VINFO_REDUC_DEF (def) = phi_info;
> >> +
> >> +             if (!new_vectype_in)
> >> +               return false;
> >> +
> >> +             /* Each lane-reducing operation has its own input vectype, while
> >> +                reduction PHI will record the input vectype with the least
> >> +                lanes.  */
> >> +             STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
> >> +
> >> +             /* To accommodate lane-reducing operations of mixed input
> >> +                vectypes, choose input vectype with the least lanes for the
> >> +                reduction PHI statement, which would result in the most
> >> +                ncopies for vectorized reduction results.  */
> >> +             if (!vectype_in
> >> +                 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> >> +                      < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type))))
> >> +               vectype_in = new_vectype_in;
> >
> > I know this is a fragile area but I always wonder since the accumulating operand
> > is the largest (all lane-reducing ops are widening), and that will be
> > equal to the
> > type of the PHI node, how this condition can be ever true.
>
> In the original code, accumulating operand is skipped! While it is correctly, we
> should not count the operand, this is why we call operation lane-reducing.
>
> >
> > ncopies is determined by the VF, so the comment is at least misleading.
> >
> >> +           }
> >> +         else
> >> +           vectype_in = STMT_VINFO_VECTYPE (phi_info);
> >
> > Please initialize vectype_in from phi_info before the loop (that
> > should never be NULL).
> >
>
> May not, as the below explanation.
>
> > I'll note that with your patch it seems we'd initialize vectype_in to
> > the biggest
> > non-accumulation vector type involved in lane-reducing ops but the accumulating
> > type might still be larger.   Why, when we have multiple lane-reducing
> > ops, would
> > we chose the largest input here?  I see we eventually do
> >
> >   if (slp_node)
> >     ncopies = 1;
> >   else
> >     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
> >
> > but then IIRC we always force a single cycle def for lane-reducing ops(?).
>
>
> > In particular for vect_transform_reduction and SLP we rely on
> > SLP_TREE_NUMBER_OF_VEC_STMTS while non-SLP uses
> > STMT_VINFO_REDUC_VECTYPE_IN.
> >
> > So I wonder what breaks when we set vectype_in = vector type of PHI?
> >
>
> Yes. It is right, nothing is broken. Suppose that a loop contains three dot_prods,
> two are <16 * char>, one is <8 * short>, and choose <4 * int> as vectype_in:
>
> With the patch #7, we get:
>
>       vector<4> int sum_v0 = { 0, 0, 0, 0 };
>       vector<4> int sum_v1 = { 0, 0, 0, 0 };
>       vector<4> int sum_v2 = { 0, 0, 0, 0 };
>       vector<4> int sum_v3 = { 0, 0, 0, 0 };
>
>       loop () {
>          sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0);
>
>          sum_v0 = dot_prod<16 * char>(char_b0, char_b1, sum_v0);
>
>          sum_v0 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v0);
>          sum_v1 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v1);
>
>          sum_v2 = sum_v2;
>          sum_v3 = sum_v3;
>       }
>
> The def/use cycles (sum_v2 and sum_v3> would be optimized away finally.
> Then this gets same result as setting vectype_in to <8 * short>.
>
> With the patch #8, we get:
>
>       vector<4> int sum_v0 = { 0, 0, 0, 0 };
>       vector<4> int sum_v1 = { 0, 0, 0, 0 };
>       vector<4> int sum_v2 = { 0, 0, 0, 0 };
>       vector<4> int sum_v3 = { 0, 0, 0, 0 };
>
>       loop () {
>          sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0);
>
>          sum_v1 = dot_prod<16 * char>(char_b0, char_b1, sum_v1);
>
>          sum_v2 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v2);
>          sum_v3 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v3);
>       }
>
> All dot_prods are assigned to separate def/use cycles, and no
> dependency. More def/use cycles, higher instruction parallelism,
> but there need extra cost in epilogue to combine the result.
>
> So we consider a somewhat compact def/use layout similar to
> single-defuse-cycle, in which two <16 * char> dot_prods are independent,
> and cycle 2 and 3 are not used, and this is better than the 1st scheme.
>
>       vector<4> int sum_v0 = { 0, 0, 0, 0 };
>       vector<4> int sum_v1 = { 0, 0, 0, 0 };
>
>       loop () {
>          sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0);
>
>          sum_v1 = dot_prod<16 * char>(char_b0, char_b1, sum_v1);
>
>          sum_v0 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v0);
>          sum_v1 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v1);
>       }
>
> For this purpose, we need to track the vectype_in that results in
> the most ncopies, for this case, the type is <8 * short>.

So the VF determines the number of inputs to the lane-reducing ops
and by that way also the number of lane-reducing stmts, individually
for each lane-reducing op.  By the soft constraint of having same sized
input vectors you'll probably never arrive at the situation that, for the
above, you end up with one dot_prod<16 * char> but two dot_prod <8 * short>?

That said, I still don't exactly see how we need the input vector type
rather than the accumulator vector type plus the VF to determine everything.
I know it's pre-existing, but now you're extending it to cover multiple possibly
"different" (to what extent "different"?  is the only requirement that
the accumulator
type is the same?) lane-reducing ops.

> BTW: would you please also take a look at patch #7 and #8?

Sure, I thought they were logically dependent on this one (and I didn't get to
them yesterday).

Richard.

> Thanks,
> Feng
>
> ________________________________________
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Wednesday, June 19, 2024 9:01 PM
> To: Feng Xue OS
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH 4/8] vect: Determine input vectype for multiple lane-reducing
>
> On Sun, Jun 16, 2024 at 9:25 AM Feng Xue OS <fxue@os.amperecomputing.com> wrote:
> >
> > The input vectype of reduction PHI statement must be determined before
> > vect cost computation for the reduction. Since lance-reducing operation has
> > different input vectype from normal one, so we need to traverse all reduction
> > statements to find out the input vectype with the least lanes, and set that to
> > the PHI statement.
> >
> > Thanks,
> > Feng
> >
> > ---
> > gcc/
> >         * tree-vect-loop.cc (vectorizable_reduction): Determine input vectype
> >         during traversal of reduction statements.
> > ---
> >  gcc/tree-vect-loop.cc | 72 +++++++++++++++++++++++++++++--------------
> >  1 file changed, 49 insertions(+), 23 deletions(-)
> >
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 0f7b125e72d..39aa5cb1197 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -7643,7 +7643,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >      {
> >        stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
> >        stmt_vec_info vdef = vect_stmt_to_vectorize (def);
> > -      if (STMT_VINFO_REDUC_IDX (vdef) == -1)
> > +      int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
> > +
> > +      if (reduc_idx == -1)
> >         {
> >           if (dump_enabled_p ())
> >             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > @@ -7686,10 +7688,50 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >               return false;
> >             }
> >         }
> > -      else if (!stmt_info)
> > -       /* First non-conversion stmt.  */
> > -       stmt_info = vdef;
> > -      reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
> > +      else
> > +       {
> > +         /* First non-conversion stmt.  */
> > +         if (!stmt_info)
> > +           stmt_info = vdef;
> > +
> > +         if (lane_reducing_op_p (op.code))
> > +           {
> > +             unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 0;
> > +             tree op_type = TREE_TYPE (op.ops[0]);
> > +             tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo,
> > +                                                                op_type,
> > +                                                                group_size);
>
> I think doing it this way does not adhere to the vector type size constraint
> with loop vectorization.  You should use vect_is_simple_use like the
> original code did as the actual vector definition determines the vector type
> used.
>
> You are always using op.ops[0] here - I think that works because
> reduc_idx is the last operand of all lane-reducing ops.  But then
> we should assert reduc_idx != 0 here and add a comment.
>
> > +
> > +             /* The last operand of lane-reducing operation is for
> > +                reduction.  */
> > +             gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
> > +
> > +             /* For lane-reducing operation vectorizable analysis needs the
> > +                reduction PHI information */
> > +             STMT_VINFO_REDUC_DEF (def) = phi_info;
> > +
> > +             if (!new_vectype_in)
> > +               return false;
> > +
> > +             /* Each lane-reducing operation has its own input vectype, while
> > +                reduction PHI will record the input vectype with the least
> > +                lanes.  */
> > +             STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
> > +
> > +             /* To accommodate lane-reducing operations of mixed input
> > +                vectypes, choose input vectype with the least lanes for the
> > +                reduction PHI statement, which would result in the most
> > +                ncopies for vectorized reduction results.  */
> > +             if (!vectype_in
> > +                 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> > +                      < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type))))
> > +               vectype_in = new_vectype_in;
>
> I know this is a fragile area but I always wonder since the accumulating operand
> is the largest (all lane-reducing ops are widening), and that will be
> equal to the
> type of the PHI node, how this condition can be ever true.
>
> ncopies is determined by the VF, so the comment is at least misleading.
>
> > +           }
> > +         else
> > +           vectype_in = STMT_VINFO_VECTYPE (phi_info);
>
> Please initialize vectype_in from phi_info before the loop (that
> should never be NULL).
>
> I'll note that with your patch it seems we'd initialize vectype_in to
> the biggest
> non-accumulation vector type involved in lane-reducing ops but the accumulating
> type might still be larger.   Why, when we have multiple lane-reducing
> ops, would
> we chose the largest input here?  I see we eventually do
>
>   if (slp_node)
>     ncopies = 1;
>   else
>     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
>
> but then IIRC we always force a single cycle def for lane-reducing ops(?).
> In particular for vect_transform_reduction and SLP we rely on
> SLP_TREE_NUMBER_OF_VEC_STMTS while non-SLP uses
> STMT_VINFO_REDUC_VECTYPE_IN.
>
> So I wonder what breaks when we set vectype_in = vector type of PHI?
>
> Richard.
>
> > +       }
> > +
> > +      reduc_def = op.ops[reduc_idx];
> >        reduc_chain_length++;
> >        if (!stmt_info && slp_node)
> >         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
> > @@ -7747,6 +7789,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >
> >    tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
> >    STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
> > +  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> > +
> >    gimple_match_op op;
> >    if (!gimple_extract_op (stmt_info->stmt, &op))
> >      gcc_unreachable ();
> > @@ -7831,16 +7875,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >           = get_vectype_for_scalar_type (loop_vinfo,
> >                                          TREE_TYPE (op.ops[i]), slp_op[i]);
> >
> > -      /* To properly compute ncopies we are interested in the widest
> > -        non-reduction input type in case we're looking at a widening
> > -        accumulation that we later handle in vect_transform_reduction.  */
> > -      if (lane_reducing
> > -         && vectype_op[i]
> > -         && (!vectype_in
> > -             || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> > -                 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
> > -       vectype_in = vectype_op[i];
> > -
> >        /* Record how the non-reduction-def value of COND_EXPR is defined.
> >          ???  For a chain of multiple CONDs we'd have to match them up all.  */
> >        if (op.code == COND_EXPR && reduc_chain_length == 1)
> > @@ -7859,14 +7893,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >             }
> >         }
> >      }
> > -  if (!vectype_in)
> > -    vectype_in = STMT_VINFO_VECTYPE (phi_info);
> > -  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> > -
> > -  /* Each lane-reducing operation has its own input vectype, while reduction
> > -     PHI records the input vectype with least lanes.  */
> > -  if (lane_reducing)
> > -    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
> >
> >    enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
> >    STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
> > --
> > 2.17.1
diff mbox series

Patch

From f9aa029eef44b65bd6b54f9cb236a4be71f8ad52 Mon Sep 17 00:00:00 2001
From: Feng Xue <fxue@os.amperecomputing.com>
Date: Sun, 16 Jun 2024 13:00:32 +0800
Subject: [PATCH 4/8] vect: Determine input vectype for multiple lane-reducing
 operations

The input vectype of reduction PHI statement must be determined before
vect cost computation for the reduction. Since lance-reducing operation has
different input vectype from normal one, so we need to traverse all reduction
statements to find out the input vectype with the least lanes, and set that to
the PHI statement.

2024-06-16 Feng Xue <fxue@os.amperecomputing.com>

gcc/
	* tree-vect-loop.cc (vectorizable_reduction): Determine input vectype
	during traversal of reduction statements.
---
 gcc/tree-vect-loop.cc | 72 +++++++++++++++++++++++++++++--------------
 1 file changed, 49 insertions(+), 23 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 0f7b125e72d..39aa5cb1197 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -7643,7 +7643,9 @@  vectorizable_reduction (loop_vec_info loop_vinfo,
     {
       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
-      if (STMT_VINFO_REDUC_IDX (vdef) == -1)
+      int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
+
+      if (reduc_idx == -1)
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -7686,10 +7688,50 @@  vectorizable_reduction (loop_vec_info loop_vinfo,
 	      return false;
 	    }
 	}
-      else if (!stmt_info)
-	/* First non-conversion stmt.  */
-	stmt_info = vdef;
-      reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
+      else
+	{
+	  /* First non-conversion stmt.  */
+	  if (!stmt_info)
+	    stmt_info = vdef;
+
+	  if (lane_reducing_op_p (op.code))
+	    {
+	      unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 0;
+	      tree op_type = TREE_TYPE (op.ops[0]);
+	      tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo,
+								 op_type,
+								 group_size);
+
+	      /* The last operand of lane-reducing operation is for
+		 reduction.  */
+	      gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
+
+	      /* For lane-reducing operation vectorizable analysis needs the
+		 reduction PHI information */
+	      STMT_VINFO_REDUC_DEF (def) = phi_info;
+
+	      if (!new_vectype_in)
+		return false;
+
+	      /* Each lane-reducing operation has its own input vectype, while
+		 reduction PHI will record the input vectype with the least
+		 lanes.  */
+	      STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
+
+	      /* To accommodate lane-reducing operations of mixed input
+		 vectypes, choose input vectype with the least lanes for the
+		 reduction PHI statement, which would result in the most
+		 ncopies for vectorized reduction results.  */
+	      if (!vectype_in
+		  || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+		       < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type))))
+		vectype_in = new_vectype_in;
+	    }
+	  else
+	    vectype_in = STMT_VINFO_VECTYPE (phi_info);
+	}
+
+      reduc_def = op.ops[reduc_idx];
       reduc_chain_length++;
       if (!stmt_info && slp_node)
 	slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
@@ -7747,6 +7789,8 @@  vectorizable_reduction (loop_vec_info loop_vinfo,
 
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
+  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
+
   gimple_match_op op;
   if (!gimple_extract_op (stmt_info->stmt, &op))
     gcc_unreachable ();
@@ -7831,16 +7875,6 @@  vectorizable_reduction (loop_vec_info loop_vinfo,
 	  = get_vectype_for_scalar_type (loop_vinfo,
 					 TREE_TYPE (op.ops[i]), slp_op[i]);
 
-      /* To properly compute ncopies we are interested in the widest
-	 non-reduction input type in case we're looking at a widening
-	 accumulation that we later handle in vect_transform_reduction.  */
-      if (lane_reducing
-	  && vectype_op[i]
-	  && (!vectype_in
-	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
-		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
-	vectype_in = vectype_op[i];
-
       /* Record how the non-reduction-def value of COND_EXPR is defined.
 	 ???  For a chain of multiple CONDs we'd have to match them up all.  */
       if (op.code == COND_EXPR && reduc_chain_length == 1)
@@ -7859,14 +7893,6 @@  vectorizable_reduction (loop_vec_info loop_vinfo,
 	    }
 	}
     }
-  if (!vectype_in)
-    vectype_in = STMT_VINFO_VECTYPE (phi_info);
-  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
-
-  /* Each lane-reducing operation has its own input vectype, while reduction
-     PHI records the input vectype with least lanes.  */
-  if (lane_reducing)
-    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
 
   enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
   STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
-- 
2.17.1