
[aarch64,vect] Support V8QI->V8HI WIDEN_ patterns

Message ID AM5PR0802MB2500DB29DCD6F6DC4D4C150FF58E9@AM5PR0802MB2500.eurprd08.prod.outlook.com
State New
Series [aarch64,vect] Support V8QI->V8HI WIDEN_ patterns

Commit Message

Joel Hutton Feb. 9, 2021, 4:40 p.m. UTC
Hi Richards,

This patch adds support for the V8QI->V8HI case of the widening vect patterns, as discussed, to address PR98772.

Bootstrapped and regression tested on aarch64.


[aarch64][vect] Support V8QI->V8HI WIDEN_ patterns

In the case where 8 out of every 16 elements are widened using a
widening pattern and the next 8 are skipped, the patterns are not
recognized. This is because they are normally used in a pair, such as
VEC_WIDEN_MINUS_HI/LO, to achieve a v16qi->v16hi conversion for example.
This patch adds support for V8QI->V8HI patterns.

gcc/ChangeLog:
        PR tree-optimization/98772
        * optabs-tree.c (supportable_convert_operation): Add case for
        V8QI->V8HI.
        * tree-vect-stmts.c (vect_create_vectorized_promotion_stmts): New
        function to generate promotion stmts for V8QI->V8HI.
        (vectorizable_conversion): Add case for V8QI->V8HI.

gcc/testsuite/ChangeLog:
        PR tree-optimization/98772
        * gcc.target/aarch64/pr98772.c: New test.
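
For illustration, the kind of loop this lets the vectorizer handle
(distilled from the new test, gcc.target/aarch64/pr98772.c):

#include <stdint.h>

/* The 4x4 block never fills a full v16qi input vector, so the usual
   v16qi->v16hi hi/lo pair does not apply; with this patch the body
   vectorizes via a single V8QI->V8HI widening operation instead.  */
void
wplus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2)
{
  for (int y = 0; y < 4; y++)
    {
      for (int x = 0; x < 4; x++)
        d[x + y * 4] = pix1[x] + pix2[x];
      pix1 += 16;
      pix2 += 16;
    }
}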

Comments

Richard Sandiford Feb. 10, 2021, 9:50 a.m. UTC | #1
Joel Hutton <Joel.Hutton@arm.com> writes:
> Hi Richards,
>
> This patch adds support for the V8QI->V8HI case of the widening vect patterns, as discussed, to address PR98772.

Thanks, the approach looks good to me.  Mostly just minor comments below.

> Bootstrapped and regression tested on aarch64.
>
>
> [aarch64][vect] Support V8QI->V8HI WIDEN_ patterns
>
> In the case where 8 out of every 16 elements are widened using a
> widening pattern and the next 8 are skipped, the patterns are not
> recognized. This is because they are normally used in a pair, such as
> VEC_WIDEN_MINUS_HI/LO, to achieve a v16qi->v16hi conversion for example.
> This patch adds support for V8QI->V8HI patterns.
>
> gcc/ChangeLog:
>         PR tree-optimization/98772
>         * optabs-tree.c (supportable_convert_operation): Add case for
>         V8QI->V8HI.
>         * tree-vect-stmts.c (vect_create_vectorized_promotion_stmts): New
>         function to generate promotion stmts for V8QI->V8HI.
>         (vectorizable_conversion): Add case for V8QI->V8HI.
>
> gcc/testsuite/ChangeLog:
>         PR tree-optimization/98772
>         * gcc.target/aarch64/pr98772.c: New test.
>
> diff --git a/gcc/optabs-tree.c b/gcc/optabs-tree.c
> index c94073e3ed98f8c4cab65891f65dedebdb1ec274..b91ce3af6f0d4b3a62110bdb38f68ecc53765cad 100644
> --- a/gcc/optabs-tree.c
> +++ b/gcc/optabs-tree.c
> @@ -308,6 +308,40 @@ supportable_convert_operation (enum tree_code code,
>    if (!VECTOR_MODE_P (m1) || !VECTOR_MODE_P (m2))
>      return false;
>  
> +  /* The case where a widening operation is not making use of the full width of
> +     of the input vector, but using the full width of the output vector.
> +     Return the non-wided code, which will be used after the inputs are

non-widened

> +     converted to the wide type.  */
> +  if ((code == WIDEN_MINUS_EXPR
> +      || code == WIDEN_PLUS_EXPR
> +      || code == WIDEN_MULT_EXPR
> +      || code == WIDEN_LSHIFT_EXPR)

Minor formatting nit, but the ||s should be indented one space more.

> +      && known_eq (TYPE_VECTOR_SUBPARTS (vectype_in),
> +		  TYPE_VECTOR_SUBPARTS (vectype_out)))
> +  {
> +    switch (code)
> +    {
> +      case WIDEN_LSHIFT_EXPR:
> +	*code1 = LSHIFT_EXPR;
> +	return true;
> +	break;
> +      case WIDEN_MINUS_EXPR:
> +	*code1 = MINUS_EXPR;
> +	return true;
> +	break;
> +      case WIDEN_PLUS_EXPR:
> +	*code1 = PLUS_EXPR;
> +	return true;
> +	break;
> +      case WIDEN_MULT_EXPR:
> +	*code1 = MULT_EXPR;
> +	return true;
> +	break;
> +      default:
> +	gcc_unreachable ();
> +    }
> +  }

Rather than return true, I think we should do something like:

      if (!supportable_convert_operation (NOP_EXPR, vectype_out,
                                          vectype_in, &dummy_code))
        return false;

      optab = optab_for_tree_code (*code1, vectype_out, optab_default);
      return (optab_handler (optab, TYPE_MODE (vectype_out))
              != CODE_FOR_nothing);

to make sure that the target really does support this.

AFAICT the caller always knows when it wants the “if” statement above
to be used.  What it's doing is a bit different from what
supportable_convert_operation normally does, so it might be better
to put it into a separate function that tests whether the target
supports the non-widening form of a widening operation.

> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index f180ced312443ba1e698932d5e8362208690b3fc..b34b00f67ea67943dee7023ab9bfd19c1be5ccbe 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -4545,6 +4545,72 @@ vect_create_vectorized_promotion_stmts (vec_info *vinfo,
>    *vec_oprnds0 = vec_tmp;
>  }
>  
> +/* Create vectorized promotion stmts for widening stmts using only half the
> +   potential vector size for input.  */
> +static void
> +vect_create_vectorized_promotion_stmts (vec_info *vinfo,
> +					vec<tree> *vec_oprnds0,
> +					vec<tree> *vec_oprnds1,
> +					stmt_vec_info stmt_info, tree vec_dest,
> +					gimple_stmt_iterator *gsi,
> +					enum tree_code code1,
> +					int op_type)
> +{
> +  int i;
> +  tree vop0, vop1, new_tmp;
> +  gimple *new_stmt1;
> +  gimple *new_stmt2;
> +  gimple *new_stmt3;
> +  vec<tree> vec_tmp = vNULL;
> +
> +  vec_tmp.create (vec_oprnds0->length () * 2);

It looks like the * 2 isn't needed.

> +  FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
> +    {
> +      tree new_tmp1, new_tmp2, new_tmp3, out_type;
> +
> +      gcc_assert (op_type == binary_op);
> +      vop1 = (*vec_oprnds1)[i];
> +
> +      /* Widen the first vector input.  */
> +      out_type = TREE_TYPE (vec_dest);
> +      new_tmp1 = make_ssa_name (out_type);
> +      new_stmt1 = gimple_build_assign (new_tmp1, NOP_EXPR, vop0);
> +      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt1, gsi);
> +      if (VECTOR_TYPE_P (TREE_TYPE (vop1)))
> +	{
> +	  /* Widen the second vector input.  */
> +	  new_tmp2 = make_ssa_name (out_type);
> +	  new_stmt2 = gimple_build_assign (new_tmp2, NOP_EXPR, vop1);
> +	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt2, gsi);
> +	  /* Perform the operation.  With both vector inputs widened.  */
> +	  new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, new_tmp2);
> +	}
> +      else
> +	{
> +	  /* Perform the operation.  With the single vector input widened.  */
> +	  new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, vop1);
> +      }
> +
> +      new_tmp3 = make_ssa_name (vec_dest, new_stmt3);
> +      gimple_assign_set_lhs (new_stmt3, new_tmp3);
> +      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt3, gsi);
> +      if (is_gimple_call (new_stmt3))
> +	{
> +	  new_tmp = gimple_call_lhs (new_stmt3);
> +	}
> +      else
> +	{
> +	  new_tmp = gimple_assign_lhs (new_stmt3);
> +	}

The lhs is always new_tmp3, so it's not necessary to read it back.

> +
> +      /* Store the results for the next step.  */
> +      vec_tmp.quick_push (new_tmp);

FWIW, you could just assign to vec_oprnds[i] and not have vec_tmp,
but I don't know whether that's more or less confusing.  Either way's
fine with me.
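
A sketch of that alternative, using the patch's names:

      FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
        {
          ...
          /* Overwrite the operand slot directly instead of pushing to
             vec_tmp and swapping the vectors at the end.  */
          (*vec_oprnds0)[i] = new_tmp3;
        }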

> +    }
> +
> +  vec_oprnds0->release ();
> +  *vec_oprnds0 = vec_tmp;
> +}
> +
>  
>  /* Check if STMT_INFO performs a conversion operation that can be vectorized.
>     If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
> @@ -4697,7 +4763,13 @@ vectorizable_conversion (vec_info *vinfo,
>    nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
>    nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
>    if (known_eq (nunits_out, nunits_in))
> -    modifier = NONE;
> +    if (code == WIDEN_MINUS_EXPR
> +	|| code == WIDEN_PLUS_EXPR
> +	|| code == WIDEN_LSHIFT_EXPR
> +	|| code == WIDEN_MULT_EXPR)
> +	modifier = WIDEN;

Formatting nit: the last line should be indented by 6 spaces rather than 8.

> +    else
> +      modifier = NONE;


> @@ -4743,9 +4815,21 @@ vectorizable_conversion (vec_info *vinfo,
>        return false;
> 
>      case WIDEN:
> -      if (supportable_widening_operation (vinfo, code, stmt_info, vectype_out,
> -					  vectype_in, &code1, &code2,
> -					  &multi_step_cvt, &interm_types))
> +      if (known_eq (nunits_out, nunits_in)
> +	  && (code == WIDEN_MINUS_EXPR
> +	      || code == WIDEN_LSHIFT_EXPR
> +	      || code == WIDEN_PLUS_EXPR
> +	      || code == WIDEN_MULT_EXPR)
> +	  && supportable_convert_operation (code, vectype_out, vectype_in,
> +					    &code1))

Guess this is personal taste, sorry, since it's clearly right both ways,
but IMO it'd be better to drop the code test.  We can only get here
with nunits_out==nunits_in if we're converting a widening operation into
a non-widening operation.  If we do end up calling a separate function
(as per the comment above), then it would abort in a meaningful place
if something unexpected slips through.

> +	{
> +	  gcc_assert (!(multi_step_cvt && op_type == binary_op));
> +	  break;
> +	}
> +      else if (supportable_widening_operation (vinfo, code, stmt_info,
> +					       vectype_out, vectype_in, &code1,
> +					       &code2, &multi_step_cvt,
> +					       &interm_types))
>  	{
>  	  /* Binary widening operation can only be supported directly by the
>  	     architecture.  */
> @@ -4981,10 +5065,20 @@ vectorizable_conversion (vec_info *vinfo,
>  	      c1 = codecvt1;
>  	      c2 = codecvt2;
>  	    }
> -	  vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> -						  &vec_oprnds1, stmt_info,
> -						  this_dest, gsi,
> -						  c1, c2, op_type);
> +	  if ((code == WIDEN_MINUS_EXPR
> +	       || code == WIDEN_PLUS_EXPR
> +	       || code == WIDEN_LSHIFT_EXPR
> +	       || code == WIDEN_MULT_EXPR)
> +	      && known_eq (nunits_in, nunits_out))

Same comment here about dropping the code tests.

Thanks,
Richard

> +	    vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> +						    &vec_oprnds1, stmt_info,
> +						    this_dest, gsi,
> +						    c1, op_type);
> +	  else
> +	    vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> +						    &vec_oprnds1, stmt_info,
> +						    this_dest, gsi,
> +						    c1, c2, op_type);
>  	}
 
>        FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
Joel Hutton Feb. 10, 2021, 2:42 p.m. UTC | #2
Thanks for the quick review.

Updated patch attached. I've addressed your comments below.

Tests are still running; OK for trunk assuming they come out clean?

[aarch64][vect] Support V8QI->V8HI WIDEN_ patterns

In the case where 8 out of every 16 elements are widened using a
widening pattern and the next 8 are skipped, the patterns are not
recognized. This is because they are normally used in a pair, such as
VEC_WIDEN_MINUS_HI/LO, to achieve a v16qi->v16hi conversion for example.
This patch adds support for V8QI->V8HI patterns.

gcc/ChangeLog:

    PR tree-optimization/98772
    * optabs-tree.c (supportable_half_widening_operation): New function
    to check for supportable V8QI->V8HI widening patterns.
    * optabs-tree.h (supportable_half_widening_operation): New function.
    * tree-vect-stmts.c (vect_create_vectorized_promotion_stmts): New
    function to create promotion stmts for V8QI->V8HI widening patterns.
    (vectorizable_conversion): Add case for V8QI->V8HI.

gcc/testsuite/ChangeLog:

    PR tree-optimization/98772
    * gcc.target/aarch64/pr98772.c: New test.


>> +  /* The case where a widening operation is not making use of the full width of
>> +     of the input vector, but using the full width of the output vector.
>> +     Return the non-wided code, which will be used after the inputs are
>
>non-widened
Done.

>> +     converted to the wide type.  */
>> +  if ((code == WIDEN_MINUS_EXPR
>> +      || code == WIDEN_PLUS_EXPR
>> +      || code == WIDEN_MULT_EXPR
>> +      || code == WIDEN_LSHIFT_EXPR)
>
>Minor formatting nit, but the ||s should be indented one space more.
Done.

>> +      && known_eq (TYPE_VECTOR_SUBPARTS (vectype_in),
>> +               TYPE_VECTOR_SUBPARTS (vectype_out)))
>> +  {
>> +    switch (code)
>> +    {
>> +      case WIDEN_LSHIFT_EXPR:
>> +     *code1 = LSHIFT_EXPR;
>> +     return true;
>> +     break;
>> +      case WIDEN_MINUS_EXPR:
>> +     *code1 = MINUS_EXPR;
>> +     return true;
>> +     break;
>> +      case WIDEN_PLUS_EXPR:
>> +     *code1 = PLUS_EXPR;
>> +     return true;
>> +     break;
>> +      case WIDEN_MULT_EXPR:
>> +     *code1 = MULT_EXPR;
>> +     return true;
>> +     break;
>> +      default:
>> +     gcc_unreachable ();
>> +    }
>> +  }
>
>Rather than return true, I think we should do something like:
>
>      if (!supportable_convert_operation (NOP_EXPR, vectype_out,
>                                          vectype_in, &dummy_code))
>        return false;
>
>      optab = optab_for_tree_code (*code1, vectype_out, optab_default);
>      return (optab_handler (optab, TYPE_MODE (vectype_out))
>              != CODE_FOR_nothing);
>
>to make sure that the target really does support this.
Done. I used 'optab_vector' rather than 'optab_default', as that seemed correct for this case; with 'optab_default', 'optab_for_tree_code' fails an assertion when 'LSHIFT_EXPR' is used.
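
For context, the shift handling in optab_for_tree_code (paraphrased
from optabs-tree.c) is why 'optab_default' trips an assert here:

      case LSHIFT_EXPR:
	if (TREE_CODE (type) == VECTOR_TYPE)
	  {
	    if (subtype == optab_vector)
	      return TYPE_SATURATING (type) ? unknown_optab : vashl_optab;
	    gcc_assert (subtype == optab_scalar);
	  }
      ...

i.e. for a vector shift the caller has to say whether the shift amount
is itself a vector or a scalar.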

> AFAICT the caller always knows when it wants the “if” statement above
> to be used.  What it's doing is a bit different from what
> supportable_convert_operation normally does, so it might be better
> to put it into a separate function that tests whether the target
> supports the non-widening form of a widening operation.
Done.

>> +
>> +  vec_tmp.create (vec_oprnds0->length () * 2);
>
>It looks like the * 2 isn't needed.
Done.

>> +      if (is_gimple_call (new_stmt3))
>> +     {
>> +       new_tmp = gimple_call_lhs (new_stmt3);
>> +     }
>> +      else
>> +     {
>> +       new_tmp = gimple_assign_lhs (new_stmt3);
>> +     }
>
>The lhs is always new_tmp3, so it's not necessary to read it back.
Done.

>> +
>> +      /* Store the results for the next step.  */
>> +      vec_tmp.quick_push (new_tmp);
>
>FWIW, you could just assign to vec_oprnds[i] and not have vec_tmp,
>but I don't know whether that's more or less confusing.  Either way's
>fine with me.
I chose to keep vec_tmp, but I don't feel strongly about it.

>> +    }
>> +
>> +  vec_oprnds0->release ();
>> +  *vec_oprnds0 = vec_tmp;
>> +}
>> +
>>  
>>  /* Check if STMT_INFO performs a conversion operation that can be vectorized.
>>     If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
>> @@ -4697,7 +4763,13 @@ vectorizable_conversion (vec_info *vinfo,
>>    nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
>>    nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
>>    if (known_eq (nunits_out, nunits_in))
>> -    modifier = NONE;
>> +    if (code == WIDEN_MINUS_EXPR
>> +     || code == WIDEN_PLUS_EXPR
>> +     || code == WIDEN_LSHIFT_EXPR
>> +     || code == WIDEN_MULT_EXPR)
>> +     modifier = WIDEN;
>
>Formatting nit: the last line should be indented by 6 spaces rather than 8.
Done.

>> @@ -4743,9 +4815,21 @@ vectorizable_conversion (vec_info *vinfo,
>>        return false;
>> 
>>      case WIDEN:
>> -      if (supportable_widening_operation (vinfo, code, stmt_info, vectype_out,
>> -                                       vectype_in, &code1, &code2,
>> -                                       &multi_step_cvt, &interm_types))
>> +      if (known_eq (nunits_out, nunits_in)
>> +       && (code == WIDEN_MINUS_EXPR
>> +           || code == WIDEN_LSHIFT_EXPR
>> +           || code == WIDEN_PLUS_EXPR
>> +           || code == WIDEN_MULT_EXPR)
>> +       && supportable_convert_operation (code, vectype_out, vectype_in,
>> +                                         &code1))
>
>Guess this is personal taste, sorry, since it's clearly right both ways,
>but IMO it'd be better to drop the code test.  We can only get here
>with nunits_out==nunits_in if we're converting a widening operation into
>a non-widening operation.  If we do end up calling a separate function
>(as per the comment above), then it would abort in a meaningful place
>if something unexpected slips through.
Done.

>> +     {
>> +       gcc_assert (!(multi_step_cvt && op_type == binary_op));
>> +       break;
>> +     }
>> +      else if (supportable_widening_operation (vinfo, code, stmt_info,
>> +                                            vectype_out, vectype_in, &code1,
>> +                                            &code2, &multi_step_cvt,
>> +                                            &interm_types))
>>        {
>>          /* Binary widening operation can only be supported directly by the
>>             architecture.  */
>> @@ -4981,10 +5065,20 @@ vectorizable_conversion (vec_info *vinfo,
>>              c1 = codecvt1;
>>              c2 = codecvt2;
>>            }
>> -       vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
>> -                                               &vec_oprnds1, stmt_info,
>> -                                               this_dest, gsi,
>> -                                               c1, c2, op_type);
>> +       if ((code == WIDEN_MINUS_EXPR
>> +            || code == WIDEN_PLUS_EXPR
>> +            || code == WIDEN_LSHIFT_EXPR
>> +            || code == WIDEN_MULT_EXPR)
>> +           && known_eq (nunits_in, nunits_out))
>
>Same comment here about dropping the code tests.
Done.
Richard Sandiford Feb. 10, 2021, 7:46 p.m. UTC | #3
Joel Hutton <Joel.Hutton@arm.com> writes:
> @@ -277,6 +277,81 @@ optab_for_tree_code (enum tree_code code, const_tree type,
>      }
>  }
>  
> +/* Function supportable_half_widening_operation
> +

I realise existing (related) functions do have a “Function foo” line,
but it's not generally the GCC style, so I think it would be better
to leave it out.

> +   Check whether an operation represented by code is a 'half' widening operation

Nit: CODE should be in caps since it's a parameter name.

> +   using only half of the full vector width for the inputs. This allows the

I think this could be misread as saying that we only look at half the
input vector.  Maybe it would be clearer to say something like “in which
the input vectors contain half as many bits as the output vectors”
instead of “using…”.

> +   output to be stored in a single vector, as opposed to using the full width
> +   for input, where half the elements must be processed at a time into
> +   respective full-width vector outputs i.e. a 'hi'/'lo' pair using codes such
> +   as VEC_WIDEN_MINUS_HI/LO. This is handled by widening the inputs and using
> +   non-widening instructions. RTL fusing converts these to the widening hardware
> +   instructions if supported.

And here the contrast is with cases in which the input vectors have
the same number of bits as the output vectors, meaning that we need
a lo/hi pair to generate twice the amount of output data.
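
Schematically (GIMPLE-style pseudocode; types chosen for illustration
only):

      /* Inputs as wide as the outputs: a hi/lo pair is needed to
	 produce two v8hi results from v16qi inputs.  */
      lo = VEC_WIDEN_MINUS_LO_EXPR <a, b>;
      hi = VEC_WIDEN_MINUS_HI_EXPR <a, b>;

      /* Inputs half as wide as the outputs: widen each v8qi input
	 to v8hi, then apply the ordinary operation for a single
	 v8hi result.  */
      aw = (v8hi) a;
      bw = (v8hi) b;
      res = aw - bw;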

Might be clearer to make “This is handled…” a new paragraph.

> +
> +   Supported widening operations:
> +    WIDEN_MINUS_EXPR
> +    WIDEN_PLUS_EXPR
> +    WIDEN_MULT_EXPR
> +    WIDEN_LSHIFT_EXPR
> +
> +   Output:
> +   - CODE1 - The non-widened code, which will be used after the inputs are
> +     converted to the wide type.
> +   */

Formatting nit, */ should be on the previous line, i.e. “.” + two spaces
+ “*/”.  (Sorry for all the formatting comments.)

> +bool
> +supportable_half_widening_operation (enum tree_code code,
> +			       tree vectype_out, tree vectype_in,
> +			       enum tree_code *code1)
> +{
> +  machine_mode m1,m2;
> +  bool truncp;
> +  enum tree_code dummy_code;
> +  optab op;
> +
> +  gcc_assert (VECTOR_TYPE_P (vectype_out) && VECTOR_TYPE_P (vectype_in));
> +
> +  m1 = TYPE_MODE (vectype_out);
> +  m2 = TYPE_MODE (vectype_in);
> +
> +  if (!VECTOR_MODE_P (m1) || !VECTOR_MODE_P (m2))
> +    return false;
> +
> +  if(!known_eq (TYPE_VECTOR_SUBPARTS (vectype_in),
> +		  TYPE_VECTOR_SUBPARTS (vectype_out)))
> +      return false;

Formatting nit: should be a space between “if” and “(”.  IMO
maybe_ne is more readable than !known_eq.  The return statement
should only be indented by 4 columns.

> +  if(!(code == WIDEN_MINUS_EXPR
> +       || code == WIDEN_PLUS_EXPR
> +       || code == WIDEN_MULT_EXPR
> +       || code == WIDEN_LSHIFT_EXPR))
> +    return false;
> +
> +  switch (code)
> +  {
> +    case WIDEN_LSHIFT_EXPR:
> +      *code1 = LSHIFT_EXPR;
> +      break;
> +    case WIDEN_MINUS_EXPR:
> +      *code1 = MINUS_EXPR;
> +      break;
> +    case WIDEN_PLUS_EXPR:
> +      *code1 = PLUS_EXPR;
> +      break;
> +    case WIDEN_MULT_EXPR:
> +      *code1 = MULT_EXPR;
> +      break;
> +    default:
> +      gcc_unreachable ();

I think it would be better to return false in the default case and remove
the “if” above, so that we only have one copy of the list of codes.

> +  }

Formatting nit: the { and } lines should be indented two spaces more.
(The contents of the switch are OK as-is.)

> +
> +  if (!supportable_convert_operation(NOP_EXPR, vectype_out, vectype_in,
> +				     &dummy_code))

Nit: should be a space between the function name and ”(”.

> +    return false;
> +
> +  op = optab_for_tree_code (*code1, vectype_out, optab_vector);
> +  return (optab_handler (op, TYPE_MODE (vectype_out)) != CODE_FOR_nothing);
> +}
> +
>  /* Function supportable_convert_operation
>  
>     Check whether an operation represented by the code CODE is a

> […]
> @@ -4697,7 +4755,13 @@ vectorizable_conversion (vec_info *vinfo,
>    nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
>    nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
>    if (known_eq (nunits_out, nunits_in))
> -    modifier = NONE;
> +    if (code == WIDEN_MINUS_EXPR
> +	|| code == WIDEN_PLUS_EXPR
> +	|| code == WIDEN_LSHIFT_EXPR
> +	|| code == WIDEN_MULT_EXPR)
> +	modifier = WIDEN;

The “modifier = WIDEN;” line is indented by a tab (i.e. 8 columns),
but it should only be indented by 6 spaces.

> +    else
> +      modifier = NONE;
>    else if (multiple_p (nunits_out, nunits_in))
>      modifier = NARROW;
>    else
> @@ -4743,9 +4807,16 @@ vectorizable_conversion (vec_info *vinfo,
>        return false;
>  
>      case WIDEN:
> -      if (supportable_widening_operation (vinfo, code, stmt_info, vectype_out,
> -					  vectype_in, &code1, &code2,
> -					  &multi_step_cvt, &interm_types))
> +      if (supportable_half_widening_operation (code, vectype_out, vectype_in,
> +					    &code1))
> +	{
> +	  gcc_assert (!(multi_step_cvt && op_type == binary_op));
> +	  break;
> +	}

I think here we should keep the known_eq test from earlier and not
go into the code below for that case.  I.e. something like:

       if (known_eq (…))
         {
           if (!supportable_half_widening_operation (code, vectype_out,
                                                     vectype_in, &code1))
             goto unsupported;

           gcc_assert (!(multi_step_cvt && op_type == binary_op));
           break;
         }

This is because the fallback code later in the case statement doesn't
really apply here.  It also makes it easier to correlate this test
with the transform code below.

> +      else if (supportable_widening_operation (vinfo, code, stmt_info,
> +					       vectype_out, vectype_in, &code1,
> +					       &code2, &multi_step_cvt,
> +					       &interm_types))

After the change above, this would stay as a plain “if”.

>  	{
>  	  /* Binary widening operation can only be supported directly by the
>  	     architecture.  */
> @@ -4981,10 +5052,16 @@ vectorizable_conversion (vec_info *vinfo,
>  	      c1 = codecvt1;
>  	      c2 = codecvt2;
>  	    }
> -	  vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> -						  &vec_oprnds1, stmt_info,
> -						  this_dest, gsi,
> -						  c1, c2, op_type);
> +	  if (known_eq(nunits_out, nunits_in))

Should be a space between “known_eq” and “(”.

> +	    vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> +						    &vec_oprnds1, stmt_info,
> +						    this_dest, gsi,
> +						    c1, op_type);

Maybe it would be clearer to call this vect_create_half_widening_stmts,
to match the new query function.

Thanks,
Richard

> +	  else
> +	    vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> +						    &vec_oprnds1, stmt_info,
> +						    this_dest, gsi,
> +						    c1, c2, op_type);
>  	}
>  
>        FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
Joel Hutton Feb. 11, 2021, 12:33 p.m. UTC | #4
Hi Richard,

I've revised the patch; sorry about the sloppy formatting in the previous one.

Full bootstrap/regression tests are still running, but the changes are pretty trivial.

Ok for trunk assuming tests finish clean?

>Joel Hutton <Joel.Hutton@arm.com> writes:
>> @@ -277,6 +277,81 @@ optab_for_tree_code (enum tree_code code, const_tree type,
>>      }
>>  }
>>
>> +/* Function supportable_half_widening_operation
>> +
>
>I realise existing (related) functions do have a “Function foo” line,
>but it's not generally the GCC style, so I think it would be better
>to leave it out.
Done.

>> +   Check whether an operation represented by code is a 'half' widening operation
>
>Nit: CODE should be in caps since it's a parameter name.
Done.

>> +   using only half of the full vector width for the inputs. This allows the
>
>I think this could be misread as saying that we only look at half the
>input vector.  Maybe it would be clearer to say something like “in which
>the input vectors contain half as many bits as the output vectors”
>instead of “using…”.
Done.

>> +   output to be stored in a single vector, as opposed to using the full width
>> +   for input, where half the elements must be processed at a time into
>> +   respective full-width vector outputs i.e. a 'hi'/'lo' pair using codes such
>> +   as VEC_WIDEN_MINUS_HI/LO. This is handled by widening the inputs and using
>> +   non-widening instructions. RTL fusing converts these to the widening hardware
>> +   instructions if supported.
>
>And here the contrast is with cases in which the input vectors have
>the same number of bits as the output vectors, meaning that we need
>a lo/hi pair to generate twice the amount of output data.
Done.

> Might be clearer to make “This is handled…” a new paragraph.
Done.

>> +
>> +   Supported widening operations:
>> +    WIDEN_MINUS_EXPR
>> +    WIDEN_PLUS_EXPR
>> +    WIDEN_MULT_EXPR
>> +    WIDEN_LSHIFT_EXPR
>> +
>> +   Output:
>> +   - CODE1 - The non-widened code, which will be used after the inputs are
>> +     converted to the wide type.
>> +   */
>
>Formatting nit, */ should be on the previous line, i.e. “.” + two spaces
>+ “*/”.  (Sorry for all the formatting comments.)
Done (sorry for the formatting errors).

>> +bool
>> +supportable_half_widening_operation (enum tree_code code,
>> +                            tree vectype_out, tree vectype_in,
>> +                            enum tree_code *code1)
>> +{
>> +  machine_mode m1,m2;
>> +  bool truncp;
>> +  enum tree_code dummy_code;
>> +  optab op;
>> +
>> +  gcc_assert (VECTOR_TYPE_P (vectype_out) && VECTOR_TYPE_P (vectype_in));
>> +
>> +  m1 = TYPE_MODE (vectype_out);
>> +  m2 = TYPE_MODE (vectype_in);
>> +
>> +  if (!VECTOR_MODE_P (m1) || !VECTOR_MODE_P (m2))
>> +    return false;
>> +
>> +  if(!known_eq (TYPE_VECTOR_SUBPARTS (vectype_in),
>> +               TYPE_VECTOR_SUBPARTS (vectype_out)))
>> +      return false;
>
>Formatting nit: should be a space between “if” and “(”.  IMO
>maybe_ne is more readable than !known_eq.  The return statement
>should only be indented by 4 columns.
Done.

>> +  if(!(code == WIDEN_MINUS_EXPR
>> +       || code == WIDEN_PLUS_EXPR
>> +       || code == WIDEN_MULT_EXPR
>> +       || code == WIDEN_LSHIFT_EXPR))
>> +    return false;
>> +
>> +  switch (code)
>> +  {
>> +    case WIDEN_LSHIFT_EXPR:
>> +      *code1 = LSHIFT_EXPR;
>> +      break;
>> +    case WIDEN_MINUS_EXPR:
>> +      *code1 = MINUS_EXPR;
>> +      break;
>> +    case WIDEN_PLUS_EXPR:
>> +      *code1 = PLUS_EXPR;
>> +      break;
>> +    case WIDEN_MULT_EXPR:
>> +      *code1 = MULT_EXPR;
>> +      break;
>> +    default:
>> +      gcc_unreachable ();
>
>I think it would be better to return false in the default case and remove
>the “if” above, so that we only have one copy of the list of codes.
Done.

>> +  }
>
>Formatting nit: the { and } lines should be indented two spaces more.
>(The contents of the switch are OK as-is.)
Done.

>> +
>> +  if (!supportable_convert_operation(NOP_EXPR, vectype_out, vectype_in,
>> +                                  &dummy_code))
>
>Nit: should be a space between the function name and ”(”.
Done.

>> +    return false;
>> +
>> +  op = optab_for_tree_code (*code1, vectype_out, optab_vector);
>> +  return (optab_handler (op, TYPE_MODE (vectype_out)) != CODE_FOR_nothing);
>> +}
>> +
>>  /* Function supportable_convert_operation
>>
>>     Check whether an operation represented by the code CODE is a
>
>> […]
>> @@ -4697,7 +4755,13 @@ vectorizable_conversion (vec_info *vinfo,
>>    nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
>>    nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
>>    if (known_eq (nunits_out, nunits_in))
>> -    modifier = NONE;
>> +    if (code == WIDEN_MINUS_EXPR
>> +     || code == WIDEN_PLUS_EXPR
>> +     || code == WIDEN_LSHIFT_EXPR
>> +     || code == WIDEN_MULT_EXPR)
>> +     modifier = WIDEN;
>
>The “modifier = WIDEN;” line is indented by a tab (i.e. 8 columns),
>but it should only be indented by 6 spaces.
Done.

>> +    else
>> +      modifier = NONE;
>>    else if (multiple_p (nunits_out, nunits_in))
>>      modifier = NARROW;
>>    else
>> @@ -4743,9 +4807,16 @@ vectorizable_conversion (vec_info *vinfo,
>>        return false;
>>
>>      case WIDEN:
>> -      if (supportable_widening_operation (vinfo, code, stmt_info, vectype_out,
>> -                                       vectype_in, &code1, &code2,
>> -                                       &multi_step_cvt, &interm_types))
>> +      if (supportable_half_widening_operation (code, vectype_out, vectype_in,
>> +                                         &code1))
>> +     {
>> +       gcc_assert (!(multi_step_cvt && op_type == binary_op));
>> +       break;
>> +     }
>
>I think here we should keep the known_eq test from earlier and not
>go into the code below for that case.  I.e. something like:
>
>       if (known_eq (…))
>         {
>           if (!supportable_half_widening_operation (code, vectype_out,
>                                                     vectype_in, &code1))
>             goto unsupported;
>
>           gcc_assert (!(multi_step_cvt && op_type == binary_op));
>           break;
>         }
>
>This is because the fallback code later in the case statement doesn't
>really apply here.  It also makes it easier to correlate this test
>with the transform code below.
Done.

>> +      else if (supportable_widening_operation (vinfo, code, stmt_info,
>> +                                            vectype_out, vectype_in, &code1,
>> +                                            &code2, &multi_step_cvt,
>> +                                            &interm_types))
>
>After the change above, this would stay as a plain “if”.
Done.

>>        {
>>          /* Binary widening operation can only be supported directly by the
>>             architecture.  */
>> @@ -4981,10 +5052,16 @@ vectorizable_conversion (vec_info *vinfo,
>>              c1 = codecvt1;
>>              c2 = codecvt2;
>>            }
>> -       vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
>> -                                               &vec_oprnds1, stmt_info,
>> -                                               this_dest, gsi,
>> -                                               c1, c2, op_type);
>> +       if (known_eq(nunits_out, nunits_in))
>
>Should be a space between “known_eq” and “(”.
>
>> +         vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
>> +                                                 &vec_oprnds1, stmt_info,
>> +                                                 this_dest, gsi,
>> +                                                 c1, op_type);
>
>Maybe it would be clearer to call this vect_create_half_widening_stmts,
>to match the new query function.
Done.
Richard Sandiford Feb. 11, 2021, 12:54 p.m. UTC | #5
One more formatting nit, sorry:

Joel Hutton <Joel.Hutton@arm.com> writes:
> +bool
> +supportable_half_widening_operation (enum tree_code code,
> +			       tree vectype_out, tree vectype_in,
> +			       enum tree_code *code1)

The arguments need reindenting for the new function name, so that
they line up under “enum”.  (Obviously doesn't need a full retest.)

OK with that change, thanks.

Richard

> +{
> +  machine_mode m1,m2;
> +  enum tree_code dummy_code;
> +  optab op;
> +
> +  gcc_assert (VECTOR_TYPE_P (vectype_out) && VECTOR_TYPE_P (vectype_in));
> +
> +  m1 = TYPE_MODE (vectype_out);
> +  m2 = TYPE_MODE (vectype_in);
> +
> +  if (!VECTOR_MODE_P (m1) || !VECTOR_MODE_P (m2))
> +    return false;
> +
> +  if (maybe_ne (TYPE_VECTOR_SUBPARTS (vectype_in),
> +		  TYPE_VECTOR_SUBPARTS (vectype_out)))
> +    return false;
> +
> +  switch (code)
> +    {
> +    case WIDEN_LSHIFT_EXPR:
> +      *code1 = LSHIFT_EXPR;
> +      break;
> +    case WIDEN_MINUS_EXPR:
> +      *code1 = MINUS_EXPR;
> +      break;
> +    case WIDEN_PLUS_EXPR:
> +      *code1 = PLUS_EXPR;
> +      break;
> +    case WIDEN_MULT_EXPR:
> +      *code1 = MULT_EXPR;
> +      break;
> +    default:
> +      return false;
> +    }
> +
> +  if (!supportable_convert_operation (NOP_EXPR, vectype_out, vectype_in,
> +				     &dummy_code))
> +    return false;
> +
> +  op = optab_for_tree_code (*code1, vectype_out, optab_vector);
> +  return (optab_handler (op, TYPE_MODE (vectype_out)) != CODE_FOR_nothing);
> +}
> +
>  /* Function supportable_convert_operation
>  
>     Check whether an operation represented by the code CODE is a
> diff --git a/gcc/optabs-tree.h b/gcc/optabs-tree.h
> index 173da0d3bd2..c3aaa1a4169 100644
> --- a/gcc/optabs-tree.h
> +++ b/gcc/optabs-tree.h
> @@ -36,6 +36,9 @@ enum optab_subtype
>     the second argument.  The third argument distinguishes between the types of
>     vector shifts and rotates.  */
>  optab optab_for_tree_code (enum tree_code, const_tree, enum optab_subtype);
> +bool
> +supportable_half_widening_operation (enum tree_code, tree, tree,
> +				    enum tree_code *);
>  bool supportable_convert_operation (enum tree_code, tree, tree,
>  				    enum tree_code *);
>  bool expand_vec_cmp_expr_p (tree, tree, enum tree_code);
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr98772.c b/gcc/testsuite/gcc.target/aarch64/pr98772.c
> new file mode 100644
> index 00000000000..663221514e9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr98772.c
> @@ -0,0 +1,155 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -save-temps" } */
> +#include <stdint.h>
> +#include <string.h>
> +
> +#define DSIZE 16
> +#define PIXSIZE 64
> +
> +extern void
> +wplus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +	for (int x = 0; x < 4; x++ )
> +	    d[x + y*4] = pix1[x] + pix2[x];
> +	pix1 += 16;
> +	pix2 += 16;
> +    }
> +}
> +extern void __attribute__((optimize (0)))
> +wplus_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +	for (int x = 0; x < 4; x++ )
> +	    d[x + y*4] = pix1[x] + pix2[x];
> +	pix1 += 16;
> +	pix2 += 16;
> +    }
> +}
> +
> +extern void
> +wminus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +	for (int x = 0; x < 4; x++ )
> +	    d[x + y*4] = pix1[x] - pix2[x];
> +	pix1 += 16;
> +	pix2 += 16;
> +    }
> +}
> +extern void __attribute__((optimize (0)))
> +wminus_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +	for (int x = 0; x < 4; x++ )
> +	    d[x + y*4] = pix1[x] - pix2[x];
> +	pix1 += 16;
> +	pix2 += 16;
> +    }
> +}
> +
> +extern void
> +wmult (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +	for (int x = 0; x < 4; x++ )
> +	    d[x + y*4] = pix1[x] * pix2[x];
> +	pix1 += 16;
> +	pix2 += 16;
> +    }
> +}
> +extern void __attribute__((optimize (0)))
> +wmult_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +	for (int x = 0; x < 4; x++ )
> +	    d[x + y*4] = pix1[x] * pix2[x];
> +	pix1 += 16;
> +	pix2 += 16;
> +    }
> +}
> +
> +extern void
> +wlshift (uint16_t *d, uint8_t *restrict pix1)
> +
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +	for (int x = 0; x < 4; x++ )
> +	    d[x + y*4] = pix1[x] << 8;
> +	pix1 += 16;
> +    }
> +}
> +extern void __attribute__((optimize (0)))
> +wlshift_no_opt (uint16_t *d, uint8_t *restrict pix1)
> +
> +{
> +    for (int y = 0; y < 4; y++ )
> +    {
> +	for (int x = 0; x < 4; x++ )
> +	    d[x + y*4] = pix1[x] << 8;
> +	pix1 += 16;
> +    }
> +}
> +
> +void __attribute__((optimize (0)))
> +init_arrays (uint16_t *d_a, uint16_t *d_b, uint8_t *pix1, uint8_t *pix2)
> +{
> +  for (int i = 0; i < DSIZE; i++)
> +  {
> +    d_a[i] = (1074 * i)%17;
> +    d_b[i] = (1074 * i)%17;
> +  }
> +  for (int i = 0; i < PIXSIZE; i++)
> +  {
> +    pix1[i] = (1024 * i)%17;
> +    pix2[i] = (1024 * i)%17;
> +  }
> +}
> +
> +/* Don't optimize main so we don't get confused over where the vector
> +   instructions are generated. */
> +__attribute__((optimize (0)))
> +int main ()
> +{
> +  uint16_t d_a[DSIZE];
> +  uint16_t d_b[DSIZE];
> +  uint8_t pix1[PIXSIZE];
> +  uint8_t pix2[PIXSIZE];
> +
> +  init_arrays (d_a, d_b, pix1, pix2);
> +  wplus (d_a, pix1, pix2);
> +  wplus_no_opt (d_b, pix1, pix2);
> +  if (memcmp (d_a,d_b, DSIZE) != 0)
> +    return 1;
> +
> +  init_arrays (d_a, d_b, pix1, pix2);
> +  wminus (d_a, pix1, pix2);
> +  wminus_no_opt (d_b, pix1, pix2);
> +  if (memcmp (d_a,d_b, DSIZE) != 0)
> +    return 2;
> +
> +  init_arrays (d_a, d_b, pix1, pix2);
> +  wmult (d_a, pix1, pix2);
> +  wmult_no_opt (d_b, pix1, pix2);
> +  if (memcmp (d_a,d_b, DSIZE) != 0)
> +    return 3;
> +
> +  init_arrays (d_a, d_b, pix1, pix2);
> +  wlshift (d_a, pix1);
> +  wlshift_no_opt (d_b, pix1);
> +  if (memcmp (d_a,d_b, DSIZE) != 0)
> +    return 4;
> +
> +}
> +
> +/* { dg-final { scan-assembler-times "uaddl\\tv" 2 } } */
> +/* { dg-final { scan-assembler-times "usubl\\tv" 2 } } */
> +/* { dg-final { scan-assembler-times "umull\\tv" 2 } } */
> +/* { dg-final { scan-assembler-times "shl\\tv" 2 } } */
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index f180ced3124..bf23499f7e3 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -4545,6 +4545,64 @@ vect_create_vectorized_promotion_stmts (vec_info *vinfo,
>    *vec_oprnds0 = vec_tmp;
>  }
>  
> +/* Create vectorized promotion stmts for widening stmts using only half the
> +   potential vector size for input.  */
> +static void
> +vect_create_half_widening_stmts (vec_info *vinfo,
> +					vec<tree> *vec_oprnds0,
> +					vec<tree> *vec_oprnds1,
> +					stmt_vec_info stmt_info, tree vec_dest,
> +					gimple_stmt_iterator *gsi,
> +					enum tree_code code1,
> +					int op_type)
> +{
> +  int i;
> +  tree vop0, vop1;
> +  gimple *new_stmt1;
> +  gimple *new_stmt2;
> +  gimple *new_stmt3;
> +  vec<tree> vec_tmp = vNULL;
> +
> +  vec_tmp.create (vec_oprnds0->length ());
> +  FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
> +    {
> +      tree new_tmp1, new_tmp2, new_tmp3, out_type;
> +
> +      gcc_assert (op_type == binary_op);
> +      vop1 = (*vec_oprnds1)[i];
> +
> +      /* Widen the first vector input.  */
> +      out_type = TREE_TYPE (vec_dest);
> +      new_tmp1 = make_ssa_name (out_type);
> +      new_stmt1 = gimple_build_assign (new_tmp1, NOP_EXPR, vop0);
> +      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt1, gsi);
> +      if (VECTOR_TYPE_P (TREE_TYPE (vop1)))
> +	{
> +	  /* Widen the second vector input.  */
> +	  new_tmp2 = make_ssa_name (out_type);
> +	  new_stmt2 = gimple_build_assign (new_tmp2, NOP_EXPR, vop1);
> +	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt2, gsi);
> +	  /* Perform the operation.  With both vector inputs widened.  */
> +	  new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, new_tmp2);
> +	}
> +      else
> +	{
> +	  /* Perform the operation.  With the single vector input widened.  */
> +	  new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, vop1);
> +      }
> +
> +      new_tmp3 = make_ssa_name (vec_dest, new_stmt3);
> +      gimple_assign_set_lhs (new_stmt3, new_tmp3);
> +      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt3, gsi);
> +
> +      /* Store the results for the next step.  */
> +      vec_tmp.quick_push (new_tmp3);
> +    }
> +
> +  vec_oprnds0->release ();
> +  *vec_oprnds0 = vec_tmp;
> +}
> +
>  
>  /* Check if STMT_INFO performs a conversion operation that can be vectorized.
>     If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
> @@ -4697,7 +4755,13 @@ vectorizable_conversion (vec_info *vinfo,
>    nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
>    nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
>    if (known_eq (nunits_out, nunits_in))
> -    modifier = NONE;
> +    if (code == WIDEN_MINUS_EXPR
> +	|| code == WIDEN_PLUS_EXPR
> +	|| code == WIDEN_LSHIFT_EXPR
> +	|| code == WIDEN_MULT_EXPR)
> +      modifier = WIDEN;
> +    else
> +      modifier = NONE;
>    else if (multiple_p (nunits_out, nunits_in))
>      modifier = NARROW;
>    else
> @@ -4743,9 +4807,18 @@ vectorizable_conversion (vec_info *vinfo,
>        return false;
>  
>      case WIDEN:
> -      if (supportable_widening_operation (vinfo, code, stmt_info, vectype_out,
> -					  vectype_in, &code1, &code2,
> -					  &multi_step_cvt, &interm_types))
> +      if (known_eq (nunits_in, nunits_out))
> +	{
> +	  if (!supportable_half_widening_operation (code, vectype_out,
> +						   vectype_in, &code1))
> +	    goto unsupported;
> +	  gcc_assert (!(multi_step_cvt && op_type == binary_op));
> +	  break;
> +	}
> +      if (supportable_widening_operation (vinfo, code, stmt_info,
> +					       vectype_out, vectype_in, &code1,
> +					       &code2, &multi_step_cvt,
> +					       &interm_types))
>  	{
>  	  /* Binary widening operation can only be supported directly by the
>  	     architecture.  */
> @@ -4981,10 +5054,16 @@ vectorizable_conversion (vec_info *vinfo,
>  	      c1 = codecvt1;
>  	      c2 = codecvt2;
>  	    }
> -	  vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> -						  &vec_oprnds1, stmt_info,
> -						  this_dest, gsi,
> -						  c1, c2, op_type);
> +	  if (known_eq (nunits_out, nunits_in))
> +	    vect_create_half_widening_stmts (vinfo, &vec_oprnds0,
> +						    &vec_oprnds1, stmt_info,
> +						    this_dest, gsi,
> +						    c1, op_type);
> +	  else
> +	    vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> +						    &vec_oprnds1, stmt_info,
> +						    this_dest, gsi,
> +						    c1, c2, op_type);
>  	}
>  
>        FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)

Patch

diff --git a/gcc/optabs-tree.c b/gcc/optabs-tree.c
index c94073e3ed98f8c4cab65891f65dedebdb1ec274..b91ce3af6f0d4b3a62110bdb38f68ecc53765cad 100644
--- a/gcc/optabs-tree.c
+++ b/gcc/optabs-tree.c
@@ -308,6 +308,40 @@  supportable_convert_operation (enum tree_code code,
   if (!VECTOR_MODE_P (m1) || !VECTOR_MODE_P (m2))
     return false;
 
+  /* The case where a widening operation is not making use of the full width of
+     of the input vector, but using the full width of the output vector.
+     Return the non-wided code, which will be used after the inputs are
+     converted to the wide type.  */
+  if ((code == WIDEN_MINUS_EXPR
+      || code == WIDEN_PLUS_EXPR
+      || code == WIDEN_MULT_EXPR
+      || code == WIDEN_LSHIFT_EXPR)
+      && known_eq (TYPE_VECTOR_SUBPARTS (vectype_in),
+		  TYPE_VECTOR_SUBPARTS (vectype_out)))
+  {
+    switch (code)
+    {
+      case WIDEN_LSHIFT_EXPR:
+	*code1 = LSHIFT_EXPR;
+	return true;
+	break;
+      case WIDEN_MINUS_EXPR:
+	*code1 = MINUS_EXPR;
+	return true;
+	break;
+      case WIDEN_PLUS_EXPR:
+	*code1 = PLUS_EXPR;
+	return true;
+	break;
+      case WIDEN_MULT_EXPR:
+	*code1 = MULT_EXPR;
+	return true;
+	break;
+      default:
+	gcc_unreachable ();
+    }
+  }
+
   /* First check if we can done conversion directly.  */
   if ((code == FIX_TRUNC_EXPR
        && can_fix_p (m1,m2,TYPE_UNSIGNED (vectype_out), &truncp)
diff --git a/gcc/testsuite/gcc.target/aarch64/pr98772.c b/gcc/testsuite/gcc.target/aarch64/pr98772.c
new file mode 100644
index 0000000000000000000000000000000000000000..35568a9f9d60c44aa01a6afc5f7e6a0935009aaf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr98772.c
@@ -0,0 +1,155 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps" } */
+#include <stdint.h>
+#include <string.h>
+
+#define DSIZE 16
+#define PIXSIZE 64
+
+extern void
+wplus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] + pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+extern void __attribute__((optimize (0)))
+wplus_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] + pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+
+extern void
+wminus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] - pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+extern void __attribute__((optimize (0)))
+wminus_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] - pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+
+extern void
+wmult (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] * pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+extern void __attribute__((optimize (0)))
+wmult_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] * pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+
+extern void
+wlshift (uint16_t *d, uint8_t *restrict pix1)
+
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] << 8;
+	pix1 += 16;
+    }
+}
+extern void __attribute__((optimize (0)))
+wlshift_no_opt (uint16_t *d, uint8_t *restrict pix1)
+
+{
+    for( int y = 0; y < 4; y++ )
+    {
+	for( int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] << 8;
+	pix1 += 16;
+    }
+}
+
+void __attribute__((optimize (0)))
+init_arrays(uint16_t *d_a, uint16_t *d_b, uint8_t *pix1, uint8_t *pix2)
+{
+  for(int i = 0; i < DSIZE; i++)
+  {
+    d_a[i] = (1074 * i)%17;
+    d_b[i] = (1074 * i)%17;
+  }
+  for(int i = 0; i < PIXSIZE; i++)
+  {
+    pix1[i] = (1024 * i)%17;
+    pix2[i] = (1024 * i)%17;
+  }
+}
+
+/* Don't optimize main so we don't get confused over where the vector
+   instructions are generated. */
+__attribute__((optimize (0)))
+int main()
+{
+  uint16_t d_a[DSIZE];
+  uint16_t d_b[DSIZE];
+  uint8_t pix1[PIXSIZE];
+  uint8_t pix2[PIXSIZE];
+
+  init_arrays (d_a, d_b, pix1, pix2);
+  wplus(d_a, pix1, pix2);
+  wplus_no_opt(d_b, pix1, pix2);
+  if (memcmp(d_a,d_b, DSIZE) != 0)
+    return 1;
+
+  init_arrays (d_a, d_b, pix1, pix2);
+  wminus(d_a, pix1, pix2);
+  wminus_no_opt(d_b, pix1, pix2);
+  if (memcmp(d_a,d_b, DSIZE) != 0)
+    return 2;
+
+  init_arrays (d_a, d_b, pix1, pix2);
+  wmult(d_a, pix1, pix2);
+  wmult_no_opt(d_b, pix1, pix2);
+  if (memcmp(d_a,d_b, DSIZE) != 0)
+    return 3;
+
+  init_arrays (d_a, d_b, pix1, pix2);
+  wlshift(d_a, pix1);
+  wlshift_no_opt(d_b, pix1);
+  if (memcmp(d_a,d_b, DSIZE) != 0)
+    return 4;
+
+}
+
+/* { dg-final { scan-assembler-times "uaddl\\tv" 2 } } */
+/* { dg-final { scan-assembler-times "usubl\\tv" 2 } } */
+/* { dg-final { scan-assembler-times "umull\\tv" 2 } } */
+/* { dg-final { scan-assembler-times "shl\\tv" 2 } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index f180ced312443ba1e698932d5e8362208690b3fc..b34b00f67ea67943dee7023ab9bfd19c1be5ccbe 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -4545,6 +4545,72 @@  vect_create_vectorized_promotion_stmts (vec_info *vinfo,
   *vec_oprnds0 = vec_tmp;
 }
 
+/* Create vectorized promotion stmts for widening stmts using only half the
+   potential vector size for input.  */
+static void
+vect_create_vectorized_promotion_stmts (vec_info *vinfo,
+					vec<tree> *vec_oprnds0,
+					vec<tree> *vec_oprnds1,
+					stmt_vec_info stmt_info, tree vec_dest,
+					gimple_stmt_iterator *gsi,
+					enum tree_code code1,
+					int op_type)
+{
+  int i;
+  tree vop0, vop1, new_tmp;
+  gimple *new_stmt1;
+  gimple *new_stmt2;
+  gimple *new_stmt3;
+  vec<tree> vec_tmp = vNULL;
+
+  vec_tmp.create (vec_oprnds0->length () * 2);
+  FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
+    {
+      tree new_tmp1, new_tmp2, new_tmp3, out_type;
+
+      gcc_assert (op_type == binary_op);
+      vop1 = (*vec_oprnds1)[i];
+
+      /* Widen the first vector input.  */
+      out_type = TREE_TYPE (vec_dest);
+      new_tmp1 = make_ssa_name (out_type);
+      new_stmt1 = gimple_build_assign (new_tmp1, NOP_EXPR, vop0);
+      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt1, gsi);
+      if (VECTOR_TYPE_P (TREE_TYPE (vop1)))
+	{
+	  /* Widen the second vector input.  */
+	  new_tmp2 = make_ssa_name (out_type);
+	  new_stmt2 = gimple_build_assign (new_tmp2, NOP_EXPR, vop1);
+	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt2, gsi);
+	  /* Perform the operation.  With both vector inputs widened.  */
+	  new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, new_tmp2);
+	}
+      else
+	{
+	  /* Perform the operation.  With the single vector input widened.  */
+	  new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, vop1);
+      }
+
+      new_tmp3 = make_ssa_name (vec_dest, new_stmt3);
+      gimple_assign_set_lhs (new_stmt3, new_tmp3);
+      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt3, gsi);
+      if (is_gimple_call (new_stmt3))
+	{
+	  new_tmp = gimple_call_lhs (new_stmt3);
+	}
+      else
+	{
+	  new_tmp = gimple_assign_lhs (new_stmt3);
+	}
+
+      /* Store the results for the next step.  */
+      vec_tmp.quick_push (new_tmp);
+    }
+
+  vec_oprnds0->release ();
+  *vec_oprnds0 = vec_tmp;
+}
+
 
 /* Check if STMT_INFO performs a conversion operation that can be vectorized.
    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
@@ -4697,7 +4763,13 @@  vectorizable_conversion (vec_info *vinfo,
   nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
   nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
   if (known_eq (nunits_out, nunits_in))
-    modifier = NONE;
+    if (code == WIDEN_MINUS_EXPR
+	|| code == WIDEN_PLUS_EXPR
+	|| code == WIDEN_LSHIFT_EXPR
+	|| code == WIDEN_MULT_EXPR)
+	modifier = WIDEN;
+    else
+      modifier = NONE;
   else if (multiple_p (nunits_out, nunits_in))
     modifier = NARROW;
   else
@@ -4743,9 +4815,21 @@  vectorizable_conversion (vec_info *vinfo,
       return false;
 
     case WIDEN:
-      if (supportable_widening_operation (vinfo, code, stmt_info, vectype_out,
-					  vectype_in, &code1, &code2,
-					  &multi_step_cvt, &interm_types))
+      if (known_eq (nunits_out, nunits_in)
+	  && (code == WIDEN_MINUS_EXPR
+	      || code == WIDEN_LSHIFT_EXPR
+	      || code == WIDEN_PLUS_EXPR
+	      || code == WIDEN_MULT_EXPR)
+	  && supportable_convert_operation (code, vectype_out, vectype_in,
+					    &code1))
+	{
+	  gcc_assert (!(multi_step_cvt && op_type == binary_op));
+	  break;
+	}
+      else if (supportable_widening_operation (vinfo, code, stmt_info,
+					       vectype_out, vectype_in, &code1,
+					       &code2, &multi_step_cvt,
+					       &interm_types))
 	{
 	  /* Binary widening operation can only be supported directly by the
 	     architecture.  */
@@ -4981,10 +5065,20 @@  vectorizable_conversion (vec_info *vinfo,
 	      c1 = codecvt1;
 	      c2 = codecvt2;
 	    }
-	  vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
-						  &vec_oprnds1, stmt_info,
-						  this_dest, gsi,
-						  c1, c2, op_type);
+	  if ((code == WIDEN_MINUS_EXPR
+	       || code == WIDEN_PLUS_EXPR
+	       || code == WIDEN_LSHIFT_EXPR
+	       || code == WIDEN_MULT_EXPR)
+	      && known_eq (nunits_in, nunits_out))
+	    vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
+						    &vec_oprnds1, stmt_info,
+						    this_dest, gsi,
+						    c1, op_type);
+	  else
+	    vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
+						    &vec_oprnds1, stmt_info,
+						    this_dest, gsi,
+						    c1, c2, op_type);
 	}
 
       FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
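
For reference, the half-width widening forms that the test's
scan-assembler patterns expect look like this (register allocation
illustrative only):

      uaddl   v0.8h, v1.8b, v2.8b    // widening add: 8 bytes -> 8 halfwords
      usubl   v0.8h, v1.8b, v2.8b    // widening subtract
      umull   v0.8h, v1.8b, v2.8b    // widening multiply
      shl     v0.8h, v0.8h, #8       // shift applied after the widening conversion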