[RFC] Consider lrotate const rotation in vectorizer

Message ID 32f89c4f-cd2d-a7bd-16d2-26fed6bb5f56@linux.ibm.com
State New
Series [RFC] Consider lrotate const rotation in vectorizer

Commit Message

Kewen.Lin July 16, 2019, 8:45 a.m. UTC
Hi all,

Based on the previous comments (thank you!), I tried to update the
handling in the expander and the vectorizer.  The middle-end always
canonicalizes an lrotate with a constant rotation count to an rrotate,
which makes the vectorizer fail to vectorize when rrotate isn't supported
on the target.  We can at least teach it to handle the constant rotation
count case; the cost should be the same?  At the same time, the expander
already tries the opposite rotation optab for scalars, so we can teach it
to deal with vectors as well.
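
As a concrete illustration (a hand-written sketch, not a testcase from the
patch), this is the kind of loop affected, assuming a target that only
provides a vector rotate-left pattern (e.g. vrlw on Power):

  unsigned int a[1024], b[1024];

  void
  foo (void)
  {
    for (int i = 0; i < 1024; i++)
      /* Recognized as a rotate left by 8; the middle-end canonicalizes it
         to a rotate right by 24, and if the target only advertises vrotl
         the loop is not vectorized today.  */
      b[i] = (a[i] << 8) | (a[i] >> 24);
  }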

Is it on the right track and reasonable?


Thanks,
Kewen

--------------

One informal patch to help describe this new idea is included below (see the Patch section).

Comments

Richard Biener July 17, 2019, 10:37 a.m. UTC | #1
On Tue, Jul 16, 2019 at 10:45 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> Hi all,
>
> Based on the previous comments (thank you!), I tried to update the
> handling in expander and vectorizer.  Middle-end optimizes lrotate
> with const rotation count to rrotate all the time, it makes vectorizer
> fail to vectorize if rrotate isn't supported on the target.  We can at
> least teach it on const rotation count, the cost should be the same?
> At the same time, the expander already tries to use the opposite
> rotation optable for scalar, we can teach it to deal with vector as well.
>
> Is it on the right track and reasonable?

So you're basically fixing this up in the expander.  I think on
the GIMPLE level you then still need to update tree-vect-generic.c?

I'm not sure if it makes sense to have both LROTATE_EXPR and
RROTATE_EXPR on the GIMPLE level then (that CPUs only
support one direction is natural though).  So maybe simply get
rid of one?  Its semantics are also nowhere documented
(do we allow negative rotation amounts?  how are
non-mode-precision entities rotated? etc.).

Richard.

>
> Thanks,
> Kewen
>
> --------------
>
> One informal patch to help describing this new thought:
>
>
> diff --git a/gcc/optabs.c b/gcc/optabs.c
> index a0e361b8bfe..ebebb0ad145 100644
> --- a/gcc/optabs.c
> +++ b/gcc/optabs.c
> @@ -1273,6 +1273,7 @@ expand_binop (machine_mode mode, optab binoptab, rtx op0, rtx op1,
>    if (mclass == MODE_VECTOR_INT)
>      {
>        optab otheroptab = unknown_optab;
> +      optab otheroptab1 = unknown_optab;
>
>        if (binoptab == ashl_optab)
>         otheroptab = vashl_optab;
> @@ -1281,23 +1282,50 @@ expand_binop (machine_mode mode, optab binoptab, rtx op0, rtx op1,
>        else if (binoptab == lshr_optab)
>         otheroptab = vlshr_optab;
>        else if (binoptab == rotl_optab)
> -       otheroptab = vrotl_optab;
> +       {
> +         otheroptab = vrotl_optab;
> +         otheroptab1 = vrotr_optab;
> +       }
>        else if (binoptab == rotr_optab)
> -       otheroptab = vrotr_optab;
> +       {
> +         otheroptab = vrotr_optab;
> +         otheroptab1 = vrotl_optab;
> +       }
> +
> +      bool other_ok = (otheroptab && (icode = optab_handler (otheroptab, mode)) != CODE_FOR_nothing);
> +      bool other1_ok = false;
> +      if (!other_ok && otheroptab1)
> +       other1_ok
> +         = ((icode = optab_handler (otheroptab1, mode)) != CODE_FOR_nothing)
> +           && SCALAR_INT_MODE_P (GET_MODE_INNER (mode));
>
> -      if (otheroptab
> -         && (icode = optab_handler (otheroptab, mode)) != CODE_FOR_nothing)
> +      if (other_ok || other1_ok)
>         {
>           /* The scalar may have been extended to be too wide.  Truncate
>              it back to the proper size to fit in the broadcast vector.  */
>           scalar_mode inner_mode = GET_MODE_INNER (mode);
> -         if (!CONST_INT_P (op1)
> -             && (GET_MODE_BITSIZE (as_a <scalar_int_mode> (GET_MODE (op1)))
> +         rtx newop1 = op1;
> +         if (other1_ok)
> +           {
> +             unsigned int bits = GET_MODE_PRECISION (inner_mode);
> +
> +             if (CONST_INT_P (op1))
> +               newop1 = gen_int_shift_amount (int_mode, bits - INTVAL (op1));
> +             else if (targetm.shift_truncation_mask (int_mode) == bits - 1)
> +               newop1 = negate_rtx (GET_MODE (op1), op1);
> +             else
> +               newop1 = expand_binop (GET_MODE (op1), sub_optab,
> +                                      gen_int_mode (bits, GET_MODE (op1)), op1,
> +                                      NULL_RTX, unsignedp, OPTAB_DIRECT);
> +           }
> +         if (!CONST_INT_P (newop1)
> +             && (GET_MODE_BITSIZE (as_a<scalar_int_mode> (GET_MODE (newop1)))
>                   > GET_MODE_BITSIZE (inner_mode)))
> -           op1 = force_reg (inner_mode,
> -                            simplify_gen_unary (TRUNCATE, inner_mode, op1,
> -                                                GET_MODE (op1)));
> -         rtx vop1 = expand_vector_broadcast (mode, op1);
> +           newop1 = force_reg (inner_mode,
> +                               simplify_gen_unary (TRUNCATE, inner_mode,
> +                                                   newop1, GET_MODE (newop1)));
> +
> +         rtx vop1 = expand_vector_broadcast (mode, newop1);
>           if (vop1)
>             {
>               temp = expand_binop_directly (icode, mode, otheroptab, op0, vop1,
>
> diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
> index ff952d6f464..c05ce1acba4 100644
> --- a/gcc/tree-vect-patterns.c
> +++ b/gcc/tree-vect-patterns.c
> @@ -2039,6 +2039,15 @@ vect_recog_rotate_pattern (stmt_vec_info stmt_vinfo, tree *type_out)
>    if (optab1
>        && optab_handler (optab1, TYPE_MODE (vectype)) != CODE_FOR_nothing)
>      return NULL;
> +  /* middle-end canonicalizing LROTATE to RROTATE with const rotation count,
> +     let's try the LROTATE as well.  */
> +  if (rhs_code == RROTATE_EXPR && TREE_CODE(oprnd1) == INTEGER_CST)
> +    {
> +      optab1 = optab_for_tree_code (LROTATE_EXPR, vectype, optab_vector);
> +      if (optab1
> +         && optab_handler (optab1, TYPE_MODE (vectype)) != CODE_FOR_nothing)
> +       return NULL;
> +    }
>
>    if (is_a <bb_vec_info> (vinfo) || dt != vect_internal_def)
>      {
> @@ -2046,6 +2055,14 @@ vect_recog_rotate_pattern (stmt_vec_info stmt_vinfo, tree *type_out)
>        if (optab2
>           && optab_handler (optab2, TYPE_MODE (vectype)) != CODE_FOR_nothing)
>         return NULL;
> +      if (rhs_code == RROTATE_EXPR && TREE_CODE(oprnd1) == INTEGER_CST)
> +       {
> +         optab2 = optab_for_tree_code (LROTATE_EXPR, vectype, optab_scalar);
> +         if (optab2
> +             && optab_handler (optab2, TYPE_MODE (vectype))
> +                  != CODE_FOR_nothing)
> +           return NULL;
> +       }
>      }
>
>    /* If vector/vector or vector/scalar shifts aren't supported by the target,
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 21046931243..9e1a2f971a1 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -5665,6 +5665,12 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
>    if (!scalar_shift_arg)
>      {
>        optab = optab_for_tree_code (code, vectype, optab_vector);
> +
> +      if (TREE_CODE (op1) == INTEGER_CST && code == RROTATE_EXPR
> +         && !(optab
> +              && optab_handler (optab, TYPE_MODE (vectype))
> +                   != CODE_FOR_nothing))
> +       optab = optab_for_tree_code (LROTATE_EXPR, vectype, optab_vector);
>        if (dump_enabled_p ())
>          dump_printf_loc (MSG_NOTE, vect_location,
>                           "vector/vector shift/rotate found.\n");
> @@ -5696,7 +5702,10 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
>        else
>          {
>            optab = optab_for_tree_code (code, vectype, optab_vector);
> -          if (optab
> +         if (TREE_CODE (op1) == INTEGER_CST && code == RROTATE_EXPR
> +             && !(optab && optab_handler (optab, TYPE_MODE (vectype))))
> +           optab = optab_for_tree_code (LROTATE_EXPR, vectype, optab_vector);
> +         if (optab
>                 && (optab_handler (optab, TYPE_MODE (vectype))
>                        != CODE_FOR_nothing))
>              {
>
Jakub Jelinek July 17, 2019, 10:54 a.m. UTC | #2
On Wed, Jul 17, 2019 at 12:37:59PM +0200, Richard Biener wrote:
> On Tue, Jul 16, 2019 at 10:45 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
> > Based on the previous comments (thank you!), I tried to update the
> > handling in expander and vectorizer.  Middle-end optimizes lrotate
> > with const rotation count to rrotate all the time, it makes vectorizer
> > fail to vectorize if rrotate isn't supported on the target.  We can at
> > least teach it on const rotation count, the cost should be the same?
> > At the same time, the expander already tries to use the opposite
> > rotation optable for scalar, we can teach it to deal with vector as well.
> >
> > Is it on the right track and reasonable?
> 
> So you're basically fixing this up in the expander.  I think on
> the GIMPLE level you then miss to update tree-vect-generic.c?
> 
> I'm not sure if it makes sense to have both LROTATE_EXPR and
> RROTATE_EXPR on the GIMPLE level then (that CPUs only
> support one direction is natural though).  So maybe simply get
> rid of one?  Its semantics are also nowhere documented

A lot of targets support both, and I think not all targets do the
truncation, so at least with non-constant rotate count emitting one over the
other is important and trying to match it up only during combine might be
too late and not work well in many cases.
Then there are some targets that only support left rotates and not right
rotates (rs6000, s390, tilegx, ...), and other targets that only support
right rotates (mips, iq2000, ...).
So only having one GIMPLE code doesn't seem to be good enough.

I think handling it during expansion in generic code is fine, especially
when we clearly have several targets that do support only one of the
rotates.  As you wrote, it needs corresponding code in tree-vect-generic.c,
and shouldn't hardcode the rs6000 direction of mapping rotr to rotl, but
support also the other direction - rotl to rotr.  For the sake of
!SHIFT_COUNT_TRUNCATED targets for constant shift counts it needs to do
negation + masking and for variable shift counts probably punt and let the
backend code handle it if it can do the truncation in there?
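
For the constant-count case the mapping is just arithmetic on the count;
roughly, in plain C for 32-bit elements (a sketch, not code from the patch;
the helper names are made up):

  /* rotl (x, c) == rotr (x, (32 - c) & 31) for any constant 0 <= c < 32,
     so the flipped count is again an in-range constant and no run-time
     masking is needed.  Only the variable-count case is tricky: -n relies
     on the target truncating the rotate count, and 32 - n gives a
     full-width rotate for n == 0.  */
  unsigned int
  rotr32 (unsigned int x, unsigned int n)
  {
    return (x >> n) | (x << ((32 - n) & 31));
  }

  unsigned int
  rotl32_by_5 (unsigned int x)
  {
    return rotr32 (x, (32 - 5) & 31);   /* i.e. rotate right by 27 */
  }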

	Jakub
Richard Biener July 17, 2019, 11:32 a.m. UTC | #3
On Wed, Jul 17, 2019 at 12:54 PM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Wed, Jul 17, 2019 at 12:37:59PM +0200, Richard Biener wrote:
> > On Tue, Jul 16, 2019 at 10:45 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
> > > Based on the previous comments (thank you!), I tried to update the
> > > handling in expander and vectorizer.  Middle-end optimizes lrotate
> > > with const rotation count to rrotate all the time, it makes vectorizer
> > > fail to vectorize if rrotate isn't supported on the target.  We can at
> > > least teach it on const rotation count, the cost should be the same?
> > > At the same time, the expander already tries to use the opposite
> > > rotation optable for scalar, we can teach it to deal with vector as well.
> > >
> > > Is it on the right track and reasonable?
> >
> > So you're basically fixing this up in the expander.  I think on
> > the GIMPLE level you then miss to update tree-vect-generic.c?
> >
> > I'm not sure if it makes sense to have both LROTATE_EXPR and
> > RROTATE_EXPR on the GIMPLE level then (that CPUs only
> > support one direction is natural though).  So maybe simply get
> > rid of one?  Its semantics are also nowhere documented
>
> A lot of targets support both, and I think not all targets do the
> truncation, so at least with non-constant rotate count emitting one over the
> other is important and trying to match it up only during combine might be
> too late and not work well in many cases.
> Then there are some targets that only support left rotates and not right
> rotates (rs6000, s390, tilegx, ...), and other targets that only support
> right rotates (mips, iq2000, ...).
> So only having one GIMPLE code doesn't seem to be good enough.

It seems for constants it is by means of canonicalization.  The lack
of canonicalization for non-constants then makes us fail to CSE
lrotate<x, a> and rrotate<x, width-a>.  Given rotates are only
detected on GIMPLE, always creating one or the other should be
reasonably easy, and fixup during expansion can happen either via TER
or via pre-expand generation of the corresponding optab IFNs?

Might get tricky if we face width - (a + 5) so the pattern matching
of an opposing direction rotate gets harder.

> I think handling it during expansion in generic code is fine, especially
> when we clearly have several targets that do support only one of the
> rotates.

Yes.

> As you wrote, it needs corresponding code in tree-vect-generic.c,
> and shouldn't hardcode the rs6000 direction of mapping rotr to rotl, but
> support also the other direction - rotl to rotr.  For the sake of
> !SHIFT_COUNT_TRUNCATED targets for constant shift counts it needs to do
> negation + masking and for variable shift counts probably punt and let the
> backend code handle it if it can do the truncation in there?

Ick.  I wouldn't even touch SHIFT_COUNT_TRUNCATED with a 10-foot pole
here.  And for rotates we can simply always truncate constant amounts to
the rotated operand's width, no?  For non-constant ones I fear targets
would need to support both to get reliable expansion.

Richard.

>         Jakub
Richard Biener July 17, 2019, 11:35 a.m. UTC | #4
On Wed, Jul 17, 2019 at 1:32 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, Jul 17, 2019 at 12:54 PM Jakub Jelinek <jakub@redhat.com> wrote:
> >
> > On Wed, Jul 17, 2019 at 12:37:59PM +0200, Richard Biener wrote:
> > > On Tue, Jul 16, 2019 at 10:45 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
> > > > Based on the previous comments (thank you!), I tried to update the
> > > > handling in expander and vectorizer.  Middle-end optimizes lrotate
> > > > with const rotation count to rrotate all the time, it makes vectorizer
> > > > fail to vectorize if rrotate isn't supported on the target.  We can at
> > > > least teach it on const rotation count, the cost should be the same?
> > > > At the same time, the expander already tries to use the opposite
> > > > rotation optable for scalar, we can teach it to deal with vector as well.
> > > >
> > > > Is it on the right track and reasonable?
> > >
> > > So you're basically fixing this up in the expander.  I think on
> > > the GIMPLE level you then miss to update tree-vect-generic.c?
> > >
> > > I'm not sure if it makes sense to have both LROTATE_EXPR and
> > > RROTATE_EXPR on the GIMPLE level then (that CPUs only
> > > support one direction is natural though).  So maybe simply get
> > > rid of one?  Its semantics are also nowhere documented
> >
> > A lot of targets support both, and I think not all targets do the
> > truncation, so at least with non-constant rotate count emitting one over the
> > other is important and trying to match it up only during combine might be
> > too late and not work well in many cases.
> > Then there are some targets that only support left rotates and not right
> > rotates (rs6000, s390, tilegx, ...), and other targets that only support
> > right rotates (mips, iq2000, ...).
> > So only having one GIMPLE code doesn't seem to be good enough.
>
> It seems for constants it is by means of canonicalization.  The lack
> of canonicalization for non-constants then makes us fail to CSE
> lrotate<x, a> and rrotate<x, width-a>.  Given rotates are only
> detected on GIMPLE always creating one or the other should be
> reasonably easy and fixup during expansion can happen either via TER
> or via pre-expand generation of optab corresponding IFNs?
>
> Might get tricky if we face width - (a + 5) so the pattern matching
> of an opposing direction rotate gets harder.
>
> > I think handling it during expansion in generic code is fine, especially
> > when we clearly have several targets that do support only one of the
> > rotates.
>
> Yes.
>
> > As you wrote, it needs corresponding code in tree-vect-generic.c,
> > and shouldn't hardcode the rs6000 direction of mapping rotr to rotl, but
> > support also the other direction - rotl to rotr.  For the sake of
> > !SHIFT_COUNT_TRUNCATED targets for constant shift counts it needs to do
> > negation + masking and for variable shift counts probably punt and let the
> > backend code handle it if it can do the truncation in there?
>
> Ick.  I wouldn't even touch SHIFT_COUNT_TRUNCATED with a 10-foot pole
> here.  And for rotates we can simply always truncate constant amounts to
> the rotated operands width, no?  For non-constant ones I fear targets
> would need to support both to get reliable expansion.

Btw, the docs of SHIFT_COUNT_TRUNCATED do not mention rotates
unless you treat a rotate as a shift.

Richard.

> Richard.
>
> >         Jakub
Segher Boessenkool July 17, 2019, 1:48 p.m. UTC | #5
On Wed, Jul 17, 2019 at 01:32:50PM +0200, Richard Biener wrote:
> On Wed, Jul 17, 2019 at 12:54 PM Jakub Jelinek <jakub@redhat.com> wrote:
> Ick.  I wouldn't even touch SHIFT_COUNT_TRUNCATED with a 10-foot pole
> here.  And for rotates we can simply always truncate constant amounts to
> the rotated operands width, no?  For non-constant ones I fear targets
> would need to support both to get reliable expansion.

SHIFT_COUNT_TRUNCATED has no meaning for rotate instructions, as far as
I can see.  Mathematically it doesn't, and are there any CPUs that fail
for it for no reason?  Any archs where some rotate amounts give
undefined results?  So if this is true, SHIFT_COUNT_TRUNCATED can be
treated as always *on* for rotates.
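
In other words (a quick C sketch of the identity, nothing target-specific):

  /* Rotating a 32-bit value by k and by k mod 32 gives the same result,
     since a rotate by 32 is the identity; truncating the count therefore
     never changes the value, unlike for shifts.  */
  unsigned int
  rotl32 (unsigned int x, unsigned int k)
  {
    k &= 31;
    return (x << k) | (x >> ((32 - k) & 31));
  }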


Segher
Segher Boessenkool July 17, 2019, 5 p.m. UTC | #6
On Wed, Jul 17, 2019 at 12:54:32PM +0200, Jakub Jelinek wrote:
> On Wed, Jul 17, 2019 at 12:37:59PM +0200, Richard Biener wrote:
> > I'm not sure if it makes sense to have both LROTATE_EXPR and
> > RROTATE_EXPR on the GIMPLE level then (that CPUs only
> > support one direction is natural though).  So maybe simply get
> > rid of one?  Its semantics are also nowhere documented
> 
> A lot of targets support both,

Of all the linux targets, we have:

No rotate:
  alpha microblaze riscv sparc

Both directions:
  aarch64 c6x ia64 m68k nios2 parisc sh x86 xtensa

Left only:
  csky h8300 powerpc s390

Right only:
  arc arm mips nds32 openrisc

> Then there are some targets that only support left rotates and not right
> rotates (rs6000, s390, tilegx, ...), and other targets that only support
> right rotates (mips, iq2000, ...).
> So only having one GIMPLE code doesn't seem to be good enough.
> 
> I think handling it during expansion in generic code is fine, especially
> when we clearly have several targets that do support only one of the
> rotates.  As you wrote, it needs corresponding code in tree-vect-generic.c,
> and shouldn't hardcode the rs6000 direction of mapping rotr to rotl, but
> support also the other direction - rotl to rotr.  For the sake of
> !SHIFT_COUNT_TRUNCATED targets for constant shift counts it needs to do
> negation + masking and for variable shift counts probably punt and let the
> backend code handle it if it can do the truncation in there?

I think we can say that *all* targets behave like SHIFT_COUNT_TRUNCATED
for rotates?  Not all immediates are valid of course, but that is a
separate issue.


Segher
Kewen.Lin July 18, 2019, 6:05 a.m. UTC | #7
On 2019/7/17 6:37 PM, Richard Biener wrote:
> On Tue, Jul 16, 2019 at 10:45 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>
>> Hi all,
>>
>> Based on the previous comments (thank you!), I tried to update the
>> handling in expander and vectorizer.  Middle-end optimizes lrotate
>> with const rotation count to rrotate all the time, it makes vectorizer
>> fail to vectorize if rrotate isn't supported on the target.  We can at
>> least teach it on const rotation count, the cost should be the same?
>> At the same time, the expander already tries to use the opposite
>> rotation optable for scalar, we can teach it to deal with vector as well.
>>
>> Is it on the right track and reasonable?
> 
> So you're basically fixing this up in the expander.  I think on
> the GIMPLE level you then miss to update tree-vect-generic.c?
> 

Thanks, I will update it.  Another question on variable rotation
counts: where is the best place to add the additional cost in the
vectorizer (for the negate + possible maskgen/and)?  Or, to avoid this,
should we transform the stmt into several stmts with the opposite
direction before the vectorizer runs?

> I'm not sure if it makes sense to have both LROTATE_EXPR and
> RROTATE_EXPR on the GIMPLE level then (that CPUs only
> support one direction is natural though).  So maybe simply get
> rid of one?  

One maybe-impractical idea: have a single ROTATE_EXPR to unify them,
and use a positive or negative count for the direction?

> Its semantics are also nowhere documented
> (do we allow negative rotation amounts?  how are
> non-mode-precision entities rotated? etc.).
> 

I think a negative rotation amount is OK; I'm not sure about
non-mode-precision entities.  It's a good point that we should guard
against that when we'd like to use the opposite direction.


Thanks,
Kewen
Jakub Jelinek July 18, 2019, 7:01 a.m. UTC | #8
On Wed, Jul 17, 2019 at 12:00:32PM -0500, Segher Boessenkool wrote:
> I think we can say that *all* targets behave like SHIFT_COUNT_TRUNCATED
> for rotates?  Not all immediates are valid of course, but that is a
> separate issue.

Well, we'd need to double check all the hw rotate instructions on all the
targets to be sure.
As for the current GCC code, SHIFT_COUNT_TRUNCATED value is used even for
rotates at least in combine.c, expmed.c and simplify-rtx.c and in
optabs.c through targetm.shift_truncation_mask, but e.g. in cse.c it is used
only for shifts and not rotates.

And speaking of optabs.c, it already has code to emit the other rotate
if one rotate isn't supported, the:
  /* If we were trying to rotate, and that didn't work, try rotating
     the other direction before falling back to shifts and bitwise-or.  */
  if (((binoptab == rotl_optab
        && (icode = optab_handler (rotr_optab, mode)) != CODE_FOR_nothing)
       || (binoptab == rotr_optab
           && (icode = optab_handler (rotl_optab, mode)) != CODE_FOR_nothing))
      && is_int_mode (mode, &int_mode))
    {
      optab otheroptab = (binoptab == rotl_optab ? rotr_optab : rotl_optab);
hunk in there, just it is limited to scalar rotates ATM rather than vector
ones through is_int_mode.  So I bet the problem with the vector shifts is just that
tree-vect-generic.c support isn't there.  And the above mentioned code
actually emits the
        newop1 = expand_binop (GET_MODE (op1), sub_optab,
                               gen_int_mode (bits, GET_MODE (op1)), op1,
                               NULL_RTX, unsignedp, OPTAB_DIRECT);
as the fallback, rather than masking of negation with some mask, so if there
is some target that doesn't truncate the rotate count, it will be
problematic with variable rotate by 0.  And note that the other rotate
code explicitly uses targetm.shift_truncation_mask.
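
To spell out the difference between the two fallbacks (a sketch with
hypothetical helper names, 32-bit elements assumed):

  /* With count truncation the negated count works for every n, including
     n == 0 (-0 & 31 == 0).  Without truncation the fallback computes
     32 - n, and n == 0 then requests a rotate by 32, which is only correct
     if the target truncates the count after all.  */
  unsigned int count_if_truncating (unsigned int n)  { return -n & 31; }
  unsigned int count_otherwise (unsigned int n)      { return 32 - n; }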

	Jakub
Richard Earnshaw (lists) July 18, 2019, 3:12 p.m. UTC | #9
On 17/07/2019 18:00, Segher Boessenkool wrote:
> On Wed, Jul 17, 2019 at 12:54:32PM +0200, Jakub Jelinek wrote:
>> On Wed, Jul 17, 2019 at 12:37:59PM +0200, Richard Biener wrote:
>>> I'm not sure if it makes sense to have both LROTATE_EXPR and
>>> RROTATE_EXPR on the GIMPLE level then (that CPUs only
>>> support one direction is natural though).  So maybe simply get
>>> rid of one?  Its semantics are also nowhere documented
>>
>> A lot of targets support both,
> 
> Of all the linux targets, we have:
> 
> No rotate:
>    alpha microblaze riscv sparc
> 
> Both directions:
>    aarch64 c6x ia64 m68k nios2 parisc sh x86 xtensa

AArch64 is Right only.

R.

> 
> Left only:
>    csky h8300 powerpc s390
> 
> Right only:
>    arc arm mips nds32 openrisc
> 
>> Then there are some targets that only support left rotates and not right
>> rotates (rs6000, s390, tilegx, ...), and other targets that only support
>> right rotates (mips, iq2000, ...).
>> So only having one GIMPLE code doesn't seem to be good enough.
>>
>> I think handling it during expansion in generic code is fine, especially
>> when we clearly have several targets that do support only one of the
>> rotates.  As you wrote, it needs corresponding code in tree-vect-generic.c,
>> and shouldn't hardcode the rs6000 direction of mapping rotr to rotl, but
>> support also the other direction - rotl to rotr.  For the sake of
>> !SHIFT_COUNT_TRUNCATED targets for constant shift counts it needs to do
>> negation + masking and for variable shift counts probably punt and let the
>> backend code handle it if it can do the truncation in there?
> 
> I think we can say that *all* targets behave like SHIFT_COUNT_TRUNCATED
> for rotates?  Not all immediates are valid of course, but that is a
> separate issue.
> 
> 
> Segher
>
Jakub Jelinek July 18, 2019, 3:17 p.m. UTC | #10
On Thu, Jul 18, 2019 at 04:12:48PM +0100, Richard Earnshaw (lists) wrote:
> > Both directions:
> >    aarch64 c6x ia64 m68k nios2 parisc sh x86 xtensa
> 
> AArch64 is Right only.

Maybe hw-wise, but it has both rotr<mode>3 and rotl<mode>3 expanders.
At least for GPRs.

	Jakub
Richard Earnshaw (lists) July 18, 2019, 3:26 p.m. UTC | #11
On 18/07/2019 16:17, Jakub Jelinek wrote:
> On Thu, Jul 18, 2019 at 04:12:48PM +0100, Richard Earnshaw (lists) wrote:
>>> Both directions:
>>>     aarch64 c6x ia64 m68k nios2 parisc sh x86 xtensa
>>
>> AArch64 is Right only.
> 
> Maybe hw-wise, but it has both rotr<mode>3 and rotl<mode>3 expanders.
> At least for GPRs.
> 
> 	Jakub
> 

Only for immediates.  And the patterns that support that just write out 
assembly as "ror (<size> - n)".

R.
Jakub Jelinek July 18, 2019, 3:30 p.m. UTC | #12
On Thu, Jul 18, 2019 at 04:26:26PM +0100, Richard Earnshaw (lists) wrote:
> 
> 
> On 18/07/2019 16:17, Jakub Jelinek wrote:
> > On Thu, Jul 18, 2019 at 04:12:48PM +0100, Richard Earnshaw (lists) wrote:
> > > > Both directions:
> > > >     aarch64 c6x ia64 m68k nios2 parisc sh x86 xtensa
> > > 
> > > AArch64 is Right only.
> > 
> > Maybe hw-wise, but it has both rotr<mode>3 and rotl<mode>3 expanders.
> > At least for GPRs.
> > 
> > 	Jakub
> > 
> 
> Only for immediates.  And the patterns that support that just write out
> assembly as "ror (<size> - n)".

For registers too (for those it negates and uses rotr).
Note, the middle-end ought to be able to do the same thing already,
except if not SHIFT_COUNT_TRUNCATED it will use bits - count instead of
-count.
(define_expand "rotl<mode>3"
  [(set (match_operand:GPI 0 "register_operand")
        (rotatert:GPI (match_operand:GPI 1 "register_operand")
                      (match_operand:QI 2 "aarch64_reg_or_imm")))]
  ""
  {
    /* (SZ - cnt) % SZ == -cnt % SZ */
    if (CONST_INT_P (operands[2]))
      {
        operands[2] = GEN_INT ((-INTVAL (operands[2]))
                               & (GET_MODE_BITSIZE (<MODE>mode) - 1));
        if (operands[2] == const0_rtx)
          {
            emit_insn (gen_mov<mode> (operands[0], operands[1]));
            DONE;
          }
      }
    else
      operands[2] = expand_simple_unop (QImode, NEG, operands[2],
                                        NULL_RTX, 1);
  }
)

	Jakub
Richard Earnshaw (lists) July 18, 2019, 3:35 p.m. UTC | #13
On 18/07/2019 16:30, Jakub Jelinek wrote:
> On Thu, Jul 18, 2019 at 04:26:26PM +0100, Richard Earnshaw (lists) wrote:
>>
>>
>> On 18/07/2019 16:17, Jakub Jelinek wrote:
>>> On Thu, Jul 18, 2019 at 04:12:48PM +0100, Richard Earnshaw (lists) wrote:
>>>>> Both directions:
>>>>>      aarch64 c6x ia64 m68k nios2 parisc sh x86 xtensa
>>>>
>>>> AArch64 is Right only.
>>>
>>> Maybe hw-wise, but it has both rotr<mode>3 and rotl<mode>3 expanders.
>>> At least for GPRs.
>>>
>>> 	Jakub
>>>
>>
>> Only for immediates.  And the patterns that support that just write out
>> assembly as "ror (<size> - n)".
> 
> For registers too (for those it negates and uses rotr).
> Note, the middle-end ought to be able to do the same thing already,
> except if not SHIFT_COUNT_TRUNCATED it will use bits - count instead of
> -count.
> (define_expand "rotl<mode>3"
>    [(set (match_operand:GPI 0 "register_operand")
>          (rotatert:GPI (match_operand:GPI 1 "register_operand")
>                        (match_operand:QI 2 "aarch64_reg_or_imm")))]
>    ""
>    {
>      /* (SZ - cnt) % SZ == -cnt % SZ */
>      if (CONST_INT_P (operands[2]))
>        {
>          operands[2] = GEN_INT ((-INTVAL (operands[2]))
>                                 & (GET_MODE_BITSIZE (<MODE>mode) - 1));
>          if (operands[2] == const0_rtx)
>            {
>              emit_insn (gen_mov<mode> (operands[0], operands[1]));
>              DONE;
>            }
>        }
>      else
>        operands[2] = expand_simple_unop (QImode, NEG, operands[2],
>                                          NULL_RTX, 1);
>    }
> )
> 
> 	Jakub
> 

Well Arm has that sort of expansion as well, but it was listed as 'right 
only'.

R.
Segher Boessenkool July 18, 2019, 5:28 p.m. UTC | #14
On Thu, Jul 18, 2019 at 04:12:48PM +0100, Richard Earnshaw (lists) wrote:
> 
> 
> On 17/07/2019 18:00, Segher Boessenkool wrote:
> >On Wed, Jul 17, 2019 at 12:54:32PM +0200, Jakub Jelinek wrote:
> >>On Wed, Jul 17, 2019 at 12:37:59PM +0200, Richard Biener wrote:
> >>>I'm not sure if it makes sense to have both LROTATE_EXPR and
> >>>RROTATE_EXPR on the GIMPLE level then (that CPUs only
> >>>support one direction is natural though).  So maybe simply get
> >>>rid of one?  Its semantics are also nowhere documented
> >>
> >>A lot of targets support both,
> >
> >Of all the linux targets, we have:
> >
> >No rotate:
> >   alpha microblaze riscv sparc
> >
> >Both directions:
> >   aarch64 c6x ia64 m68k nios2 parisc sh x86 xtensa
> 
> AArch64 is Right only.

This is whether a port has any rotate resp. rotatert in recog, which
generates HAVE_rotate and HAVE_rotatert in insn-config.h.

If the hardware can only do right rotates you should either handle
immediate left rotates everywhere as well, or avoid using left rotates
in define_expand and define_insn.


Segher
Segher Boessenkool July 18, 2019, 7:43 p.m. UTC | #15
On Thu, Jul 18, 2019 at 09:01:13AM +0200, Jakub Jelinek wrote:
> On Wed, Jul 17, 2019 at 12:00:32PM -0500, Segher Boessenkool wrote:
> > I think we can say that *all* targets behave like SHIFT_COUNT_TRUNCATED
> > for rotates?  Not all immediates are valid of course, but that is a
> > separate issue.
> 
> Well, we'd need to double check all the hw rotate instructions on all the
> targets to be sure.

Yes :-(

> As for the current GCC code, SHIFT_COUNT_TRUNCATED value is used even for
> rotates at least in combine.c, expmed.c and simplify-rtx.c and in
> optabs.c through targetm.shift_truncation_mask, but e.g. in cse.c is used
> only for shifts and not rotates.

Many targets cannot use SHIFT_COUNT_TRUNCATED, and not even
targetm.shift_truncation_mask, because the actual mask depends on the
insn used, or rtl code used at least, not just the mode.

[snip]
> hunk in there, just it is limited to scalar rotates ATM rather than vector
> ones through is_int_mode.  So I bet the problem with the vector shifts is just that
> tree-vect-generic.c support isn't there.

:-)

Should we always allow both directions in gimple, and pretend both are
cheap?  Should we allow only one direction, and let the target select
which, or both?


Segher

Patch

diff --git a/gcc/optabs.c b/gcc/optabs.c
index a0e361b8bfe..ebebb0ad145 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -1273,6 +1273,7 @@  expand_binop (machine_mode mode, optab binoptab, rtx op0, rtx op1,
   if (mclass == MODE_VECTOR_INT)
     {
       optab otheroptab = unknown_optab;
+      optab otheroptab1 = unknown_optab;

       if (binoptab == ashl_optab)
        otheroptab = vashl_optab;
@@ -1281,23 +1282,50 @@  expand_binop (machine_mode mode, optab binoptab, rtx op0, rtx op1,
       else if (binoptab == lshr_optab)
        otheroptab = vlshr_optab;
       else if (binoptab == rotl_optab)
-       otheroptab = vrotl_optab;
+       {
+         otheroptab = vrotl_optab;
+         otheroptab1 = vrotr_optab;
+       }
       else if (binoptab == rotr_optab)
-       otheroptab = vrotr_optab;
+       {
+         otheroptab = vrotr_optab;
+         otheroptab1 = vrotl_optab;
+       }
+
+      bool other_ok = (otheroptab && (icode = optab_handler (otheroptab, mode)) != CODE_FOR_nothing);
+      bool other1_ok = false;
+      if (!other_ok && otheroptab1)
+       other1_ok
+         = ((icode = optab_handler (otheroptab1, mode)) != CODE_FOR_nothing)
+           && SCALAR_INT_MODE_P (GET_MODE_INNER (mode));

-      if (otheroptab
-         && (icode = optab_handler (otheroptab, mode)) != CODE_FOR_nothing)
+      if (other_ok || other1_ok)
        {
          /* The scalar may have been extended to be too wide.  Truncate
             it back to the proper size to fit in the broadcast vector.  */
          scalar_mode inner_mode = GET_MODE_INNER (mode);
-         if (!CONST_INT_P (op1)
-             && (GET_MODE_BITSIZE (as_a <scalar_int_mode> (GET_MODE (op1)))
+         rtx newop1 = op1;
+         if (other1_ok)
+           {
+             unsigned int bits = GET_MODE_PRECISION (inner_mode);
+
+             if (CONST_INT_P (op1))
+               newop1 = gen_int_shift_amount (int_mode, bits - INTVAL (op1));
+             else if (targetm.shift_truncation_mask (int_mode) == bits - 1)
+               newop1 = negate_rtx (GET_MODE (op1), op1);
+             else
+               newop1 = expand_binop (GET_MODE (op1), sub_optab,
+                                      gen_int_mode (bits, GET_MODE (op1)), op1,
+                                      NULL_RTX, unsignedp, OPTAB_DIRECT);
+           }
+         if (!CONST_INT_P (newop1)
+             && (GET_MODE_BITSIZE (as_a<scalar_int_mode> (GET_MODE (newop1)))
                  > GET_MODE_BITSIZE (inner_mode)))
-           op1 = force_reg (inner_mode,
-                            simplify_gen_unary (TRUNCATE, inner_mode, op1,
-                                                GET_MODE (op1)));
-         rtx vop1 = expand_vector_broadcast (mode, op1);
+           newop1 = force_reg (inner_mode,
+                               simplify_gen_unary (TRUNCATE, inner_mode,
+                                                   newop1, GET_MODE (newop1)));
+
+         rtx vop1 = expand_vector_broadcast (mode, newop1);
          if (vop1)
            {
              temp = expand_binop_directly (icode, mode, otheroptab, op0, vop1,

diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index ff952d6f464..c05ce1acba4 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -2039,6 +2039,15 @@  vect_recog_rotate_pattern (stmt_vec_info stmt_vinfo, tree *type_out)
   if (optab1
       && optab_handler (optab1, TYPE_MODE (vectype)) != CODE_FOR_nothing)
     return NULL;
+  /* middle-end canonicalizing LROTATE to RROTATE with const rotation count,
+     let's try the LROTATE as well.  */
+  if (rhs_code == RROTATE_EXPR && TREE_CODE(oprnd1) == INTEGER_CST)
+    {
+      optab1 = optab_for_tree_code (LROTATE_EXPR, vectype, optab_vector);
+      if (optab1
+         && optab_handler (optab1, TYPE_MODE (vectype)) != CODE_FOR_nothing)
+       return NULL;
+    }

   if (is_a <bb_vec_info> (vinfo) || dt != vect_internal_def)
     {
@@ -2046,6 +2055,14 @@  vect_recog_rotate_pattern (stmt_vec_info stmt_vinfo, tree *type_out)
       if (optab2
          && optab_handler (optab2, TYPE_MODE (vectype)) != CODE_FOR_nothing)
        return NULL;
+      if (rhs_code == RROTATE_EXPR && TREE_CODE(oprnd1) == INTEGER_CST)
+       {
+         optab2 = optab_for_tree_code (LROTATE_EXPR, vectype, optab_scalar);
+         if (optab2
+             && optab_handler (optab2, TYPE_MODE (vectype))
+                  != CODE_FOR_nothing)
+           return NULL;
+       }
     }

   /* If vector/vector or vector/scalar shifts aren't supported by the target,
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 21046931243..9e1a2f971a1 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -5665,6 +5665,12 @@  vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   if (!scalar_shift_arg)
     {
       optab = optab_for_tree_code (code, vectype, optab_vector);
+
+      if (TREE_CODE (op1) == INTEGER_CST && code == RROTATE_EXPR
+         && !(optab
+              && optab_handler (optab, TYPE_MODE (vectype))
+                   != CODE_FOR_nothing))
+       optab = optab_for_tree_code (LROTATE_EXPR, vectype, optab_vector);
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
                          "vector/vector shift/rotate found.\n");
@@ -5696,7 +5702,10 @@  vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       else
         {
           optab = optab_for_tree_code (code, vectype, optab_vector);
-          if (optab
+         if (TREE_CODE (op1) == INTEGER_CST && code == RROTATE_EXPR
+             && !(optab && optab_handler (optab, TYPE_MODE (vectype))))
+           optab = optab_for_tree_code (LROTATE_EXPR, vectype, optab_vector);
+         if (optab
                && (optab_handler (optab, TYPE_MODE (vectype))
                       != CODE_FOR_nothing))
             {