
[RFC] AArch64 SVE: Fix multiple comparison masks on inverted operands

Message ID patch-14553-tamar@arm.com
State New
Series [RFC] AArch64 SVE: Fix multiple comparison masks on inverted operands

Commit Message

Tamar Christina June 14, 2021, 1:43 p.m. UTC
Hi All,

This RFC is trying to address the following inefficiency when vectorizing
conditional statements with SVE.

Consider the case

void f10(double * restrict z, double * restrict w, double * restrict x,
	 double * restrict y, int n)
{
    for (int i = 0; i < n; i++) {
        z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
    }
}


For which we currently generate at -O3:

f10:
        cmp     w4, 0
        ble     .L1
        mov     x5, 0
        whilelo p1.d, wzr, w4
        ptrue   p3.b, all
.L3:
        ld1d    z1.d, p1/z, [x1, x5, lsl 3]
        fcmgt   p2.d, p1/z, z1.d, #0.0
        fcmgt   p0.d, p3/z, z1.d, #0.0
        ld1d    z2.d, p2/z, [x2, x5, lsl 3]
        bic     p0.b, p3/z, p1.b, p0.b
        ld1d    z0.d, p0/z, [x3, x5, lsl 3]
        fsub    z0.d, p0/m, z0.d, z1.d
        movprfx z0.d, p2/m, z1.d
        fadd    z0.d, p2/m, z0.d, z2.d
        st1d    z0.d, p1, [x0, x5, lsl 3]
        incd    x5
        whilelo p1.d, w5, w4
        b.any   .L3
.L1:
        ret

Notice that the condition for the else branch duplicates the same predicate as
the then branch and then uses BIC to negate the results.

The reason for this is that during instruction generation in the vectorizer we
emit

  mask__41.11_66 = vect__4.10_64 > vect_cst__65;
  vec_mask_and_69 = mask__41.11_66 & loop_mask_63;
  vec_mask_and_71 = mask__41.11_66 & loop_mask_63;
  mask__43.16_73 = ~mask__41.11_66;
  vec_mask_and_76 = mask__43.16_73 & loop_mask_63;
  vec_mask_and_78 = mask__43.16_73 & loop_mask_63;

which ultimately gets optimized to

  mask__41.11_66 = vect__4.10_64 > { 0.0, ... };
  vec_mask_and_69 = loop_mask_63 & mask__41.11_66;
  mask__43.16_73 = ~mask__41.11_66;
  vec_mask_and_76 = loop_mask_63 & mask__43.16_73;

Notice how the negate is on the operation and not on the predicate resulting from
the operation.  When this is expanded it turns into RTL where the negate is on the
compare directly.  This means the RTL differs from the version without the negate,
so CSE is unable to recognize that they are essentially the same operation.

To fix this my patch changes it so that the mask is negated rather than the operation:

  mask__41.13_55 = vect__4.12_53 > { 0.0, ... };
  vec_mask_and_58 = loop_mask_52 & mask__41.13_55;
  vec_mask_op_67 = ~vec_mask_and_58;
  vec_mask_and_65 = loop_mask_52 & vec_mask_op_67;

which means the negate ends up on the masked operation.  This removes the
additional comparison:

f10:
        cmp     w4, 0
        ble     .L1
        mov     x5, 0
        whilelo p0.d, wzr, w4
        ptrue   p3.b, all
        .p2align 5,,15
.L3:
        ld1d    z1.d, p0/z, [x1, x5, lsl 3]
        fcmgt   p1.d, p0/z, z1.d, #0.0
        bic     p2.b, p3/z, p0.b, p1.b
        ld1d    z2.d, p1/z, [x2, x5, lsl 3]
        ld1d    z0.d, p2/z, [x3, x5, lsl 3]
        fsub    z0.d, p2/m, z0.d, z1.d
        movprfx z0.d, p1/m, z1.d
        fadd    z0.d, p1/m, z0.d, z2.d
        st1d    z0.d, p0, [x0, x5, lsl 3]
        incd    x5
        whilelo p0.d, w5, w4
        b.any   .L3
.L1:
        ret
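
Note that this rewrite is only valid because, for predicate masks, re-applying the
loop mask to the negated masked value gives the same lanes as applying the loop
mask to the negated comparison, i.e. L & ~(C & L) == L & ~C.  A minimal standalone
check of that identity (illustration only, not part of the patch; plain 8-bit
integers stand in for predicate lanes):

/* Check that L & ~C == L & ~(C & L) for all 8-bit lane patterns.  */
#include <assert.h>
#include <stdio.h>

int main (void)
{
  for (unsigned c = 0; c < 256; c++)      /* comparison result per lane */
    for (unsigned l = 0; l < 256; l++)    /* loop mask per lane */
      {
        unsigned lhs = (~c & l) & 0xff;        /* loop mask applied to ~C */
        unsigned rhs = (~(c & l) & l) & 0xff;  /* negate after masking, re-mask */
        assert (lhs == rhs);
      }
  printf ("mask identity holds for all 8-bit patterns\n");
  return 0;
}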

But this is still not optimal.  The problem is the BIC pattern,
aarch64_pred_<nlogical><mode>_z, which replaces the NOT and AND with a BIC.

However, in this case, since p1 is the result of a predicate operation on p0,
the BIC should instead be a NOT.  Disabling the pattern for combine (by adding
&& reload_completed) gives me the codegen I'm after:

f10:
        cmp     w4, 0
        ble     .L1
        mov     x5, 0
        whilelo p0.d, wzr, w4
        .p2align 5,,15
.L3:
        ld1d    z1.d, p0/z, [x1, x5, lsl 3]
        fcmgt   p1.d, p0/z, z1.d, #0.0
        not     p2.b, p0/z, p1.b
        ld1d    z2.d, p1/z, [x2, x5, lsl 3]
        ld1d    z0.d, p2/z, [x3, x5, lsl 3]
        fsub    z0.d, p2/m, z0.d, z1.d
        movprfx z0.d, p1/m, z1.d
        fadd    z0.d, p1/m, z0.d, z2.d
        st1d    z0.d, p0, [x0, x5, lsl 3]
        incd    x5
        whilelo p0.d, w5, w4
        b.any   .L3
.L1:
        ret

This uses a NOT predicated on p0 instead, which is what the code looked like
pre-combine, and it also removes the need for the third predicate p3 that the
BIC case required.

I can't rely on combine to remove the BIC since in the case above the fcmgt isn't
single use, so combine won't try.  Of course, disabling the early recog for BIC
isn't ideal since genuine BICs would be missed.

Any feedback on the approach and how to fix the BIC issue?  I did try an
approach using combine where I matched against the full sequence and split it
early in combine.  This works for the case above but falls apart in other cases
where the cmp isn't single use from the start.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

But I would like to solve the remaining issues.

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-stmts.c (prepare_load_store_mask): Expand unary operators
	on the mask instead of the operation.

--- inline copy of patch -- 
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index eeef96a2eb60853e9c18a288af9e49ae9ad65128..35e5212c77d7cb26b1a2b9645cbac22c30078fb8 100644


--

Comments

Richard Sandiford June 14, 2021, 2:50 p.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> This RFC is trying to address the following inefficiency when vectorizing
> conditional statements with SVE.
>
> Consider the case
>
> void f10(double * restrict z, double * restrict w, double * restrict x,
> 	 double * restrict y, int n)
> {
>     for (int i = 0; i < n; i++) {
>         z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
>     }
> }
>
>
> For which we currently generate at -O3:
>
> f10:
>         cmp     w4, 0
>         ble     .L1
>         mov     x5, 0
>         whilelo p1.d, wzr, w4
>         ptrue   p3.b, all
> .L3:
>         ld1d    z1.d, p1/z, [x1, x5, lsl 3]
>         fcmgt   p2.d, p1/z, z1.d, #0.0
>         fcmgt   p0.d, p3/z, z1.d, #0.0
>         ld1d    z2.d, p2/z, [x2, x5, lsl 3]
>         bic     p0.b, p3/z, p1.b, p0.b
>         ld1d    z0.d, p0/z, [x3, x5, lsl 3]
>         fsub    z0.d, p0/m, z0.d, z1.d
>         movprfx z0.d, p2/m, z1.d
>         fadd    z0.d, p2/m, z0.d, z2.d
>         st1d    z0.d, p1, [x0, x5, lsl 3]
>         incd    x5
>         whilelo p1.d, w5, w4
>         b.any   .L3
> .L1:
>         ret
>
> Notice that the condition for the else branch duplicates the same predicate as
> the then branch and then uses BIC to negate the results.
>
> The reason for this is that during instruction generation in the vectorizer we
> emit
>
>   mask__41.11_66 = vect__4.10_64 > vect_cst__65;
>   vec_mask_and_69 = mask__41.11_66 & loop_mask_63;
>   vec_mask_and_71 = mask__41.11_66 & loop_mask_63;
>   mask__43.16_73 = ~mask__41.11_66;
>   vec_mask_and_76 = mask__43.16_73 & loop_mask_63;
>   vec_mask_and_78 = mask__43.16_73 & loop_mask_63;
>
> which ultimately gets optimized to
>
>   mask__41.11_66 = vect__4.10_64 > { 0.0, ... };
>   vec_mask_and_69 = loop_mask_63 & mask__41.11_66;
>   mask__43.16_73 = ~mask__41.11_66;
>   vec_mask_and_76 = loop_mask_63 & mask__43.16_73;
>
> Notice how the negate is on the operation and not the predicate resulting from
> the operation.  When this is expanded this turns into RTL where the negate is on
> the compare directly.  This means the RTL is different from the one without the
> negate and so CSE is unable to recognize that they are essentially same operation.
>
> To fix this my patch changes it so you negate the mask rather than the operation
>
>   mask__41.13_55 = vect__4.12_53 > { 0.0, ... };
>   vec_mask_and_58 = loop_mask_52 & mask__41.13_55;
>   vec_mask_op_67 = ~vec_mask_and_58;
>   vec_mask_and_65 = loop_mask_52 & vec_mask_op_67;

But to me this looks like a pessimisation in gimple terms.  We've increased
the length of the critical path: vec_mask_and_65 now needs a chain of
4 operations instead of 3.

We also need to be careful not to pessimise the case in which the
comparison is an integer one.  At the moment we'll generate opposed
conditions, which is the intended behaviour:

.L3:
        ld1d    z1.d, p0/z, [x1, x5, lsl 3]
        cmpgt   p2.d, p0/z, z1.d, #0
        movprfx z2, z1
        scvtf   z2.d, p3/m, z1.d
        cmple   p1.d, p0/z, z1.d, #0
        ld1d    z0.d, p2/z, [x2, x5, lsl 3]
        ld1d    z1.d, p1/z, [x3, x5, lsl 3]
        fadd    z0.d, p2/m, z0.d, z2.d
        movprfx z0.d, p1/m, z1.d
        fsub    z0.d, p1/m, z0.d, z2.d
        st1d    z0.d, p0, [x0, x5, lsl 3]
        add     x5, x5, x6
        whilelo p0.d, w5, w4
        b.any   .L3
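
(For reference, the kind of source that produces an integer comparison like this;
the variant below is an assumption for illustration, not taken from the mail: w
gets a 64-bit integer element type, so the compares become cmpgt/cmple and the
value is converted with scvtf before the FP arithmetic.)

/* Hypothetical variant of f10 (assumed, not from the original mail).  */
void f10_int (double * restrict z, long * restrict w, double * restrict x,
	      double * restrict y, int n)
{
    for (int i = 0; i < n; i++) {
        z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
    }
}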

Could we handle the fcmp case using a 3->2 define_split instead: convert

   (set res (and (not (fcmp X Y)) Z)) ->
     (set res (fcmp X Y))
     (set res (and (not res) Z))

?

Thanks,
Richard
Tamar Christina June 14, 2021, 3:05 p.m. UTC | #2
Hi Richard,

> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Monday, June 14, 2021 3:50 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH][RFC]AArch64 SVE: Fix multiple comparison masks on
> inverted operands
> 
> Tamar Christina <tamar.christina@arm.com> writes:
> > Hi All,
> >
> > This RFC is trying to address the following inefficiency when
> > vectorizing conditional statements with SVE.
> >
> > Consider the case
> >
> > void f10(double * restrict z, double * restrict w, double * restrict x,
> > 	 double * restrict y, int n)
> > {
> >     for (int i = 0; i < n; i++) {
> >         z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
> >     }
> > }
> >
> >
> > For which we currently generate at -O3:
> >
> > f10:
> >         cmp     w4, 0
> >         ble     .L1
> >         mov     x5, 0
> >         whilelo p1.d, wzr, w4
> >         ptrue   p3.b, all
> > .L3:
> >         ld1d    z1.d, p1/z, [x1, x5, lsl 3]
> >         fcmgt   p2.d, p1/z, z1.d, #0.0
> >         fcmgt   p0.d, p3/z, z1.d, #0.0
> >         ld1d    z2.d, p2/z, [x2, x5, lsl 3]
> >         bic     p0.b, p3/z, p1.b, p0.b
> >         ld1d    z0.d, p0/z, [x3, x5, lsl 3]
> >         fsub    z0.d, p0/m, z0.d, z1.d
> >         movprfx z0.d, p2/m, z1.d
> >         fadd    z0.d, p2/m, z0.d, z2.d
> >         st1d    z0.d, p1, [x0, x5, lsl 3]
> >         incd    x5
> >         whilelo p1.d, w5, w4
> >         b.any   .L3
> > .L1:
> >         ret
> >
> > Notice that the condition for the else branch duplicates the same
> > predicate as the then branch and then uses BIC to negate the results.
> >
> > The reason for this is that during instruction generation in the
> > vectorizer we emit
> >
> >   mask__41.11_66 = vect__4.10_64 > vect_cst__65;
> >   vec_mask_and_69 = mask__41.11_66 & loop_mask_63;
> >   vec_mask_and_71 = mask__41.11_66 & loop_mask_63;
> >   mask__43.16_73 = ~mask__41.11_66;
> >   vec_mask_and_76 = mask__43.16_73 & loop_mask_63;
> >   vec_mask_and_78 = mask__43.16_73 & loop_mask_63;
> >
> > which ultimately gets optimized to
> >
> >   mask__41.11_66 = vect__4.10_64 > { 0.0, ... };
> >   vec_mask_and_69 = loop_mask_63 & mask__41.11_66;
> >   mask__43.16_73 = ~mask__41.11_66;
> >   vec_mask_and_76 = loop_mask_63 & mask__43.16_73;
> >
> > Notice how the negate is on the operation and not the predicate
> > resulting from the operation.  When this is expanded this turns into
> > RTL where the negate is on the compare directly.  This means the RTL
> > is different from the one without the negate and so CSE is unable to
> recognize that they are essentially same operation.
> >
> > To fix this my patch changes it so you negate the mask rather than the
> > operation
> >
> >   mask__41.13_55 = vect__4.12_53 > { 0.0, ... };
> >   vec_mask_and_58 = loop_mask_52 & mask__41.13_55;
> >   vec_mask_op_67 = ~vec_mask_and_58;
> >   vec_mask_and_65 = loop_mask_52 & vec_mask_op_67;
> 
> But to me this looks like a pessimisation in gimple terms.  We've increased
> the length of the critical path: vec_mask_and_65 now needs a chain of
> 4 operations instead of 3.

True, but it should reduce the number of RTL patterns.  I would have thought
RTL is more expensive to handle than gimple.

> 
> We also need to be careful not to pessimise the case in which the
> comparison is an integer one.  At the moment we'll generate opposed
> conditions, which is the intended behaviour:

This still happens with this patch at `-Ofast` because that flips the conditions,
so the different representation doesn't harm it.

> 
> .L3:
>         ld1d    z1.d, p0/z, [x1, x5, lsl 3]
>         cmpgt   p2.d, p0/z, z1.d, #0
>         movprfx z2, z1
>         scvtf   z2.d, p3/m, z1.d
>         cmple   p1.d, p0/z, z1.d, #0
>         ld1d    z0.d, p2/z, [x2, x5, lsl 3]
>         ld1d    z1.d, p1/z, [x3, x5, lsl 3]
>         fadd    z0.d, p2/m, z0.d, z2.d
>         movprfx z0.d, p1/m, z1.d
>         fsub    z0.d, p1/m, z0.d, z2.d
>         st1d    z0.d, p0, [x0, x5, lsl 3]
>         add     x5, x5, x6
>         whilelo p0.d, w5, w4
>         b.any   .L3
> 
> Could we handle the fcmp case using a 3->2 define_split instead: convert
> 
>    (set res (and (not (fcmp X Y)) Z)) ->
>      (set res (fcmp X Y))
>      (set res (and (not res) Z))
> 

This was the other approach I mentioned. It works, and gives you the neg, but only in the case where the compare is single use.
e.g. in 

void f11(double * restrict z, double * restrict w, double * restrict x, double * restrict y, int n)
{
    for (int i = 0; i < n; i++) {
        z[i] = (w[i] > 0) ? w[i] : y[i] - w[i];
    }
}

You hit some of the same problems.  It generates:

        ld1d    z0.d, p0/z, [x1, x2, lsl 3]
        fcmgt   p2.d, p3/z, z0.d, #0.0
        bic     p1.b, p3/z, p0.b, p2.b
        ld1d    z1.d, p1/z, [x3, x2, lsl 3]
        fsub    z1.d, p1/m, z1.d, z0.d
        sel     z0.d, p2, z0.d, z1.d
        st1d    z0.d, p0, [x0, x2, lsl 3]
        incd    x2
        whilelo p0.d, w2, w4

which has two problems: the fcmgt doesn't need to be predicated on p3 (which is
ptrue all), it can/should be p0.

With that fixed, the splitter won't match because p2 is needed in the sel, so it's not
single use and combine won't try to build the RTL so that it can be split.

Regards,
Tamar

> ?
> 
> Thanks,
> Richard
Richard Sandiford June 14, 2021, 3:54 p.m. UTC | #3
Tamar Christina <Tamar.Christina@arm.com> writes:
> Hi Richard,
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Monday, June 14, 2021 3:50 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
>> Subject: Re: [PATCH][RFC]AArch64 SVE: Fix multiple comparison masks on
>> inverted operands
>> 
>> Tamar Christina <tamar.christina@arm.com> writes:
>> > Hi All,
>> >
>> > This RFC is trying to address the following inefficiency when
>> > vectorizing conditional statements with SVE.
>> >
>> > Consider the case
>> >
>> > void f10(double * restrict z, double * restrict w, double * restrict x,
>> > 	 double * restrict y, int n)
>> > {
>> >     for (int i = 0; i < n; i++) {
>> >         z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
>> >     }
>> > }
>> >
>> >
>> > For which we currently generate at -O3:
>> >
>> > f10:
>> >         cmp     w4, 0
>> >         ble     .L1
>> >         mov     x5, 0
>> >         whilelo p1.d, wzr, w4
>> >         ptrue   p3.b, all
>> > .L3:
>> >         ld1d    z1.d, p1/z, [x1, x5, lsl 3]
>> >         fcmgt   p2.d, p1/z, z1.d, #0.0
>> >         fcmgt   p0.d, p3/z, z1.d, #0.0
>> >         ld1d    z2.d, p2/z, [x2, x5, lsl 3]
>> >         bic     p0.b, p3/z, p1.b, p0.b
>> >         ld1d    z0.d, p0/z, [x3, x5, lsl 3]
>> >         fsub    z0.d, p0/m, z0.d, z1.d
>> >         movprfx z0.d, p2/m, z1.d
>> >         fadd    z0.d, p2/m, z0.d, z2.d
>> >         st1d    z0.d, p1, [x0, x5, lsl 3]
>> >         incd    x5
>> >         whilelo p1.d, w5, w4
>> >         b.any   .L3
>> > .L1:
>> >         ret
>> >
>> > Notice that the condition for the else branch duplicates the same
>> > predicate as the then branch and then uses BIC to negate the results.
>> >
>> > The reason for this is that during instruction generation in the
>> > vectorizer we emit
>> >
>> >   mask__41.11_66 = vect__4.10_64 > vect_cst__65;
>> >   vec_mask_and_69 = mask__41.11_66 & loop_mask_63;
>> >   vec_mask_and_71 = mask__41.11_66 & loop_mask_63;
>> >   mask__43.16_73 = ~mask__41.11_66;
>> >   vec_mask_and_76 = mask__43.16_73 & loop_mask_63;
>> >   vec_mask_and_78 = mask__43.16_73 & loop_mask_63;
>> >
>> > which ultimately gets optimized to
>> >
>> >   mask__41.11_66 = vect__4.10_64 > { 0.0, ... };
>> >   vec_mask_and_69 = loop_mask_63 & mask__41.11_66;
>> >   mask__43.16_73 = ~mask__41.11_66;
>> >   vec_mask_and_76 = loop_mask_63 & mask__43.16_73;
>> >
>> > Notice how the negate is on the operation and not the predicate
>> > resulting from the operation.  When this is expanded this turns into
>> > RTL where the negate is on the compare directly.  This means the RTL
>> > is different from the one without the negate and so CSE is unable to
>> recognize that they are essentially same operation.
>> >
>> > To fix this my patch changes it so you negate the mask rather than the
>> > operation
>> >
>> >   mask__41.13_55 = vect__4.12_53 > { 0.0, ... };
>> >   vec_mask_and_58 = loop_mask_52 & mask__41.13_55;
>> >   vec_mask_op_67 = ~vec_mask_and_58;
>> >   vec_mask_and_65 = loop_mask_52 & vec_mask_op_67;
>> 
>> But to me this looks like a pessimisation in gimple terms.  We've increased
>> the length of the critical path: vec_mask_and_65 now needs a chain of
>> 4 operations instead of 3.
>
> True, but it should reduce the number of RTL patterns.  I would have thought
> RTL is more expensive to handle than gimple.

I think this is only a fair gimple optimisation if gimple does the isel
itself (to a predicated compare and a predicated NOT).

>> We also need to be careful not to pessimise the case in which the
>> comparison is an integer one.  At the moment we'll generate opposed
>> conditions, which is the intended behaviour:
>
> This still happens with this patch at `-Ofast` because that flips the conditions,
> So the different representation doesn't harm it.

OK, that's good.

>> 
>> .L3:
>>         ld1d    z1.d, p0/z, [x1, x5, lsl 3]
>>         cmpgt   p2.d, p0/z, z1.d, #0
>>         movprfx z2, z1
>>         scvtf   z2.d, p3/m, z1.d
>>         cmple   p1.d, p0/z, z1.d, #0
>>         ld1d    z0.d, p2/z, [x2, x5, lsl 3]
>>         ld1d    z1.d, p1/z, [x3, x5, lsl 3]
>>         fadd    z0.d, p2/m, z0.d, z2.d
>>         movprfx z0.d, p1/m, z1.d
>>         fsub    z0.d, p1/m, z0.d, z2.d
>>         st1d    z0.d, p0, [x0, x5, lsl 3]
>>         add     x5, x5, x6
>>         whilelo p0.d, w5, w4
>>         b.any   .L3
>> 
>> Could we handle the fcmp case using a 3->2 define_split instead: convert
>> 
>>    (set res (and (not (fcmp X Y)) Z)) ->
>>      (set res (fcmp X Y))
>>      (set res (and (not res) Z))
>> 
>
> This was the other approach I mentioned. It works, and gives you the neg, but only in the case where the compare is single use.

But in the original example we duplicate the comparison through a
2->2 combine, which leaves the original comparison as a single use.
Isn't that enough?

> e.g. in 
>
> void f11(double * restrict z, double * restrict w, double * restrict x, double * restrict y, int n)
> {
>     for (int i = 0; i < n; i++) {
>         z[i] = (w[i] > 0) ? w[i] : y[i] - w[i];
>     }
> }
>
> You have some of the same problem. It generates
>
>         ld1d    z0.d, p0/z, [x1, x2, lsl 3]
>         fcmgt   p2.d, p3/z, z0.d, #0.0
>         bic     p1.b, p3/z, p0.b, p2.b
>         ld1d    z1.d, p1/z, [x3, x2, lsl 3]
>         fsub    z1.d, p1/m, z1.d, z0.d
>         sel     z0.d, p2, z0.d, z1.d
>         st1d    z0.d, p0, [x0, x2, lsl 3]
>         incd    x2
>         whilelo p0.d, w2, w4
>
> which has two problems. fcmgt doesn't need to be predicated on p3 which is ptrue all,
> it can/should be p0.
>
> With that fixed the splitter won't match because p2 is needed in the sel, so it's not single use and so combine won't
> try to build the RTL so it can be split.

I think having the vectoriser avoid the dual use between the
IFN_MASK_LOAD/STORE and the VEC_COND_EXPR is fair game, since that
is the only pass that has the context to prove that including the loop
mask in the VEC_COND_EXPR condition is correct.  We already try to do
that to some extent:

  /* See whether another part of the vectorized code applies a loop
     mask to the condition, or to its inverse.  */

but it would need extending to handle this case.

Thanks,
Richard
Tamar Christina June 30, 2021, 4:08 p.m. UTC | #4
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Monday, June 14, 2021 4:55 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH][RFC]AArch64 SVE: Fix multiple comparison masks on
> inverted operands
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> > Hi Richard,
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> Sent: Monday, June 14, 2021 3:50 PM
> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> >> Subject: Re: [PATCH][RFC]AArch64 SVE: Fix multiple comparison masks
> >> on inverted operands
> >>
> >> Tamar Christina <tamar.christina@arm.com> writes:
> >> > Hi All,
> >> >
> >> > This RFC is trying to address the following inefficiency when
> >> > vectorizing conditional statements with SVE.
> >> >
> >> > Consider the case
> >> >
> >> > void f10(double * restrict z, double * restrict w, double * restrict x,
> >> > 	 double * restrict y, int n)
> >> > {
> >> >     for (int i = 0; i < n; i++) {
> >> >         z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
> >> >     }
> >> > }
> >> >
> >> >
> >> > For which we currently generate at -O3:
> >> >
> >> > f10:
> >> >         cmp     w4, 0
> >> >         ble     .L1
> >> >         mov     x5, 0
> >> >         whilelo p1.d, wzr, w4
> >> >         ptrue   p3.b, all
> >> > .L3:
> >> >         ld1d    z1.d, p1/z, [x1, x5, lsl 3]
> >> >         fcmgt   p2.d, p1/z, z1.d, #0.0
> >> >         fcmgt   p0.d, p3/z, z1.d, #0.0
> >> >         ld1d    z2.d, p2/z, [x2, x5, lsl 3]
> >> >         bic     p0.b, p3/z, p1.b, p0.b
> >> >         ld1d    z0.d, p0/z, [x3, x5, lsl 3]
> >> >         fsub    z0.d, p0/m, z0.d, z1.d
> >> >         movprfx z0.d, p2/m, z1.d
> >> >         fadd    z0.d, p2/m, z0.d, z2.d
> >> >         st1d    z0.d, p1, [x0, x5, lsl 3]
> >> >         incd    x5
> >> >         whilelo p1.d, w5, w4
> >> >         b.any   .L3
> >> > .L1:
> >> >         ret
> >> >
> >> > Notice that the condition for the else branch duplicates the same
> >> > predicate as the then branch and then uses BIC to negate the results.
> >> >
> >> > The reason for this is that during instruction generation in the
> >> > vectorizer we emit
> >> >
> >> >   mask__41.11_66 = vect__4.10_64 > vect_cst__65;
> >> >   vec_mask_and_69 = mask__41.11_66 & loop_mask_63;
> >> >   vec_mask_and_71 = mask__41.11_66 & loop_mask_63;
> >> >   mask__43.16_73 = ~mask__41.11_66;
> >> >   vec_mask_and_76 = mask__43.16_73 & loop_mask_63;
> >> >   vec_mask_and_78 = mask__43.16_73 & loop_mask_63;
> >> >
> >> > which ultimately gets optimized to
> >> >
> >> >   mask__41.11_66 = vect__4.10_64 > { 0.0, ... };
> >> >   vec_mask_and_69 = loop_mask_63 & mask__41.11_66;
> >> >   mask__43.16_73 = ~mask__41.11_66;
> >> >   vec_mask_and_76 = loop_mask_63 & mask__43.16_73;
> >> >
> >> > Notice how the negate is on the operation and not the predicate
> >> > resulting from the operation.  When this is expanded this turns
> >> > into RTL where the negate is on the compare directly.  This means
> >> > the RTL is different from the one without the negate and so CSE is
> >> > unable to
> >> recognize that they are essentially same operation.
> >> >
> >> > To fix this my patch changes it so you negate the mask rather than
> >> > the operation
> >> >
> >> >   mask__41.13_55 = vect__4.12_53 > { 0.0, ... };
> >> >   vec_mask_and_58 = loop_mask_52 & mask__41.13_55;
> >> >   vec_mask_op_67 = ~vec_mask_and_58;
> >> >   vec_mask_and_65 = loop_mask_52 & vec_mask_op_67;
> >>
> >> But to me this looks like a pessimisation in gimple terms.  We've
> >> increased the length of the critical path: vec_mask_and_65 now needs
> >> a chain of
> >> 4 operations instead of 3.
> >
> > True, but it should reduce the number of RTL patterns.  I would have
> > thought RTL is more expensive to handle than gimple.
> 
> I think this is only a fair gimple optimisation if gimple does the isel itself (to a
> predicated compare and a predicated NOT).
> 
> >> We also need to be careful not to pessimise the case in which the
> >> comparison is an integer one.  At the moment we'll generate opposed
> >> conditions, which is the intended behaviour:
> >
> > This still happens with this patch at `-Ofast` because that flips the
> > conditions, So the different representation doesn't harm it.
> 
> OK, that's good.
> 
> >>
> >> .L3:
> >>         ld1d    z1.d, p0/z, [x1, x5, lsl 3]
> >>         cmpgt   p2.d, p0/z, z1.d, #0
> >>         movprfx z2, z1
> >>         scvtf   z2.d, p3/m, z1.d
> >>         cmple   p1.d, p0/z, z1.d, #0
> >>         ld1d    z0.d, p2/z, [x2, x5, lsl 3]
> >>         ld1d    z1.d, p1/z, [x3, x5, lsl 3]
> >>         fadd    z0.d, p2/m, z0.d, z2.d
> >>         movprfx z0.d, p1/m, z1.d
> >>         fsub    z0.d, p1/m, z0.d, z2.d
> >>         st1d    z0.d, p0, [x0, x5, lsl 3]
> >>         add     x5, x5, x6
> >>         whilelo p0.d, w5, w4
> >>         b.any   .L3
> >>
> >> Could we handle the fcmp case using a 3->2 define_split instead:
> >> convert
> >>
> >>    (set res (and (not (fcmp X Y)) Z)) ->
> >>      (set res (fcmp X Y))
> >>      (set res (and (not res) Z))
> >>
> >
> > This was the other approach I mentioned. It works, and gives you the neg,
> but only in the case where the compare is single use.
> 
> But in the original example we duplicate the comparison through a
> 2->2 combine, which leaves the original comparison as a single use.
> Isn't that enough?
> 
> > e.g. in
> >
> > void f11(double * restrict z, double * restrict w, double * restrict
> > x, double * restrict y, int n) {
> >     for (int i = 0; i < n; i++) {
> >         z[i] = (w[i] > 0) ? w[i] : y[i] - w[i];
> >     }
> > }
> >
> > You have some of the same problem. It generates
> >
> >         ld1d    z0.d, p0/z, [x1, x2, lsl 3]
> >         fcmgt   p2.d, p3/z, z0.d, #0.0
> >         bic     p1.b, p3/z, p0.b, p2.b
> >         ld1d    z1.d, p1/z, [x3, x2, lsl 3]
> >         fsub    z1.d, p1/m, z1.d, z0.d
> >         sel     z0.d, p2, z0.d, z1.d
> >         st1d    z0.d, p0, [x0, x2, lsl 3]
> >         incd    x2
> >         whilelo p0.d, w2, w4
> >
> > which has two problems. fcmgt doesn't need to be predicated on p3
> > which is ptrue all, it can/should be p0.
> >
> > With that fixed the splitter won't match because p2 is needed in the
> > sel, so it's not single use and so combine won't try to build the RTL so it can
> be split.
> 
> I think having the vectoriser avoid the dual use between the
> IFN_MASK_LOAD/STORE and the VEC_COND_EXPR is fair game, since that is
> the only pass that has the context to prove that including the loop mask in
> the VEC_COND_EXPR condition is correct.  We already try to do that to some
> extent:
> 

Sorry, I have been looking at this for the past couple of days and I just don't
know how this is supposed to work.

In the above example the problem is not just the use of p2 in the VEC_COND_EXPR.
If the VEC_COND_EXPR is changed to use p1 then p1 now has 3 uses, which still
makes combine not try the combination.

But the general case

void f10(double * restrict z, double * restrict w, double * restrict x, double * restrict y, int n)
{
    for (int i = 0; i < n; i++) {
        z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
    }
}

Produces

f10:
        cmp     w4, 0
        ble     .L1
        mov     x5, 0
        whilelo p1.d, wzr, w4
        ptrue   p3.b, all
        .p2align 5,,15
.L3:
        ld1d    z1.d, p1/z, [x1, x5, lsl 3]
        fcmgt   p2.d, p1/z, z1.d, #0.0
        fcmgt   p0.d, p3/z, z1.d, #0.0
        ld1d    z2.d, p2/z, [x2, x5, lsl 3]
        bic     p0.b, p3/z, p1.b, p0.b
        ld1d    z0.d, p0/z, [x3, x5, lsl 3]
        fsub    z0.d, p0/m, z0.d, z1.d
        movprfx z0.d, p2/m, z1.d
        fadd    z0.d, p2/m, z0.d, z2.d
        st1d    z0.d, p1, [x0, x5, lsl 3]
        incd    x5
        whilelo p1.d, w5, w4
        b.any   .L3

where the VEC_COND_EXPR has been elided.

The problem is that the comparison for the inverse case is unpredicated, which
causes the compiler to predicate it on ptrue when it is generated from the load.
The resulting gimple is:

  <bb 3> [local count: 105119322]:
  bnd.9_49 = (unsigned int) n_14(D);
  max_mask_79 = .WHILE_ULT (0, bnd.9_49, { 0, ... });

  <bb 4> [local count: 630715932]:
  # ivtmp_76 = PHI <ivtmp_77(4), 0(3)>
  # loop_mask_52 = PHI <next_mask_80(4), max_mask_79(3)>
  _27 = &MEM <vector([2,2]) double> [(double *)w_15(D) + ivtmp_76 * 8];
  vect__4.12_53 = .MASK_LOAD (_27, 64B, loop_mask_52);
  mask__41.13_55 = vect__4.12_53 > { 0.0, ... };
  vec_mask_and_58 = loop_mask_52 & mask__41.13_55;
  _48 = &MEM <vector([2,2]) double> [(double *)x_18(D) + ivtmp_76 * 8];
  vect__6.16_59 = .MASK_LOAD (_48, 64B, vec_mask_and_58);
  mask__43.18_62 = ~mask__41.13_55;
  vec_mask_and_65 = loop_mask_52 & mask__43.18_62;
  _25 = &MEM <vector([2,2]) double> [(double *)y_16(D) + ivtmp_76 * 8];
  vect__8.21_66 = .MASK_LOAD (_25, 64B, vec_mask_and_65);
  vect_iftmp.22_68 = .COND_SUB (vec_mask_and_65, vect__8.21_66, vect__4.12_53, vect__8.21_66);
  _50 = .COND_ADD (vec_mask_and_58, vect__4.12_53, vect__6.16_59, vect_iftmp.22_68);
  _1 = &MEM <vector([2,2]) double> [(double *)z_20(D) + ivtmp_76 * 8];
  .MASK_STORE (_1, 64B, loop_mask_52, _50);
  ivtmp_77 = ivtmp_76 + POLY_INT_CST [2, 2];
  _2 = (unsigned int) ivtmp_77;
  next_mask_80 = .WHILE_ULT (_2, bnd.9_49, { 0, ... });

where the mask vec_mask_and_65 is masking the negate and not the compare because of how the expansion
is done.  

>   /* See whether another part of the vectorized code applies a loop
>      mask to the condition, or to its inverse.  */
> 
> but it would need extending to handle this case.

This code is fine as far as I can tell, but there's nothing you can do here.  The mask it needs is ~original,
so it does not find an inverse mask to use because it has to honor floating-point exceptions.

And indeed `-fno-trapping-math` or `-Ofast` generates the most optimal sequence, but when honoring
traps it can't reuse the inverted existing mask, which leaves the operation unpredicated.

So is what you're requesting that it look inside unary operators and try to CSE the thing they point to?
In that case, isn't it about the same as what I had before, just with the vectorizer doing the CSE itself?

If so, maybe it's better to do lookups into loop_vinfo->scalar_cond_masked_set in prepare_load_store_mask,
so that it just applies to everything?

Thanks,
Tamar

> 
> Thanks,
> Richard
Richard Sandiford June 30, 2021, 5:55 p.m. UTC | #5
Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Monday, June 14, 2021 4:55 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
>> Subject: Re: [PATCH][RFC]AArch64 SVE: Fix multiple comparison masks on
>> inverted operands
>> 
>> Tamar Christina <Tamar.Christina@arm.com> writes:
>> > Hi Richard,
>> >> -----Original Message-----
>> >> From: Richard Sandiford <richard.sandiford@arm.com>
>> >> Sent: Monday, June 14, 2021 3:50 PM
>> >> To: Tamar Christina <Tamar.Christina@arm.com>
>> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> <Kyrylo.Tkachov@arm.com>
>> >> Subject: Re: [PATCH][RFC]AArch64 SVE: Fix multiple comparison masks
>> >> on inverted operands
>> >>
>> >> Tamar Christina <tamar.christina@arm.com> writes:
>> >> > Hi All,
>> >> >
>> >> > This RFC is trying to address the following inefficiency when
>> >> > vectorizing conditional statements with SVE.
>> >> >
>> >> > Consider the case
>> >> >
>> >> > void f10(double * restrict z, double * restrict w, double * restrict x,
>> >> > 	 double * restrict y, int n)
>> >> > {
>> >> >     for (int i = 0; i < n; i++) {
>> >> >         z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
>> >> >     }
>> >> > }
>> >> >
>> >> >
>> >> > For which we currently generate at -O3:
>> >> >
>> >> > f10:
>> >> >         cmp     w4, 0
>> >> >         ble     .L1
>> >> >         mov     x5, 0
>> >> >         whilelo p1.d, wzr, w4
>> >> >         ptrue   p3.b, all
>> >> > .L3:
>> >> >         ld1d    z1.d, p1/z, [x1, x5, lsl 3]
>> >> >         fcmgt   p2.d, p1/z, z1.d, #0.0
>> >> >         fcmgt   p0.d, p3/z, z1.d, #0.0
>> >> >         ld1d    z2.d, p2/z, [x2, x5, lsl 3]
>> >> >         bic     p0.b, p3/z, p1.b, p0.b
>> >> >         ld1d    z0.d, p0/z, [x3, x5, lsl 3]
>> >> >         fsub    z0.d, p0/m, z0.d, z1.d
>> >> >         movprfx z0.d, p2/m, z1.d
>> >> >         fadd    z0.d, p2/m, z0.d, z2.d
>> >> >         st1d    z0.d, p1, [x0, x5, lsl 3]
>> >> >         incd    x5
>> >> >         whilelo p1.d, w5, w4
>> >> >         b.any   .L3
>> >> > .L1:
>> >> >         ret
>> >> >
>> >> > Notice that the condition for the else branch duplicates the same
>> >> > predicate as the then branch and then uses BIC to negate the results.
>> >> >
>> >> > The reason for this is that during instruction generation in the
>> >> > vectorizer we emit
>> >> >
>> >> >   mask__41.11_66 = vect__4.10_64 > vect_cst__65;
>> >> >   vec_mask_and_69 = mask__41.11_66 & loop_mask_63;
>> >> >   vec_mask_and_71 = mask__41.11_66 & loop_mask_63;
>> >> >   mask__43.16_73 = ~mask__41.11_66;
>> >> >   vec_mask_and_76 = mask__43.16_73 & loop_mask_63;
>> >> >   vec_mask_and_78 = mask__43.16_73 & loop_mask_63;
>> >> >
>> >> > which ultimately gets optimized to
>> >> >
>> >> >   mask__41.11_66 = vect__4.10_64 > { 0.0, ... };
>> >> >   vec_mask_and_69 = loop_mask_63 & mask__41.11_66;
>> >> >   mask__43.16_73 = ~mask__41.11_66;
>> >> >   vec_mask_and_76 = loop_mask_63 & mask__43.16_73;
>> >> >
>> >> > Notice how the negate is on the operation and not the predicate
>> >> > resulting from the operation.  When this is expanded this turns
>> >> > into RTL where the negate is on the compare directly.  This means
>> >> > the RTL is different from the one without the negate and so CSE is
>> >> > unable to
>> >> recognize that they are essentially same operation.
>> >> >
>> >> > To fix this my patch changes it so you negate the mask rather than
>> >> > the operation
>> >> >
>> >> >   mask__41.13_55 = vect__4.12_53 > { 0.0, ... };
>> >> >   vec_mask_and_58 = loop_mask_52 & mask__41.13_55;
>> >> >   vec_mask_op_67 = ~vec_mask_and_58;
>> >> >   vec_mask_and_65 = loop_mask_52 & vec_mask_op_67;
>> >>
>> >> But to me this looks like a pessimisation in gimple terms.  We've
>> >> increased the length of the critical path: vec_mask_and_65 now needs
>> >> a chain of
>> >> 4 operations instead of 3.
>> >
>> > True, but it should reduce the number of RTL patterns.  I would have
>> > thought RTL is more expensive to handle than gimple.
>> 
>> I think this is only a fair gimple optimisation if gimple does the isel itself (to a
>> predicated compare and a predicated NOT).
>> 
>> >> We also need to be careful not to pessimise the case in which the
>> >> comparison is an integer one.  At the moment we'll generate opposed
>> >> conditions, which is the intended behaviour:
>> >
>> > This still happens with this patch at `-Ofast` because that flips the
>> > conditions, So the different representation doesn't harm it.
>> 
>> OK, that's good.
>> 
>> >>
>> >> .L3:
>> >>         ld1d    z1.d, p0/z, [x1, x5, lsl 3]
>> >>         cmpgt   p2.d, p0/z, z1.d, #0
>> >>         movprfx z2, z1
>> >>         scvtf   z2.d, p3/m, z1.d
>> >>         cmple   p1.d, p0/z, z1.d, #0
>> >>         ld1d    z0.d, p2/z, [x2, x5, lsl 3]
>> >>         ld1d    z1.d, p1/z, [x3, x5, lsl 3]
>> >>         fadd    z0.d, p2/m, z0.d, z2.d
>> >>         movprfx z0.d, p1/m, z1.d
>> >>         fsub    z0.d, p1/m, z0.d, z2.d
>> >>         st1d    z0.d, p0, [x0, x5, lsl 3]
>> >>         add     x5, x5, x6
>> >>         whilelo p0.d, w5, w4
>> >>         b.any   .L3
>> >>
>> >> Could we handle the fcmp case using a 3->2 define_split instead:
>> >> convert
>> >>
>> >>    (set res (and (not (fcmp X Y)) Z)) ->
>> >>      (set res (fcmp X Y))
>> >>      (set res (and (not res) Z))
>> >>
>> >
>> > This was the other approach I mentioned. It works, and gives you the neg,
>> but only in the case where the compare is single use.
>> 
>> But in the original example we duplicate the comparison through a
>> 2->2 combine, which leaves the original comparison as a single use.
>> Isn't that enough?
>> 
>> > e.g. in
>> >
>> > void f11(double * restrict z, double * restrict w, double * restrict
>> > x, double * restrict y, int n) {
>> >     for (int i = 0; i < n; i++) {
>> >         z[i] = (w[i] > 0) ? w[i] : y[i] - w[i];
>> >     }
>> > }
>> >
>> > You have some of the same problem. It generates
>> >
>> >         ld1d    z0.d, p0/z, [x1, x2, lsl 3]
>> >         fcmgt   p2.d, p3/z, z0.d, #0.0
>> >         bic     p1.b, p3/z, p0.b, p2.b
>> >         ld1d    z1.d, p1/z, [x3, x2, lsl 3]
>> >         fsub    z1.d, p1/m, z1.d, z0.d
>> >         sel     z0.d, p2, z0.d, z1.d
>> >         st1d    z0.d, p0, [x0, x2, lsl 3]
>> >         incd    x2
>> >         whilelo p0.d, w2, w4
>> >
>> > which has two problems. fcmgt doesn't need to be predicated on p3
>> > which is ptrue all, it can/should be p0.
>> >
>> > With that fixed the splitter won't match because p2 is needed in the
>> > sel, so it's not single use and so combine won't try to build the RTL so it can
>> be split.
>> 
>> I think having the vectoriser avoid the dual use between the
>> IFN_MASK_LOAD/STORE and the VEC_COND_EXPR is fair game, since that is
>> the only pass that has the context to prove that including the loop mask in
>> the VEC_COND_EXPR condition is correct.  We already try to do that to some
>> extent:
>> 
>
> Sorry I have been looking at this these past couple of days and I just don't know how
> this is supposed to work.
>
> In the above example the problem is not just the use of p2 in the VEC_COND_EXPR. If
> the VEC_COND_EXPR is changed to use p1 then p1 now has 3 uses which makes
> combine still not try the combination.
>
> But the general case
>
> void f10(double * restrict z, double * restrict w, double * restrict x, double * restrict y, int n)
> {
>     for (int i = 0; i < n; i++) {
>         z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
>     }
> }
>
> Produces
>
> f10:
>         cmp     w4, 0
>         ble     .L1
>         mov     x5, 0
>         whilelo p1.d, wzr, w4
>         ptrue   p3.b, all
>         .p2align 5,,15
> .L3:
>         ld1d    z1.d, p1/z, [x1, x5, lsl 3]
>         fcmgt   p2.d, p1/z, z1.d, #0.0
>         fcmgt   p0.d, p3/z, z1.d, #0.0
>         ld1d    z2.d, p2/z, [x2, x5, lsl 3]
>         bic     p0.b, p3/z, p1.b, p0.b
>         ld1d    z0.d, p0/z, [x3, x5, lsl 3]
>         fsub    z0.d, p0/m, z0.d, z1.d
>         movprfx z0.d, p2/m, z1.d
>         fadd    z0.d, p2/m, z0.d, z2.d
>         st1d    z0.d, p1, [x0, x5, lsl 3]
>         incd    x5
>         whilelo p1.d, w5, w4
>         b.any   .L3
>
> where the VEC_COND_EXPR has been elided.
>
> The problem is that the comparison for the inverse case is unpredicated.

Yeah, for the original f10 example I was suggesting that we use combine
instead of changing the vectoriser.  It turns out that the 3->2 split
thing I mentioned above won't work though, because we match the BIC first.
But I think something like:

(define_insn_and_split "*inverted_fcmgt"
  [(set (match_operand:<VPRED> 0 "register_operand" "=Upa")
	(and:<VPRED>
	  (and:<VPRED>
	    (not:<VPRED>
	      (unspec:<VPRED>
		[(match_operand:<VPRED> 1 "register_operand" "Upl")
		 (const_int SVE_KNOWN_PTRUE)
		 (match_operand:SVE_FULL_F 2 "register_operand" "w")
		 (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "wDz")]
		SVE_COND_FP_CMP_I0))
	    (match_operand:<VPRED> 4 "register_operand" "w"))
	  (match_dup 1)))]
  "TARGET_SVE && !reg_overlap_mentioned_p (operands[4], operands[0])"
  "#"
  "&& 1"
  [(set (match_dup 0)
	(unspec:<VPRED>
	  [(match_dup 4)
	   (const_int SVE_MAYBE_NOT_PTRUE)
	   (match_dup 2)
	   (match_dup 3)]
	  SVE_COND_FP_CMP_I0))
   (set (match_dup 0)
	(and:<VPRED> (not:<VPRED> (match_dup 0)) (match_dup 4)))]
)

is a legitimate optimisation in its own right because it gets rid
of the PTRUE operand (at split time).  This would be a good thing
to do wherever the FCMxx and BIC come from.

The snag is that we don't then CSE the comparisons between split1 and RA.
The post-RA optimisers might then leave a move.  E.g. for plain -O3 the
above pattern gives the expected:

        cmp     w4, 0
        ble     .L1
        mov     x5, 0
        cntd    x6
        whilelo p0.d, wzr, w4
        .p2align 3,,7
.L3:
        ld1d    z1.d, p0/z, [x1, x5, lsl 3]
        fcmgt   p2.d, p0/z, z1.d, #0.0
        not     p1.b, p0/z, p2.b
        ld1d    z2.d, p2/z, [x2, x5, lsl 3]
        ld1d    z0.d, p1/z, [x3, x5, lsl 3]
        fsub    z0.d, p1/m, z0.d, z1.d
        movprfx z0.d, p2/m, z1.d
        fadd    z0.d, p2/m, z0.d, z2.d
        st1d    z0.d, p0, [x0, x5, lsl 3]
        add     x5, x5, x6
        whilelo p0.d, w5, w4
        b.any   .L3
.L1:
        ret

but -O3 -fno-trapping-math has a predicate move:

        cmp     w4, 0
        ble     .L1
        mov     x5, 0
        cntd    x6
        whilelo p0.d, wzr, w4
        .p2align 3,,7
.L3:
        ld1d    z1.d, p0/z, [x1, x5, lsl 3]
        fcmgt   p1.d, p0/z, z1.d, #0.0
        mov     p2.b, p1.b
        not     p1.b, p0/z, p1.b
        ld1d    z0.d, p2/z, [x2, x5, lsl 3]
        ld1d    z2.d, p1/z, [x3, x5, lsl 3]
        fadd    z0.d, z1.d, z0.d
        movprfx z0.d, p1/m, z2.d
        fsub    z0.d, p1/m, z0.d, z1.d
        st1d    z0.d, p0, [x0, x5, lsl 3]
        add     x5, x5, x6
        whilelo p0.d, w5, w4
        b.any   .L3
.L1:
        ret

It would be good to fix this in RTL “somehow”.

But for f11 we still get:

f11:
.LFB1:
        .cfi_startproc
        cmp     w4, 0
        ble     .L6
        mov     x2, 0
        cntd    x5
        whilelo p1.d, wzr, w4
        ptrue   p3.b, all
        .p2align 3,,7
.L8:
        ld1d    z0.d, p1/z, [x1, x2, lsl 3]
        fcmgt   p0.d, p1/z, z0.d, #0.0
        fcmgt   p2.d, p3/z, z0.d, #0.0
        not     p0.b, p1/z, p0.b
        ld1d    z1.d, p0/z, [x3, x2, lsl 3]
        fsub    z1.d, p0/m, z1.d, z0.d
        sel     z0.d, p2, z0.d, z1.d
        st1d    z0.d, p1, [x0, x2, lsl 3]
        add     x2, x2, x5
        whilelo p1.d, w2, w4
        b.any   .L8
.L6:
        ret

This is where the VEC_COND_EXPR code I mentioned should come in.
At the moment, before generating:

  VEC_COND_EXPR<C, E1, E2>

we check whether there are any other operations predicated on
C & LOOP_MASK.  If so, we use:

  VEC_COND_EXPR<C & LOOP_MASK, E1, E2>

instead.  This avoids both C and C & LOOP_MASK being live at the
same time.

The change I was suggesting was that we should also check whether there
are any operations predicated on ~C & LOOP_MASK.  If so, we should
generate:

  VEC_COND_EXPR<~C & LOOP_MASK, E2, E1>

Again, the justification is that we don't have C and ~C & LOOP_MASK
live at the same time.  If C itself has the form ~C' then swapping
the operands also saves a negation.
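
A tiny standalone check of that justification (illustration only; the 4-lane setup
and lane values below are made up, not from the vectoriser): selecting with
C & LOOP_MASK versus selecting with ~C & LOOP_MASK and swapped operands agrees on
every active lane, and can only differ on lanes the loop mask has switched off:

#include <assert.h>
#include <stdio.h>

int main (void)
{
  /* Exhaustively try all 4-lane comparison results C and loop masks L.  */
  for (unsigned c = 0; c < 16; c++)
    for (unsigned l = 0; l < 16; l++)
      for (int lane = 0; lane < 4; lane++)
        {
          int e1 = 10 + lane, e2 = 20 + lane;        /* arbitrary lane values */
          int cl  = (c >> lane) & (l >> lane) & 1;   /* C & LOOP_MASK */
          int ncl = (~c >> lane) & (l >> lane) & 1;  /* ~C & LOOP_MASK */
          int a = cl  ? e1 : e2;   /* VEC_COND_EXPR <C & LOOP_MASK, E1, E2> */
          int b = ncl ? e2 : e1;   /* VEC_COND_EXPR <~C & LOOP_MASK, E2, E1> */
          if ((l >> lane) & 1)     /* only active lanes have to agree */
            assert (a == b);
        }
  printf ("swapped selects agree on all active lanes\n");
  return 0;
}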

> This code is fine as far as I can tell.  But there's nothing you can do here. The mask it needs is ~original
> So it does not find an inverse mask to use because it has to honor floating point exceptions.
>
> And indeed `-fno-trapping-math` or `-Ofast` generates the most optimal sequence, but when honoring
> traps it can't re-use invert existing mask, which leaves the operation unpredicated.
>
> So is what you're requesting that it looks inside unary operators and tries to CSE the thing they're pointed to?

[Hopefully answered this above]

> In which case isn't it about the same as what I had before just that the vectorizer did the CSE itself?

IMO the VEC_COND_EXPR optimisation is more selective.  It only changes
the gimple if we know that it will have an effect (in terms of reducing
the number of SSA_NAMEs that have multiple uses).  Also, unlike the original
patch, there's no double-masking involved: ~C & LOOP_MASK will only apply
LOOP_MASK once.

> If that's the case maybe it's better to do lookups into loop_vinfo->scalar_cond_masked_set in prepare_load_store_mask?
> So that it just applies to everything?

scalar_cond_masked_set only exists to convert a mask that ignores
LOOP_MASK into one that is ANDed with LOOP_MASK.  For loads and stores
we always AND with LOOP_MASK, so a lookup isn't necessary.

Thanks,
Richard
Tamar Christina July 1, 2021, 7:04 a.m. UTC | #6
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Wednesday, June 30, 2021 6:55 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH][RFC]AArch64 SVE: Fix multiple comparison masks on
> inverted operands
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> Sent: Monday, June 14, 2021 4:55 PM
> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> >> Subject: Re: [PATCH][RFC]AArch64 SVE: Fix multiple comparison masks
> >> on inverted operands
> >>
> >> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> > Hi Richard,
> >> >> -----Original Message-----
> >> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> >> Sent: Monday, June 14, 2021 3:50 PM
> >> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> >> <Kyrylo.Tkachov@arm.com>
> >> >> Subject: Re: [PATCH][RFC]AArch64 SVE: Fix multiple comparison
> >> >> masks on inverted operands
> >> >>
> >> >> Tamar Christina <tamar.christina@arm.com> writes:
> >> >> > Hi All,
> >> >> >
> >> >> > This RFC is trying to address the following inefficiency when
> >> >> > vectorizing conditional statements with SVE.
> >> >> >
> >> >> > Consider the case
> >> >> >
> >> >> > void f10(double * restrict z, double * restrict w, double * restrict x,
> >> >> > 	 double * restrict y, int n)
> >> >> > {
> >> >> >     for (int i = 0; i < n; i++) {
> >> >> >         z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
> >> >> >     }
> >> >> > }
> >> >> >
> >> >> >
> >> >> > For which we currently generate at -O3:
> >> >> >
> >> >> > f10:
> >> >> >         cmp     w4, 0
> >> >> >         ble     .L1
> >> >> >         mov     x5, 0
> >> >> >         whilelo p1.d, wzr, w4
> >> >> >         ptrue   p3.b, all
> >> >> > .L3:
> >> >> >         ld1d    z1.d, p1/z, [x1, x5, lsl 3]
> >> >> >         fcmgt   p2.d, p1/z, z1.d, #0.0
> >> >> >         fcmgt   p0.d, p3/z, z1.d, #0.0
> >> >> >         ld1d    z2.d, p2/z, [x2, x5, lsl 3]
> >> >> >         bic     p0.b, p3/z, p1.b, p0.b
> >> >> >         ld1d    z0.d, p0/z, [x3, x5, lsl 3]
> >> >> >         fsub    z0.d, p0/m, z0.d, z1.d
> >> >> >         movprfx z0.d, p2/m, z1.d
> >> >> >         fadd    z0.d, p2/m, z0.d, z2.d
> >> >> >         st1d    z0.d, p1, [x0, x5, lsl 3]
> >> >> >         incd    x5
> >> >> >         whilelo p1.d, w5, w4
> >> >> >         b.any   .L3
> >> >> > .L1:
> >> >> >         ret
> >> >> >
> >> >> > Notice that the condition for the else branch duplicates the
> >> >> > same predicate as the then branch and then uses BIC to negate the
> results.
> >> >> >
> >> >> > The reason for this is that during instruction generation in the
> >> >> > vectorizer we emit
> >> >> >
> >> >> >   mask__41.11_66 = vect__4.10_64 > vect_cst__65;
> >> >> >   vec_mask_and_69 = mask__41.11_66 & loop_mask_63;
> >> >> >   vec_mask_and_71 = mask__41.11_66 & loop_mask_63;
> >> >> >   mask__43.16_73 = ~mask__41.11_66;
> >> >> >   vec_mask_and_76 = mask__43.16_73 & loop_mask_63;
> >> >> >   vec_mask_and_78 = mask__43.16_73 & loop_mask_63;
> >> >> >
> >> >> > which ultimately gets optimized to
> >> >> >
> >> >> >   mask__41.11_66 = vect__4.10_64 > { 0.0, ... };
> >> >> >   vec_mask_and_69 = loop_mask_63 & mask__41.11_66;
> >> >> >   mask__43.16_73 = ~mask__41.11_66;
> >> >> >   vec_mask_and_76 = loop_mask_63 & mask__43.16_73;
> >> >> >
> >> >> > Notice how the negate is on the operation and not the predicate
> >> >> > resulting from the operation.  When this is expanded this turns
> >> >> > into RTL where the negate is on the compare directly.  This
> >> >> > means the RTL is different from the one without the negate and
> >> >> > so CSE is unable to
> >> >> recognize that they are essentially same operation.
> >> >> >
> >> >> > To fix this my patch changes it so you negate the mask rather
> >> >> > than the operation
> >> >> >
> >> >> >   mask__41.13_55 = vect__4.12_53 > { 0.0, ... };
> >> >> >   vec_mask_and_58 = loop_mask_52 & mask__41.13_55;
> >> >> >   vec_mask_op_67 = ~vec_mask_and_58;
> >> >> >   vec_mask_and_65 = loop_mask_52 & vec_mask_op_67;
> >> >>
> >> >> But to me this looks like a pessimisation in gimple terms.  We've
> >> >> increased the length of the critical path: vec_mask_and_65 now
> >> >> needs a chain of
> >> >> 4 operations instead of 3.
> >> >
> >> > True, but it should reduce the number of RTL patterns.  I would
> >> > have thought RTL is more expensive to handle than gimple.
> >>
> >> I think this is only a fair gimple optimisation if gimple does the
> >> isel itself (to a predicated compare and a predicated NOT).
> >>
> >> >> We also need to be careful not to pessimise the case in which the
> >> >> comparison is an integer one.  At the moment we'll generate
> >> >> opposed conditions, which is the intended behaviour:
> >> >
> >> > This still happens with this patch at `-Ofast` because that flips
> >> > the conditions, So the different representation doesn't harm it.
> >>
> >> OK, that's good.
> >>
> >> >>
> >> >> .L3:
> >> >>         ld1d    z1.d, p0/z, [x1, x5, lsl 3]
> >> >>         cmpgt   p2.d, p0/z, z1.d, #0
> >> >>         movprfx z2, z1
> >> >>         scvtf   z2.d, p3/m, z1.d
> >> >>         cmple   p1.d, p0/z, z1.d, #0
> >> >>         ld1d    z0.d, p2/z, [x2, x5, lsl 3]
> >> >>         ld1d    z1.d, p1/z, [x3, x5, lsl 3]
> >> >>         fadd    z0.d, p2/m, z0.d, z2.d
> >> >>         movprfx z0.d, p1/m, z1.d
> >> >>         fsub    z0.d, p1/m, z0.d, z2.d
> >> >>         st1d    z0.d, p0, [x0, x5, lsl 3]
> >> >>         add     x5, x5, x6
> >> >>         whilelo p0.d, w5, w4
> >> >>         b.any   .L3
> >> >>
> >> >> Could we handle the fcmp case using a 3->2 define_split instead:
> >> >> convert
> >> >>
> >> >>    (set res (and (not (fcmp X Y)) Z)) ->
> >> >>      (set res (fcmp X Y))
> >> >>      (set res (and (not res) Z))
> >> >>
> >> >
> >> > This was the other approach I mentioned. It works, and gives you
> >> > the neg,
> >> but only in the case where the compare is single use.
> >>
> >> But in the original example we duplicate the comparison through a
> >> 2->2 combine, which leaves the original comparison as a single use.
> >> Isn't that enough?
> >>
> >> > e.g. in
> >> >
> >> > void f11(double * restrict z, double * restrict w, double *
> >> > restrict x, double * restrict y, int n) {
> >> >     for (int i = 0; i < n; i++) {
> >> >         z[i] = (w[i] > 0) ? w[i] : y[i] - w[i];
> >> >     }
> >> > }
> >> >
> >> > You have some of the same problem. It generates
> >> >
> >> >         ld1d    z0.d, p0/z, [x1, x2, lsl 3]
> >> >         fcmgt   p2.d, p3/z, z0.d, #0.0
> >> >         bic     p1.b, p3/z, p0.b, p2.b
> >> >         ld1d    z1.d, p1/z, [x3, x2, lsl 3]
> >> >         fsub    z1.d, p1/m, z1.d, z0.d
> >> >         sel     z0.d, p2, z0.d, z1.d
> >> >         st1d    z0.d, p0, [x0, x2, lsl 3]
> >> >         incd    x2
> >> >         whilelo p0.d, w2, w4
> >> >
> >> > which has two problems. fcmgt doesn't need to be predicated on p3
> >> > which is ptrue all, it can/should be p0.
> >> >
> >> > With that fixed the splitter won't match because p2 is needed in
> >> > the sel, so it's not single use and so combine won't try to build
> >> > the RTL so it can
> >> be split.
> >>
> >> I think having the vectoriser avoid the dual use between the
> >> IFN_MASK_LOAD/STORE and the VEC_COND_EXPR is fair game, since
> that is
> >> the only pass that has the context to prove that including the loop
> >> mask in the VEC_COND_EXPR condition is correct.  We already try to do
> >> that to some
> >> extent:
> >>
> >
> > Sorry I have been looking at this these past couple of days and I just
> > don't know how this is supposed to work.
> >
> > In the above example the problem is not just the use of p2 in the
> > VEC_COND_EXPR. If the VEC_COND_EXPR is changed to use p1 then p1
> now
> > has 3 uses which makes combine still not try the combination.
> >
> > But the general case
> >
> > void f10(double * restrict z, double * restrict w, double * restrict
> > x, double * restrict y, int n) {
> >     for (int i = 0; i < n; i++) {
> >         z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
> >     }
> > }
> >
> > Produces
> >
> > f10:
> >         cmp     w4, 0
> >         ble     .L1
> >         mov     x5, 0
> >         whilelo p1.d, wzr, w4
> >         ptrue   p3.b, all
> >         .p2align 5,,15
> > .L3:
> >         ld1d    z1.d, p1/z, [x1, x5, lsl 3]
> >         fcmgt   p2.d, p1/z, z1.d, #0.0
> >         fcmgt   p0.d, p3/z, z1.d, #0.0
> >         ld1d    z2.d, p2/z, [x2, x5, lsl 3]
> >         bic     p0.b, p3/z, p1.b, p0.b
> >         ld1d    z0.d, p0/z, [x3, x5, lsl 3]
> >         fsub    z0.d, p0/m, z0.d, z1.d
> >         movprfx z0.d, p2/m, z1.d
> >         fadd    z0.d, p2/m, z0.d, z2.d
> >         st1d    z0.d, p1, [x0, x5, lsl 3]
> >         incd    x5
> >         whilelo p1.d, w5, w4
> >         b.any   .L3
> >
> > where the VEC_COND_EXPR has been elided.
> >
> > The problem is that the comparison for the inverse case is unpredicated.
> 
> Yeah, for the original f10 example I was suggesting that we use combine
> instead of changing the vectoriser.  It turns out that the 3->2 split thing I
> mentioned above won't work though, because we match the BIC first.
> But I think something like:
> 
> (define_insn_and_split "*inverted_fcmgt"
>   [(set (match_operand:<VPRED> 0 "register_operand" "=Upa")
> 	(and:<VPRED>
> 	  (and:<VPRED>
> 	    (not:<VPRED>
> 	      (unspec:<VPRED>
> 		[(match_operand:<VPRED> 1 "register_operand" "Upl")
> 		 (const_int SVE_KNOWN_PTRUE)
> 		 (match_operand:SVE_FULL_F 2 "register_operand" "w")
> 		 (match_operand:SVE_FULL_F 3
> "aarch64_simd_reg_or_zero" "wDz")]
> 		SVE_COND_FP_CMP_I0))
> 	    (match_operand:<VPRED> 4 "register_operand" "w"))
> 	  (match_dup 1)))]
>   "TARGET_SVE && !reg_overlap_mentioned_p (operands[4], operands[0])"
>   "#"
>   "&& 1"
>   [(set (match_dup 0)
> 	(unspec:<VPRED>
> 	  [(match_dup 4)
> 	   (const_int SVE_MAYBE_NOT_PTRUE)
> 	   (match_dup 2)
> 	   (match_dup 3)]
> 	  SVE_COND_FP_CMP_I0))
>    (set (match_dup 0)
> 	(and:<VPRED> (not:<VPRED> (match_dup 0)) (match_dup 4)))]
> )
> 
> is a legitimate optimisation in its own right because it gets rid of the PTRUE
> operand (at split time).  This would be a good thing to do wherever the
> FCMxx and BIC come from.

Ah I see...

I was trying with

;; Make sure that inversions of masked comparisons are always on the mask
;; instead of on the operation.
(define_insn_and_split "*mask_inv_combine"
  [(match_scratch:<VPRED> 5 "=Upa, Upa")
   (set (match_operand:<VPRED> 0 "register_operand" "=Upa, Upa")
       (and:<VPRED>
         (and:<VPRED>
           (not:<VPRED>
             (unspec:<VPRED>
               [(match_operand:<VPRED> 1)
                (const_int SVE_KNOWN_PTRUE)
                (match_operand:SVE_FULL_F 2 "register_operand" "w, w")
                (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "Dz, w")]
               SVE_COND_FP_CMP_I0))
           (match_operand:<VPRED> 4 "register_operand" "Upa, Upa"))
             (match_dup:<VPRED> 1)))]
  "TARGET_SVE"
  "#"
  "&& 1"
  [(set (match_dup 0)
       (and:<VPRED>
         (unspec:<VPRED>
           [(match_dup 1)
            (const_int SVE_KNOWN_PTRUE)
            (match_dup 2)
            (match_dup 3)]
           SVE_COND_FP_CMP_I0)
             (match_dup 4)))
   (set (match_dup 0)
       (and:<VPRED>
         (not:<VPRED>
           (match_dup 0))
         (match_dup 4)))]
)

But the difference seems to be that I need to use (const_int SVE_MAYBE_NOT_PTRUE)
after the split. Could you perhaps point me to what this means in RTL?

The predicate itself is operand 1, so I just want to know what exactly this does to
the semantics.

> 
> The snag is that we don't then CSE the comparisons between split1 and RA.
> The post-RA optimisers might then leave a move.  E.g. for plain -O3 the
> above pattern gives the expected:
> 
>         cmp     w4, 0
>         ble     .L1
>         mov     x5, 0
>         cntd    x6
>         whilelo p0.d, wzr, w4
>         .p2align 3,,7
> .L3:
>         ld1d    z1.d, p0/z, [x1, x5, lsl 3]
>         fcmgt   p2.d, p0/z, z1.d, #0.0
>         not     p1.b, p0/z, p2.b
>         ld1d    z2.d, p2/z, [x2, x5, lsl 3]
>         ld1d    z0.d, p1/z, [x3, x5, lsl 3]
>         fsub    z0.d, p1/m, z0.d, z1.d
>         movprfx z0.d, p2/m, z1.d
>         fadd    z0.d, p2/m, z0.d, z2.d
>         st1d    z0.d, p0, [x0, x5, lsl 3]
>         add     x5, x5, x6
>         whilelo p0.d, w5, w4
>         b.any   .L3
> .L1:
>         ret
> 
> but -O3 -fno-trapping-math has a predicate move:
> 
>         cmp     w4, 0
>         ble     .L1
>         mov     x5, 0
>         cntd    x6
>         whilelo p0.d, wzr, w4
>         .p2align 3,,7
> .L3:
>         ld1d    z1.d, p0/z, [x1, x5, lsl 3]
>         fcmgt   p1.d, p0/z, z1.d, #0.0
>         mov     p2.b, p1.b
>         not     p1.b, p0/z, p1.b
>         ld1d    z0.d, p2/z, [x2, x5, lsl 3]
>         ld1d    z2.d, p1/z, [x3, x5, lsl 3]
>         fadd    z0.d, z1.d, z0.d
>         movprfx z0.d, p1/m, z2.d
>         fsub    z0.d, p1/m, z0.d, z1.d
>         st1d    z0.d, p0, [x0, x5, lsl 3]
>         add     x5, x5, x6
>         whilelo p0.d, w5, w4
>         b.any   .L3
> .L1:
>         ret
> 
> It would be good to fix this in RTL “somehow”.
> 
> But for f11 we still get:
> 
> f11:
> .LFB1:
>         .cfi_startproc
>         cmp     w4, 0
>         ble     .L6
>         mov     x2, 0
>         cntd    x5
>         whilelo p1.d, wzr, w4
>         ptrue   p3.b, all
>         .p2align 3,,7
> .L8:
>         ld1d    z0.d, p1/z, [x1, x2, lsl 3]
>         fcmgt   p0.d, p1/z, z0.d, #0.0
>         fcmgt   p2.d, p3/z, z0.d, #0.0
>         not     p0.b, p1/z, p0.b
>         ld1d    z1.d, p0/z, [x3, x2, lsl 3]
>         fsub    z1.d, p0/m, z1.d, z0.d
>         sel     z0.d, p2, z0.d, z1.d
>         st1d    z0.d, p1, [x0, x2, lsl 3]
>         add     x2, x2, x5
>         whilelo p1.d, w2, w4
>         b.any   .L8
> .L6:
>         ret
> 
> This is where the VEC_COND_EXPR code I mentioned should come in.
> At the moment, before generating:
> 
>   VEC_COND_EXPR<C, E1, E2>
> 
> we check whether there are any other operations predicated on C &
> LOOP_MASK.  If so, we use:
> 
>   VEC_COND_EXPR<C & LOOP_MASK, E1, E2>
> 
> instead.  This avoids both C and C & LOOP_MASK being live at the same time.
> 
> The change I was suggesting was that we should also check whether there
> are any operations predicated on ~C & LOOP_MASK.  If so, we should
> generate:
> 
>   VEC_COND_EXPR<~C & LOOP_MASK, E2, E1>
> 
> Again, the justification is that we don't have C and ~C & LOOP_MASK live at
> the same time.  If C itself has the form ~C' then swapping the operands also
> saves a negation.
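
As a rough, self-contained C model of the check described above (this is not the
actual GCC code; the real lookup goes through loop_vinfo->scalar_cond_masked_set,
and all the names below are illustrative stand-ins), the decision is:

/* Toy model of the VEC_COND_EXPR mask-reuse decision: pick the condition
   based on which masked conditions other statements already use.
   masked_conds stands in for loop_vinfo->scalar_cond_masked_set.  */

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static const char *masked_conds[8];
static int n_masked_conds;

static bool
masked_set_contains (const char *cond)
{
  for (int i = 0; i < n_masked_conds; i++)
    if (strcmp (masked_conds[i], cond) == 0)
      return true;
  return false;
}

/* Return the condition the VEC_COND_EXPR should use and set *SWAP_ARMS
   when the then/else arms have to be exchanged.  COND is C and INV_COND
   is ~C.  */
static const char *
choose_vec_cond_mask (const char *cond, const char *inv_cond, bool *swap_arms)
{
  *swap_arms = false;
  if (masked_set_contains (cond))
    /* Other statements already use C & LOOP_MASK, so reuse it.  */
    return cond;
  if (masked_set_contains (inv_cond))
    {
      /* Only ~C & LOOP_MASK is used elsewhere: reuse it and swap the
	 arms, so that C and ~C & LOOP_MASK are never both live.  */
      *swap_arms = true;
      return inv_cond;
    }
  /* No reuse opportunity; keep the plain condition.  */
  return cond;
}

int
main (void)
{
  /* f11-like situation: only the else arm's load and subtract are
     predicated, and they use ~C & LOOP_MASK.  */
  masked_conds[n_masked_conds++] = "~C";

  bool swap_arms;
  const char *c = choose_vec_cond_mask ("C", "~C", &swap_arms);
  printf ("VEC_COND_EXPR<%s & LOOP_MASK, %s, %s>\n",
	  c, swap_arms ? "E2" : "E1", swap_arms ? "E1" : "E2");
  return 0;
}

For the f11 case this prints VEC_COND_EXPR<~C & LOOP_MASK, E2, E1>, i.e. the
reused inverse mask with the arms swapped.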
> 
> > This code is fine as far as I can tell.  But there's nothing you can do here.
> > The mask it needs is ~original, so it does not find an inverse mask to use
> > because it has to honor floating point exceptions.
> >
> > And indeed `-fno-trapping-math` or `-Ofast` generates the most optimal
> > sequence, but when honoring traps it can't re-use the inverted existing mask,
> > which leaves the operation unpredicated.
> >
> > So is what you're requesting that it looks inside unary operators and tries to
> > CSE the thing they point to?
> 
> [Hopefully answered this above]

Yes, it does, along with the split above. I had originally thought you meant the
VEC_COND_EXPR change should take care of both problems.

> 
> > In which case isn't it about the same as what I had before just that the
> vectorizer did the CSE itself?
> 
> IMO the VEC_COND_EXPR optimisation is more selective.  It only changes
> the gimple if we know that it will have an effect (in terms of reducing the
> number of SSA_NAMEs that have multiple uses).  Also, unlike the original
> patch, there's no double-masking invovled: ~C & LOOP_MASK will only apply
> LOOP_MASK once.
> 
> > If that's the case, maybe it's better to do lookups into
> > loop_vinfo->scalar_cond_masked_set in prepare_load_store_mask?
> > So that it just applies to everything?
> 
> scalar_cond_masked_set only exists to convert a mask that ignores
> LOOP_MASK into one that is ANDed with LOOP_MASK.  For loads and stores
> we always AND with LOOP_MASK, so a lookup isn't necessary.

Fair enough.

Thanks for the explanation!

I'll keep this in mind for the other patches.

Thanks,
Tamar
> 
> Thanks,
> Richard
Richard Sandiford July 1, 2021, 9:15 a.m. UTC | #7
Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Wednesday, June 30, 2021 6:55 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
>> Subject: Re: [PATCH][RFC]AArch64 SVE: Fix multiple comparison masks on
>> inverted operands
>> 
>> Tamar Christina <Tamar.Christina@arm.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford <richard.sandiford@arm.com>
>> >> Sent: Monday, June 14, 2021 4:55 PM
>> >> To: Tamar Christina <Tamar.Christina@arm.com>
>> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> <Kyrylo.Tkachov@arm.com>
>> >> Subject: Re: [PATCH][RFC]AArch64 SVE: Fix multiple comparison masks
>> >> on inverted operands
>> >>
>> >> Tamar Christina <Tamar.Christina@arm.com> writes:
>> >> > Hi Richard,
>> >> >> -----Original Message-----
>> >> >> From: Richard Sandiford <richard.sandiford@arm.com>
>> >> >> Sent: Monday, June 14, 2021 3:50 PM
>> >> >> To: Tamar Christina <Tamar.Christina@arm.com>
>> >> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> >> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> >> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> >> <Kyrylo.Tkachov@arm.com>
>> >> >> Subject: Re: [PATCH][RFC]AArch64 SVE: Fix multiple comparison
>> >> >> masks on inverted operands
>> >> >>
>> >> >> Tamar Christina <tamar.christina@arm.com> writes:
>> >> >> > Hi All,
>> >> >> >
>> >> >> > This RFC is trying to address the following inefficiency when
>> >> >> > vectorizing conditional statements with SVE.
>> >> >> >
>> >> >> > Consider the case
>> >> >> >
>> >> >> > void f10(double * restrict z, double * restrict w, double * restrict x,
>> >> >> > 	 double * restrict y, int n)
>> >> >> > {
>> >> >> >     for (int i = 0; i < n; i++) {
>> >> >> >         z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
>> >> >> >     }
>> >> >> > }
>> >> >> >
>> >> >> >
>> >> >> > For which we currently generate at -O3:
>> >> >> >
>> >> >> > f10:
>> >> >> >         cmp     w4, 0
>> >> >> >         ble     .L1
>> >> >> >         mov     x5, 0
>> >> >> >         whilelo p1.d, wzr, w4
>> >> >> >         ptrue   p3.b, all
>> >> >> > .L3:
>> >> >> >         ld1d    z1.d, p1/z, [x1, x5, lsl 3]
>> >> >> >         fcmgt   p2.d, p1/z, z1.d, #0.0
>> >> >> >         fcmgt   p0.d, p3/z, z1.d, #0.0
>> >> >> >         ld1d    z2.d, p2/z, [x2, x5, lsl 3]
>> >> >> >         bic     p0.b, p3/z, p1.b, p0.b
>> >> >> >         ld1d    z0.d, p0/z, [x3, x5, lsl 3]
>> >> >> >         fsub    z0.d, p0/m, z0.d, z1.d
>> >> >> >         movprfx z0.d, p2/m, z1.d
>> >> >> >         fadd    z0.d, p2/m, z0.d, z2.d
>> >> >> >         st1d    z0.d, p1, [x0, x5, lsl 3]
>> >> >> >         incd    x5
>> >> >> >         whilelo p1.d, w5, w4
>> >> >> >         b.any   .L3
>> >> >> > .L1:
>> >> >> >         ret
>> >> >> >
>> >> >> > Notice that the condition for the else branch duplicates the
>> >> >> > same predicate as the then branch and then uses BIC to negate the
>> results.
>> >> >> >
>> >> >> > The reason for this is that during instruction generation in the
>> >> >> > vectorizer we emit
>> >> >> >
>> >> >> >   mask__41.11_66 = vect__4.10_64 > vect_cst__65;
>> >> >> >   vec_mask_and_69 = mask__41.11_66 & loop_mask_63;
>> >> >> >   vec_mask_and_71 = mask__41.11_66 & loop_mask_63;
>> >> >> >   mask__43.16_73 = ~mask__41.11_66;
>> >> >> >   vec_mask_and_76 = mask__43.16_73 & loop_mask_63;
>> >> >> >   vec_mask_and_78 = mask__43.16_73 & loop_mask_63;
>> >> >> >
>> >> >> > which ultimately gets optimized to
>> >> >> >
>> >> >> >   mask__41.11_66 = vect__4.10_64 > { 0.0, ... };
>> >> >> >   vec_mask_and_69 = loop_mask_63 & mask__41.11_66;
>> >> >> >   mask__43.16_73 = ~mask__41.11_66;
>> >> >> >   vec_mask_and_76 = loop_mask_63 & mask__43.16_73;
>> >> >> >
>> >> >> > Notice how the negate is on the operation and not the predicate
>> >> >> > resulting from the operation.  When this is expanded this turns
>> >> >> > into RTL where the negate is on the compare directly.  This
>> >> >> > means the RTL is different from the one without the negate and
>> >> >> > so CSE is unable to
>> >> >> recognize that they are essentially same operation.
>> >> >> >
>> >> >> > To fix this my patch changes it so you negate the mask rather
>> >> >> > than the operation
>> >> >> >
>> >> >> >   mask__41.13_55 = vect__4.12_53 > { 0.0, ... };
>> >> >> >   vec_mask_and_58 = loop_mask_52 & mask__41.13_55;
>> >> >> >   vec_mask_op_67 = ~vec_mask_and_58;
>> >> >> >   vec_mask_and_65 = loop_mask_52 & vec_mask_op_67;
>> >> >>
>> >> >> But to me this looks like a pessimisation in gimple terms.  We've
>> >> >> increased the length of the critical path: vec_mask_and_65 now
>> >> >> needs a chain of
>> >> >> 4 operations instead of 3.
>> >> >
>> >> > True, but it should reduce the number of RTL patterns.  I would
>> >> > have thought RTL is more expensive to handle than gimple.
>> >>
>> >> I think this is only a fair gimple optimisation if gimple does the
>> >> isel itself (to a predicated compare and a predicated NOT).
>> >>
>> >> >> We also need to be careful not to pessimise the case in which the
>> >> >> comparison is an integer one.  At the moment we'll generate
>> >> >> opposed conditions, which is the intended behaviour:
>> >> >
>> >> > This still happens with this patch at `-Ofast` because that flips
>> >> > the conditions, so the different representation doesn't harm it.
>> >>
>> >> OK, that's good.
>> >>
>> >> >>
>> >> >> .L3:
>> >> >>         ld1d    z1.d, p0/z, [x1, x5, lsl 3]
>> >> >>         cmpgt   p2.d, p0/z, z1.d, #0
>> >> >>         movprfx z2, z1
>> >> >>         scvtf   z2.d, p3/m, z1.d
>> >> >>         cmple   p1.d, p0/z, z1.d, #0
>> >> >>         ld1d    z0.d, p2/z, [x2, x5, lsl 3]
>> >> >>         ld1d    z1.d, p1/z, [x3, x5, lsl 3]
>> >> >>         fadd    z0.d, p2/m, z0.d, z2.d
>> >> >>         movprfx z0.d, p1/m, z1.d
>> >> >>         fsub    z0.d, p1/m, z0.d, z2.d
>> >> >>         st1d    z0.d, p0, [x0, x5, lsl 3]
>> >> >>         add     x5, x5, x6
>> >> >>         whilelo p0.d, w5, w4
>> >> >>         b.any   .L3
>> >> >>
>> >> >> Could we handle the fcmp case using a 3->2 define_split instead:
>> >> >> convert
>> >> >>
>> >> >>    (set res (and (not (fcmp X Y)) Z)) ->
>> >> >>      (set res (fcmp X Y))
>> >> >>      (set res (and (not res) Z))
>> >> >>
>> >> >
>> >> > This was the other approach I mentioned. It works, and gives you
>> >> > the neg,
>> >> but only in the case where the compare is single use.
>> >>
>> >> But in the original example we duplicate the comparison through a
>> >> 2->2 combine, which leaves the original comparison as a single use.
>> >> Isn't that enough?
>> >>
>> >> > e.g. in
>> >> >
>> >> > void f11(double * restrict z, double * restrict w, double *
>> >> > restrict x, double * restrict y, int n) {
>> >> >     for (int i = 0; i < n; i++) {
>> >> >         z[i] = (w[i] > 0) ? w[i] : y[i] - w[i];
>> >> >     }
>> >> > }
>> >> >
>> >> > You have some of the same problem. It generates
>> >> >
>> >> >         ld1d    z0.d, p0/z, [x1, x2, lsl 3]
>> >> >         fcmgt   p2.d, p3/z, z0.d, #0.0
>> >> >         bic     p1.b, p3/z, p0.b, p2.b
>> >> >         ld1d    z1.d, p1/z, [x3, x2, lsl 3]
>> >> >         fsub    z1.d, p1/m, z1.d, z0.d
>> >> >         sel     z0.d, p2, z0.d, z1.d
>> >> >         st1d    z0.d, p0, [x0, x2, lsl 3]
>> >> >         incd    x2
>> >> >         whilelo p0.d, w2, w4
>> >> >
>> >> > which has two problems. fcmgt doesn't need to be predicated on p3
>> >> > which is ptrue all, it can/should be p0.
>> >> >
>> >> > With that fixed the splitter won't match because p2 is needed in
>> >> > the sel, so it's not single use and so combine won't try to build
>> >> > the RTL so it can
>> >> be split.
>> >>
>> >> I think having the vectoriser avoid the dual use between the
>> >> IFN_MASK_LOAD/STORE and the VEC_COND_EXPR is fair game, since
>> that is
>> >> the only pass that has the context to prove that including the loop
>> >> mask in the VEC_COND_EXPR condition is correct.  We already try to do
>> >> that to some
>> >> extent:
>> >>
>> >
>> > Sorry, I have been looking at this for the past couple of days and I just
>> > don't know how this is supposed to work.
>> >
>> > In the above example the problem is not just the use of p2 in the
>> > VEC_COND_EXPR. If the VEC_COND_EXPR is changed to use p1 then p1
>> now
>> > has 3 uses which makes combine still not try the combination.
>> >
>> > But the general case
>> >
>> > void f10(double * restrict z, double * restrict w, double * restrict
>> > x, double * restrict y, int n) {
>> >     for (int i = 0; i < n; i++) {
>> >         z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
>> >     }
>> > }
>> >
>> > Produces
>> >
>> > f10:
>> >         cmp     w4, 0
>> >         ble     .L1
>> >         mov     x5, 0
>> >         whilelo p1.d, wzr, w4
>> >         ptrue   p3.b, all
>> >         .p2align 5,,15
>> > .L3:
>> >         ld1d    z1.d, p1/z, [x1, x5, lsl 3]
>> >         fcmgt   p2.d, p1/z, z1.d, #0.0
>> >         fcmgt   p0.d, p3/z, z1.d, #0.0
>> >         ld1d    z2.d, p2/z, [x2, x5, lsl 3]
>> >         bic     p0.b, p3/z, p1.b, p0.b
>> >         ld1d    z0.d, p0/z, [x3, x5, lsl 3]
>> >         fsub    z0.d, p0/m, z0.d, z1.d
>> >         movprfx z0.d, p2/m, z1.d
>> >         fadd    z0.d, p2/m, z0.d, z2.d
>> >         st1d    z0.d, p1, [x0, x5, lsl 3]
>> >         incd    x5
>> >         whilelo p1.d, w5, w4
>> >         b.any   .L3
>> >
>> > where the VEC_COND_EXPR has been elided.
>> >
>> > The problem is that the comparison for the inverse case is unpredicated.
>> 
>> Yeah, for the original f10 example I was suggesting that we use combine
>> instead of changing the vectoriser.  It turns out that the 3->2 split thing I
>> mentioned above won't work though, because we match the BIC first.
>> But I think something like:
>> 
>> (define_insn_and_split "*inverted_fcmgt"
>>   [(set (match_operand:<VPRED> 0 "register_operand" "=Upa")

Oops, should have been =&Upa

>> 	(and:<VPRED>
>> 	  (and:<VPRED>
>> 	    (not:<VPRED>
>> 	      (unspec:<VPRED>
>> 		[(match_operand:<VPRED> 1 "register_operand" "Upl")
>> 		 (const_int SVE_KNOWN_PTRUE)
>> 		 (match_operand:SVE_FULL_F 2 "register_operand" "w")
>> 		 (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "wDz")]
>> 		SVE_COND_FP_CMP_I0))
>> 	    (match_operand:<VPRED> 4 "register_operand" "w"))
>> 	  (match_dup 1)))]
>>   "TARGET_SVE && !reg_overlap_mentioned_p (operands[4], operands[0])"
>>   "#"
>>   "&& 1"
>>   [(set (match_dup 0)
>> 	(unspec:<VPRED>
>> 	  [(match_dup 4)
>> 	   (const_int SVE_MAYBE_NOT_PTRUE)
>> 	   (match_dup 2)
>> 	   (match_dup 3)]
>> 	  SVE_COND_FP_CMP_I0))
>>    (set (match_dup 0)
>> 	(and:<VPRED> (not:<VPRED> (match_dup 0)) (match_dup 4)))]
>> )
>> 
>> is a legitimate optimisation in its own right because it gets rid of the PTRUE
>> operand (at split time).  This would be a good thing to do wherever the
>> FCMxx and BIC come from.
>
> Ah I see...
>
> I was trying with
>
> ;; Make sure that inversions of masked comparisons are always on the mask
> ;; instead of on the operation.
> (define_insn_and_split "*mask_inv_combine"
>   [(match_scratch:<VPRED> 5 "=Upa, Upa")
>    (set (match_operand:<VPRED> 0 "register_operand" "=Upa, Upa")
>        (and:<VPRED>
>          (and:<VPRED>
>            (not:<VPRED>
>              (unspec:<VPRED>
>                [(match_operand:<VPRED> 1)
>                 (const_int SVE_KNOWN_PTRUE)
>                 (match_operand:SVE_FULL_F 2 "register_operand" "w, w")
>                 (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "Dz, w")]
>                SVE_COND_FP_CMP_I0))
>            (match_operand:<VPRED> 4 "register_operand" "Upa, Upa"))
>              (match_dup:<VPRED> 1)))]
>   "TARGET_SVE"

The reg_overlap_mentioned_p check above is needed because the split pattern
uses operand 4 after writing to operand 0.

>   "#"
>   "&& 1"
>   [(set (match_dup 0)
>        (and:<VPRED>
>          (unspec:<VPRED>
>            [(match_dup 1)
>             (const_int SVE_KNOWN_PTRUE)
>             (match_dup 2)
>             (match_dup 3)]
>            SVE_COND_FP_CMP_I0)
>              (match_dup 4)))
>    (set (match_dup 0)
>        (and:<VPRED>
>          (not:<VPRED>
>            (match_dup 0))
>          (match_dup 4)))]
> )
>
> But the difference seems to be that I need to use (const_int SVE_MAYBE_NOT_PTRUE)
> after the split. Could you perhaps point me to what this means in RTL?
>
> The predicate itself is operand 1, so I just want to know what exactly this does to the semantics.

Yeah, the split fcmp predicate is operand 1 in your version and operand 4
in mine.  SVE_KNOWN_PTRUE is correct for operand 1 but not for operand 4.

This is from:

;; - PTRUE_FLAG is a CONST_INT (conceptually of mode SI) that has the value
;;   SVE_KNOWN_PTRUE if we know that CAST_GP (rather than GP) is all-true and
;;   SVE_MAYBE_NOT_PTRUE otherwise.

(in the context of UNSPEC_PTEST).  So SVE_KNOWN_PTRUE guarantees that the
gp is an all-true predicate while SVE_MAYBE_NOT_PTRUE doesn't.

Apart from the missing reg_overlap_mentioned_p check and earlyclobber, the pattern
above looks correct as well.  However, the fcmp instruction in the
split pattern will match:

;; Floating-point comparisons predicated on a PTRUE, with the results ANDed
;; with another predicate P.  This does not have the same trapping behavior
;; as predicating the comparison itself on P, but it's a legitimate fold,
;; since we can drop any potentially-trapping operations whose results
;; are not needed.
;;
;; Split the instruction into its preferred form (below) at the earliest
;; opportunity, in order to get rid of the redundant operand 1.
(define_insn_and_split "*fcm<cmp_op><mode>_and_combine"
  [(set (match_operand:<VPRED> 0 "register_operand" "=Upa, Upa")
	(and:<VPRED>
	  (unspec:<VPRED>
	    [(match_operand:<VPRED> 1)
	     (const_int SVE_KNOWN_PTRUE)
	     (match_operand:SVE_FULL_F 2 "register_operand" "w, w")
	     (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "Dz, w")]
	    SVE_COND_FP_CMP_I0)
	  (match_operand:<VPRED> 4 "register_operand" "Upl, Upl")))]
  "TARGET_SVE"
  "#"
  "&& 1"
  [(set (match_dup 0)
	(unspec:<VPRED>
	  [(match_dup 4)
	   (const_int SVE_MAYBE_NOT_PTRUE)
	   (match_dup 2)
	   (match_dup 3)]
	  SVE_COND_FP_CMP_I0))]
)

and so would itself need to be split.  I think it'd be better to generate the
final form directly, partly because it's simpler, but mostly because
it makes it clearer that we really are getting rid of the PTRUE operand
(which is the justification for this being an optimisation).

>> The snag is that we don't then CSE the comparisons between split1 and RA.
>> The post-RA optimisers might then leave a move.  E.g. for plain -O3 the
>> above pattern gives the expected:
>> 
>>         cmp     w4, 0
>>         ble     .L1
>>         mov     x5, 0
>>         cntd    x6
>>         whilelo p0.d, wzr, w4
>>         .p2align 3,,7
>> .L3:
>>         ld1d    z1.d, p0/z, [x1, x5, lsl 3]
>>         fcmgt   p2.d, p0/z, z1.d, #0.0
>>         not     p1.b, p0/z, p2.b
>>         ld1d    z2.d, p2/z, [x2, x5, lsl 3]
>>         ld1d    z0.d, p1/z, [x3, x5, lsl 3]
>>         fsub    z0.d, p1/m, z0.d, z1.d
>>         movprfx z0.d, p2/m, z1.d
>>         fadd    z0.d, p2/m, z0.d, z2.d
>>         st1d    z0.d, p0, [x0, x5, lsl 3]
>>         add     x5, x5, x6
>>         whilelo p0.d, w5, w4
>>         b.any   .L3
>> .L1:
>>         ret
>> 
>> but -O3 -fno-trapping-math has a predicate move:
>> 
>>         cmp     w4, 0
>>         ble     .L1
>>         mov     x5, 0
>>         cntd    x6
>>         whilelo p0.d, wzr, w4
>>         .p2align 3,,7
>> .L3:
>>         ld1d    z1.d, p0/z, [x1, x5, lsl 3]
>>         fcmgt   p1.d, p0/z, z1.d, #0.0
>>         mov     p2.b, p1.b
>>         not     p1.b, p0/z, p1.b
>>         ld1d    z0.d, p2/z, [x2, x5, lsl 3]
>>         ld1d    z2.d, p1/z, [x3, x5, lsl 3]
>>         fadd    z0.d, z1.d, z0.d
>>         movprfx z0.d, p1/m, z2.d
>>         fsub    z0.d, p1/m, z0.d, z1.d
>>         st1d    z0.d, p0, [x0, x5, lsl 3]
>>         add     x5, x5, x6
>>         whilelo p0.d, w5, w4
>>         b.any   .L3
>> .L1:
>>         ret
>> 
>> It would be good to fix this in RTL “somehow”.
>> 
>> But for f11 we still get:
>> 
>> f11:
>> .LFB1:
>>         .cfi_startproc
>>         cmp     w4, 0
>>         ble     .L6
>>         mov     x2, 0
>>         cntd    x5
>>         whilelo p1.d, wzr, w4
>>         ptrue   p3.b, all
>>         .p2align 3,,7
>> .L8:
>>         ld1d    z0.d, p1/z, [x1, x2, lsl 3]
>>         fcmgt   p0.d, p1/z, z0.d, #0.0
>>         fcmgt   p2.d, p3/z, z0.d, #0.0
>>         not     p0.b, p1/z, p0.b
>>         ld1d    z1.d, p0/z, [x3, x2, lsl 3]
>>         fsub    z1.d, p0/m, z1.d, z0.d
>>         sel     z0.d, p2, z0.d, z1.d
>>         st1d    z0.d, p1, [x0, x2, lsl 3]
>>         add     x2, x2, x5
>>         whilelo p1.d, w2, w4
>>         b.any   .L8
>> .L6:
>>         ret
>> 
>> This is where the VEC_COND_EXPR code I mentioned should come in.
>> At the moment, before generating:
>> 
>>   VEC_COND_EXPR<C, E1, E2>
>> 
>> we check whether there are any other operations predicated on C &
>> LOOP_MASK.  If so, we use:
>> 
>>   VEC_COND_EXPR<C & LOOP_MASK, E1, E2>
>> 
>> instead.  This avoids both C and C & LOOP_MASK being live at the same time.
>> 
>> The change I was suggesting was that we should also check whether there
>> are any operations predicated on ~C & LOOP_MASK.  If so, we should
>> generate:
>> 
>>   VEC_COND_EXPR<~C & LOOP_MASK, E2, E1>
>> 
>> Again, the justification is that we don't have C and ~C & LOOP_MASK live at
>> the same time.  If C itself has the form ~C' then swapping the operands also
>> saves a negation.
>> 
>> > This code is fine as far as I can tell.  But there's nothing you can do here.
>> > The mask it needs is ~original, so it does not find an inverse mask to use
>> > because it has to honor floating point exceptions.
>> >
>> > And indeed `-fno-trapping-math` or `-Ofast` generates the most optimal
>> > sequence, but when honoring traps it can't re-use the inverted existing mask,
>> > which leaves the operation unpredicated.
>> >
>> > So is what you're requesting that it looks inside unary operators and tries to
>> > CSE the thing they point to?
>> 
>> [Hopefully answered this above]
>
> Yes, it does, along with the split above. I had originally thought you meant the
> VEC_COND_EXPR change should take care of both problems.

Ah, no, I just meant the f11 example, sorry.

Thanks,
Richard
diff mbox series

Patch

diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index eeef96a2eb60853e9c18a288af9e49ae9ad65128..35e5212c77d7cb26b1a2b9645cbac22c30078fb8 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1785,8 +1785,38 @@  prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
 
   gcc_assert (TREE_TYPE (loop_mask) == mask_type);
   tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
+  tree final_mask = vec_mask;
+
+  /* Check if what vec_mask is pointing at is a unary operator and if so
+     expand the operand before the mask and not on the operation to allow
+     for better CSE.  */
+  if (TREE_CODE (vec_mask) == SSA_NAME)
+    {
+      gimple *stmt = SSA_NAME_DEF_STMT (vec_mask);
+      if (is_gimple_assign (stmt)
+	  && gimple_assign_rhs_class (stmt) == GIMPLE_UNARY_RHS)
+	{
+	  tree_code code = gimple_assign_rhs_code (stmt);
+	  tree pred_op = gimple_assign_rhs1 (stmt);
+
+	  /* Predicate the operation first.  */
+	  gimple *pred_stmt;
+	  tree pred_res1 = make_temp_ssa_name (mask_type, NULL, "vec_mask_op");
+	  pred_stmt = gimple_build_assign (pred_res1, BIT_AND_EXPR,
+					   pred_op, loop_mask);
+	  gsi_insert_before (gsi, pred_stmt, GSI_SAME_STMT);
+
+	  /* Now move the operation to the top and predicate it.  */
+	  tree pred_res2 = make_temp_ssa_name (mask_type, NULL, "vec_mask_op");
+	  pred_stmt = gimple_build_assign (pred_res2, code,
+					   pred_res1);
+	  gsi_insert_before (gsi, pred_stmt, GSI_SAME_STMT);
+	  final_mask = pred_res2;
+	}
+    }
+
   gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
-					  vec_mask, loop_mask);
+					  final_mask, loop_mask);
   gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
   return and_res;
 }