diff mbox series

[v2,10/16] AArch64: Add NEON RTL patterns for Complex Addition, Multiply and FMA.

Message ID 20200925143005.GA24088@arm.com
State New
Headers show
Series middle-end Add support for SLP vectorization of complex number instructions. | expand

Commit Message

Tamar Christina Sept. 25, 2020, 2:30 p.m. UTC
Hi All,

This adds implementation for the optabs for complex operations.  With this the
following C code:

  void f90 (float complex a[restrict N], float complex b[restrict N],
	    float complex c[restrict N])
  {
    for (int i=0; i < N; i++)
      c[i] = a[i] + (b[i] * I);
  }

generates

  f90:
	  mov     x3, 0
	  .p2align 3,,7
  .L2:
	  ldr     q0, [x0, x3]
	  ldr     q1, [x1, x3]
	  fcadd   v0.4s, v0.4s, v1.4s, #90
	  str     q0, [x2, x3]
	  add     x3, x3, 16
	  cmp     x3, 1600
	  bne     .L2
	  ret

instead of

  f90:
	  add     x3, x1, 1600
	  .p2align 3,,7
  .L2:
	  ld2     {v4.4s - v5.4s}, [x0], 32
	  ld2     {v2.4s - v3.4s}, [x1], 32
	  fsub    v0.4s, v4.4s, v3.4s
	  fadd    v1.4s, v5.4s, v2.4s
	  st2     {v0.4s - v1.4s}, [x2], 32
	  cmp     x3, x1
	  bne     .L2
	  ret

It defined a new iterator VALL_ARITH which contains types for which we can do
general arithmetic (excludes bfloat16).

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (cadd<rot><mode>3,
	cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New.
	* config/aarch64/iterators.md (VALL_ARITH, UNSPEC_FCMUL,
	UNSPEC_FCMUL180, UNSPEC_FCMLS, UNSPEC_FCMLS180, UNSPEC_CMLS,
	UNSPEC_CMLS180, UNSPEC_CMUL, UNSPEC_CMUL180, FCMLA_OP, FCMUL_OP, rot_op,
	rotsplit1, rotsplit2, fcmac1): New.
	(rot): Add UNSPEC_FCMLS, UNSPEC_FCMUL, UNSPEC_FCMUL180.

--

Comments

Tamar Christina Nov. 14, 2020, 3:11 p.m. UTC | #1
ping

> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of Tamar
> Christina
> Sent: Friday, September 25, 2020 3:30 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Richard Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>
> Subject: [PATCH v2 10/16]AArch64: Add NEON RTL patterns for Complex
> Addition, Multiply and FMA.
> 
> Hi All,
> 
> This adds implementation for the optabs for complex operations.  With this
> the following C code:
> 
>   void f90 (float complex a[restrict N], float complex b[restrict N],
> 	    float complex c[restrict N])
>   {
>     for (int i=0; i < N; i++)
>       c[i] = a[i] + (b[i] * I);
>   }
> 
> generates
> 
>   f90:
> 	  mov     x3, 0
> 	  .p2align 3,,7
>   .L2:
> 	  ldr     q0, [x0, x3]
> 	  ldr     q1, [x1, x3]
> 	  fcadd   v0.4s, v0.4s, v1.4s, #90
> 	  str     q0, [x2, x3]
> 	  add     x3, x3, 16
> 	  cmp     x3, 1600
> 	  bne     .L2
> 	  ret
> 
> instead of
> 
>   f90:
> 	  add     x3, x1, 1600
> 	  .p2align 3,,7
>   .L2:
> 	  ld2     {v4.4s - v5.4s}, [x0], 32
> 	  ld2     {v2.4s - v3.4s}, [x1], 32
> 	  fsub    v0.4s, v4.4s, v3.4s
> 	  fadd    v1.4s, v5.4s, v2.4s
> 	  st2     {v0.4s - v1.4s}, [x2], 32
> 	  cmp     x3, x1
> 	  bne     .L2
> 	  ret
> 
> It defined a new iterator VALL_ARITH which contains types for which we can
> do general arithmetic (excludes bfloat16).
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64-simd.md (cadd<rot><mode>3,
> 	cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New.
> 	* config/aarch64/iterators.md (VALL_ARITH, UNSPEC_FCMUL,
> 	UNSPEC_FCMUL180, UNSPEC_FCMLS, UNSPEC_FCMLS180,
> UNSPEC_CMLS,
> 	UNSPEC_CMLS180, UNSPEC_CMUL, UNSPEC_CMUL180, FCMLA_OP,
> FCMUL_OP, rot_op,
> 	rotsplit1, rotsplit2, fcmac1): New.
> 	(rot): Add UNSPEC_FCMLS, UNSPEC_FCMUL, UNSPEC_FCMUL180.
> 
> --
Kyrylo Tkachov Nov. 16, 2020, 9:58 a.m. UTC | #2
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: 25 September 2020 15:30
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Subject: [PATCH v2 10/16]AArch64: Add NEON RTL patterns for Complex
> Addition, Multiply and FMA.
> 
> Hi All,
> 
> This adds implementation for the optabs for complex operations.  With this
> the
> following C code:
> 
>   void f90 (float complex a[restrict N], float complex b[restrict N],
> 	    float complex c[restrict N])
>   {
>     for (int i=0; i < N; i++)
>       c[i] = a[i] + (b[i] * I);
>   }
> 
> generates
> 
>   f90:
> 	  mov     x3, 0
> 	  .p2align 3,,7
>   .L2:
> 	  ldr     q0, [x0, x3]
> 	  ldr     q1, [x1, x3]
> 	  fcadd   v0.4s, v0.4s, v1.4s, #90
> 	  str     q0, [x2, x3]
> 	  add     x3, x3, 16
> 	  cmp     x3, 1600
> 	  bne     .L2
> 	  ret
> 
> instead of
> 
>   f90:
> 	  add     x3, x1, 1600
> 	  .p2align 3,,7
>   .L2:
> 	  ld2     {v4.4s - v5.4s}, [x0], 32
> 	  ld2     {v2.4s - v3.4s}, [x1], 32
> 	  fsub    v0.4s, v4.4s, v3.4s
> 	  fadd    v1.4s, v5.4s, v2.4s
> 	  st2     {v0.4s - v1.4s}, [x2], 32
> 	  cmp     x3, x1
> 	  bne     .L2
> 	  ret
> 
> It defined a new iterator VALL_ARITH which contains types for which we can
> do
> general arithmetic (excludes bfloat16).
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?

Ok.
Thanks,
Kyrill

> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64-simd.md (cadd<rot><mode>3,
> 	cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New.
> 	* config/aarch64/iterators.md (VALL_ARITH, UNSPEC_FCMUL,
> 	UNSPEC_FCMUL180, UNSPEC_FCMLS, UNSPEC_FCMLS180,
> UNSPEC_CMLS,
> 	UNSPEC_CMLS180, UNSPEC_CMUL, UNSPEC_CMUL180, FCMLA_OP,
> FCMUL_OP, rot_op,
> 	rotsplit1, rotsplit2, fcmac1): New.
> 	(rot): Add UNSPEC_FCMLS, UNSPEC_FCMUL, UNSPEC_FCMUL180.
> 
> --
Richard Sandiford Nov. 16, 2020, 11:56 a.m. UTC | #3
> +;; A conjucate is a rotation of 180* around the argand plane, or * I.

Hmm, but a complex conjugate is a reflection around the real axis rather
than a rotation.  Also, 180 degrees around the Argand plane is * -1
rather than * I.  So…

> +(define_int_attr rot_op [(UNSPEC_FCMLS "")
> +			 (UNSPEC_FCMLS180 "_conj")
> +			 (UNSPEC_FCMLA "")
> +			 (UNSPEC_FCMLA180 "_conj")
> +			 (UNSPEC_FCMUL "")
> +			 (UNSPEC_FCMUL180 "_conj")
> +			 (UNSPEC_CMLS "")
> +			 (UNSPEC_CMLA "")
> +			 (UNSPEC_CMLA180 "_conj")
> +			 (UNSPEC_CMUL "")
> +			 (UNSPEC_CMUL180 "_conj")])
> +
> +(define_int_attr rotsplit1 [(UNSPEC_FCMLA "0")
> +			    (UNSPEC_FCMLA180 "0")
> +			    (UNSPEC_FCMUL "0")
> +			    (UNSPEC_FCMUL180 "0")
> +			    (UNSPEC_FCMLS "270")
> +			    (UNSPEC_FCMLS180 "90")])
> +
> +(define_int_attr rotsplit2 [(UNSPEC_FCMLA "90")
> +			    (UNSPEC_FCMLA180 "270")
> +			    (UNSPEC_FCMUL "90")
> +			    (UNSPEC_FCMUL180 "270")
> +			    (UNSPEC_FCMLS "180")
> +			    (UNSPEC_FCMLS180 "180")])

I think it would be worth clarifying what the patterns do.  AIUI, the
effect of the MUL180 and MLA180 “rot_op” cases as written in the patch
is to multiply the conjugate of the first operand by the second operand,
whereas the documentation in the earlier patches implied that they
should multiply the conjugate of the second operand by the first.
But maybe I've got this wrong.

  operands to the pattern: a + bi, c + di

  conjugate of first operand * second operand [A]:

    (a - bi) * (c + di) = (ac + bd) + (ad - bc)i

  first operand * conjugate of second operand [B]:

    (a + bi) * (c - di) = (ac + bd) + (bc - ad)i

  the FCMUL180 sequence chosen by the patch:

    0: a * (c + di) = ac + adi
  270: b * (d - ci) = bd - bci

  which gives A rather than B.

> +;; The complex mla/mls operations always need to expand to two instructions.
> +;; The first operation does half the computation and the second does the
> +;; remainder.  Because of this, expand early.
> +(define_expand "cml<fcmac1><rot_op><mode>4"
> +  [(set (match_operand:VHSDF 0 "register_operand")
> +	(plus:VHSDF (match_operand:VHSDF 1 "register_operand")
> +		    (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand")
> +				   (match_operand:VHSDF 3 "register_operand")]
> +				   FCMLA_OP)))]
> +  "TARGET_COMPLEX"
> +{
> +  emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (operands[0], operands[1],
> +						 operands[2], operands[3]));
> +  emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0], operands[0],
> +						 operands[2], operands[3]));
> +  DONE;
> +})

The interface doesn't guarantee that operands[0] is distinct from the
inputs, so I think the result of the first instruction should be a
temporary register.

Thanks,
Richard
Tamar Christina Nov. 16, 2020, 12:18 p.m. UTC | #4
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Monday, November 16, 2020 11:56 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>
> Subject: Re: [PATCH v2 10/16]AArch64: Add NEON RTL patterns for Complex
> Addition, Multiply and FMA.
> 
> > +;; A conjucate is a rotation of 180* around the argand plane, or * I.
> 
> Hmm, but a complex conjugate is a reflection around the real axis rather than
> a rotation.  Also, 180 degrees around the Argand plane is * -1 rather than * I.
> So…

Yes indeed, Sorry this is a comment from waaaaay in the beginning that I forgot to update..
The operation itself expects an actual reflection :) 

> 
> > +(define_int_attr rot_op [(UNSPEC_FCMLS "")
> > +			 (UNSPEC_FCMLS180 "_conj")
> > +			 (UNSPEC_FCMLA "")
> > +			 (UNSPEC_FCMLA180 "_conj")
> > +			 (UNSPEC_FCMUL "")
> > +			 (UNSPEC_FCMUL180 "_conj")
> > +			 (UNSPEC_CMLS "")
> > +			 (UNSPEC_CMLA "")
> > +			 (UNSPEC_CMLA180 "_conj")
> > +			 (UNSPEC_CMUL "")
> > +			 (UNSPEC_CMUL180 "_conj")])
> > +
> > +(define_int_attr rotsplit1 [(UNSPEC_FCMLA "0")
> > +			    (UNSPEC_FCMLA180 "0")
> > +			    (UNSPEC_FCMUL "0")
> > +			    (UNSPEC_FCMUL180 "0")
> > +			    (UNSPEC_FCMLS "270")
> > +			    (UNSPEC_FCMLS180 "90")])
> > +
> > +(define_int_attr rotsplit2 [(UNSPEC_FCMLA "90")
> > +			    (UNSPEC_FCMLA180 "270")
> > +			    (UNSPEC_FCMUL "90")
> > +			    (UNSPEC_FCMUL180 "270")
> > +			    (UNSPEC_FCMLS "180")
> > +			    (UNSPEC_FCMLS180 "180")])
> 
> I think it would be worth clarifying what the patterns do.  AIUI, the effect of
> the MUL180 and MLA180 “rot_op” cases as written in the patch is to multiply
> the conjugate of the first operand by the second operand, whereas the
> documentation in the earlier patches implied that they should multiply the
> conjugate of the second operand by the first.
> But maybe I've got this wrong.
> 
>   operands to the pattern: a + bi, c + di
> 
>   conjugate of first operand * second operand [A]:
> 
>     (a - bi) * (c + di) = (ac + bd) + (ad - bc)i
> 
>   first operand * conjugate of second operand [B]:
> 
>     (a + bi) * (c - di) = (ac + bd) + (bc - ad)i
> 
>   the FCMUL180 sequence chosen by the patch:
> 
>     0: a * (c + di) = ac + adi
>   270: b * (d - ci) = bd - bci
> 
>   which gives A rather than B.

Correct, it expects the conjucate In the second operand and corrects the argument order to account
for it during vectorization if it's on the first operand.

i.e. conjucate first returns

        mov     v0.16b, v3.16b
        ldr     q1, [x0, x3]
        ldr     q2, [x1, x3]
        fcmla   v0.4s, v1.4s, v2.4s, #0
        fcmla   v0.4s, v1.4s, v2.4s, #270
        str     q0, [x2, x3]

and conjucate second returns

        mov     v0.16b, v3.16b
        ldr     q1, [x1, x3]
        ldr     q2, [x0, x3]
        fcmla   v0.4s, v1.4s, v2.4s, #0
        fcmla   v0.4s, v1.4s, v2.4s, #270
        str     q0, [x2, x3]

I think at some point the documentation in got out of sync with the implementation because it turned out to be easier to conj first
and flip conj second.  I'll update the optab docs.

> 
> > +;; The complex mla/mls operations always need to expand to two
> instructions.
> > +;; The first operation does half the computation and the second does
> > +the ;; remainder.  Because of this, expand early.
> > +(define_expand "cml<fcmac1><rot_op><mode>4"
> > +  [(set (match_operand:VHSDF 0 "register_operand")
> > +	(plus:VHSDF (match_operand:VHSDF 1 "register_operand")
> > +		    (unspec:VHSDF [(match_operand:VHSDF 2
> "register_operand")
> > +				   (match_operand:VHSDF 3
> "register_operand")]
> > +				   FCMLA_OP)))]
> > +  "TARGET_COMPLEX"
> > +{
> > +  emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (operands[0],
> operands[1],
> > +						 operands[2], operands[3]));
> > +  emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0],
> operands[0],
> > +						 operands[2], operands[3]));
> > +  DONE;
> > +})
> 
> The interface doesn't guarantee that operands[0] is distinct from the inputs,
> so I think the result of the first instruction should be a temporary register.

Hmm sorry I'm not sure I follow.. operands[0] doesn't have to be distinct from the inputs,
and in most cases it wouldn't be.

In c = c + (a * b)

operands[0] should be the same as operands[1].

I'm just trying to figure out why It needs to be different :) since it's an accumulation instruction..

Thanks,
Tamar

> 
> Thanks,
> Richard
Richard Sandiford Nov. 16, 2020, 12:47 p.m. UTC | #5
Tamar Christina <Tamar.Christina@arm.com> writes:
>> > +(define_int_attr rot_op [(UNSPEC_FCMLS "")
>> > +			 (UNSPEC_FCMLS180 "_conj")
>> > +			 (UNSPEC_FCMLA "")
>> > +			 (UNSPEC_FCMLA180 "_conj")
>> > +			 (UNSPEC_FCMUL "")
>> > +			 (UNSPEC_FCMUL180 "_conj")
>> > +			 (UNSPEC_CMLS "")
>> > +			 (UNSPEC_CMLA "")
>> > +			 (UNSPEC_CMLA180 "_conj")
>> > +			 (UNSPEC_CMUL "")
>> > +			 (UNSPEC_CMUL180 "_conj")])
>> > +
>> > +(define_int_attr rotsplit1 [(UNSPEC_FCMLA "0")
>> > +			    (UNSPEC_FCMLA180 "0")
>> > +			    (UNSPEC_FCMUL "0")
>> > +			    (UNSPEC_FCMUL180 "0")
>> > +			    (UNSPEC_FCMLS "270")
>> > +			    (UNSPEC_FCMLS180 "90")])
>> > +
>> > +(define_int_attr rotsplit2 [(UNSPEC_FCMLA "90")
>> > +			    (UNSPEC_FCMLA180 "270")
>> > +			    (UNSPEC_FCMUL "90")
>> > +			    (UNSPEC_FCMUL180 "270")
>> > +			    (UNSPEC_FCMLS "180")
>> > +			    (UNSPEC_FCMLS180 "180")])
>> 
>> I think it would be worth clarifying what the patterns do.  AIUI, the effect of
>> the MUL180 and MLA180 “rot_op” cases as written in the patch is to multiply
>> the conjugate of the first operand by the second operand, whereas the
>> documentation in the earlier patches implied that they should multiply the
>> conjugate of the second operand by the first.
>> But maybe I've got this wrong.
>> 
>>   operands to the pattern: a + bi, c + di
>> 
>>   conjugate of first operand * second operand [A]:
>> 
>>     (a - bi) * (c + di) = (ac + bd) + (ad - bc)i
>> 
>>   first operand * conjugate of second operand [B]:
>> 
>>     (a + bi) * (c - di) = (ac + bd) + (bc - ad)i
>> 
>>   the FCMUL180 sequence chosen by the patch:
>> 
>>     0: a * (c + di) = ac + adi
>>   270: b * (d - ci) = bd - bci
>> 
>>   which gives A rather than B.
>
> Correct, it expects the conjucate In the second operand and corrects the argument order to account
> for it during vectorization if it's on the first operand.

What's “it” here though?  The point of my comment above is that the
AArch64 expansion seems to conjugate the first operand rather than the
second.  When I read the iterator part of the patch originally, I was
wondering whether the AArch64 define_expand would swap the inputs to ensure
that the second operand is conjugated instead, but the define_expand
seems to preserve the original order.

So it seemed like it was the other way around from what you said above:
the optabs interface treats conjugating the first operand as the
“native” choice and the vectoriser would need to correct the argument
order if the scalar gimple code conjugated the second operand instead.

TBH I'm not sure which is the “natural” order in gimple.  Guess that's
one for Richi.

Still…

> i.e. conjucate first returns
>
>         mov     v0.16b, v3.16b
>         ldr     q1, [x0, x3]
>         ldr     q2, [x1, x3]
>         fcmla   v0.4s, v1.4s, v2.4s, #0
>         fcmla   v0.4s, v1.4s, v2.4s, #270
>         str     q0, [x2, x3]
>
> and conjucate second returns
>
>         mov     v0.16b, v3.16b
>         ldr     q1, [x1, x3]
>         ldr     q2, [x0, x3]
>         fcmla   v0.4s, v1.4s, v2.4s, #0
>         fcmla   v0.4s, v1.4s, v2.4s, #270
>         str     q0, [x2, x3]

…I agree this looks good, so it seems like everything works out in the end.
I guess it's just a question of clarifying the interface.

> I think at some point the documentation in got out of sync with the implementation because it turned out to be easier to conj first
> and flip conj second.  I'll update the optab docs.

Thanks.

>> > +;; The complex mla/mls operations always need to expand to two
>> instructions.
>> > +;; The first operation does half the computation and the second does
>> > +the ;; remainder.  Because of this, expand early.
>> > +(define_expand "cml<fcmac1><rot_op><mode>4"
>> > +  [(set (match_operand:VHSDF 0 "register_operand")
>> > +	(plus:VHSDF (match_operand:VHSDF 1 "register_operand")
>> > +		    (unspec:VHSDF [(match_operand:VHSDF 2
>> "register_operand")
>> > +				   (match_operand:VHSDF 3
>> "register_operand")]
>> > +				   FCMLA_OP)))]
>> > +  "TARGET_COMPLEX"
>> > +{
>> > +  emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (operands[0],
>> operands[1],
>> > +						 operands[2], operands[3]));
>> > +  emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0],
>> operands[0],
>> > +						 operands[2], operands[3]));
>> > +  DONE;
>> > +})
>> 
>> The interface doesn't guarantee that operands[0] is distinct from the inputs,
>> so I think the result of the first instruction should be a temporary register.
>
> Hmm sorry I'm not sure I follow.. operands[0] doesn't have to be distinct from the inputs,
> and in most cases it wouldn't be.
>
> In c = c + (a * b)
>
> operands[0] should be the same as operands[1].
>
> I'm just trying to figure out why It needs to be different :) since it's an accumulation instruction..

Consider:

  c = c + (a * c)

With the expansion above, the first instruction would clobber the
operands[3] input to the second instruction.

Thanks,
Richard
Tamar Christina Nov. 16, 2020, 1:09 p.m. UTC | #6
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Monday, November 16, 2020 12:47 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>
> Subject: Re: [PATCH v2 10/16]AArch64: Add NEON RTL patterns for Complex
> Addition, Multiply and FMA.
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> > +(define_int_attr rot_op [(UNSPEC_FCMLS "")
> >> > +			 (UNSPEC_FCMLS180 "_conj")
> >> > +			 (UNSPEC_FCMLA "")
> >> > +			 (UNSPEC_FCMLA180 "_conj")
> >> > +			 (UNSPEC_FCMUL "")
> >> > +			 (UNSPEC_FCMUL180 "_conj")
> >> > +			 (UNSPEC_CMLS "")
> >> > +			 (UNSPEC_CMLA "")
> >> > +			 (UNSPEC_CMLA180 "_conj")
> >> > +			 (UNSPEC_CMUL "")
> >> > +			 (UNSPEC_CMUL180 "_conj")])
> >> > +
> >> > +(define_int_attr rotsplit1 [(UNSPEC_FCMLA "0")
> >> > +			    (UNSPEC_FCMLA180 "0")
> >> > +			    (UNSPEC_FCMUL "0")
> >> > +			    (UNSPEC_FCMUL180 "0")
> >> > +			    (UNSPEC_FCMLS "270")
> >> > +			    (UNSPEC_FCMLS180 "90")])
> >> > +
> >> > +(define_int_attr rotsplit2 [(UNSPEC_FCMLA "90")
> >> > +			    (UNSPEC_FCMLA180 "270")
> >> > +			    (UNSPEC_FCMUL "90")
> >> > +			    (UNSPEC_FCMUL180 "270")
> >> > +			    (UNSPEC_FCMLS "180")
> >> > +			    (UNSPEC_FCMLS180 "180")])
> >>
> >> I think it would be worth clarifying what the patterns do.  AIUI, the
> >> effect of the MUL180 and MLA180 “rot_op” cases as written in the
> >> patch is to multiply the conjugate of the first operand by the second
> >> operand, whereas the documentation in the earlier patches implied
> >> that they should multiply the conjugate of the second operand by the first.
> >> But maybe I've got this wrong.
> >>
> >>   operands to the pattern: a + bi, c + di
> >>
> >>   conjugate of first operand * second operand [A]:
> >>
> >>     (a - bi) * (c + di) = (ac + bd) + (ad - bc)i
> >>
> >>   first operand * conjugate of second operand [B]:
> >>
> >>     (a + bi) * (c - di) = (ac + bd) + (bc - ad)i
> >>
> >>   the FCMUL180 sequence chosen by the patch:
> >>
> >>     0: a * (c + di) = ac + adi
> >>   270: b * (d - ci) = bd - bci
> >>
> >>   which gives A rather than B.
> >
> > Correct, it expects the conjucate In the second operand and corrects
> > the argument order to account for it during vectorization if it's on the first
> operand.
> 
> What's “it” here though?  The point of my comment above is that the
> AArch64 expansion seems to conjugate the first operand rather than the
> second.  When I read the iterator part of the patch originally, I was
> wondering whether the AArch64 define_expand would swap the inputs to
> ensure that the second operand is conjugated instead, but the
> define_expand seems to preserve the original order.
> 
> So it seemed like it was the other way around from what you said above:
> the optabs interface treats conjugating the first operand as the “native”
> choice and the vectoriser would need to correct the argument order if the
> scalar gimple code conjugated the second operand instead.
> 
> TBH I'm not sure which is the “natural” order in gimple.  Guess that's one for
> Richi.
> 
> Still…
> 
> > i.e. conjucate first returns
> >
> >         mov     v0.16b, v3.16b
> >         ldr     q1, [x0, x3]
> >         ldr     q2, [x1, x3]
> >         fcmla   v0.4s, v1.4s, v2.4s, #0
> >         fcmla   v0.4s, v1.4s, v2.4s, #270
> >         str     q0, [x2, x3]
> >
> > and conjucate second returns
> >
> >         mov     v0.16b, v3.16b
> >         ldr     q1, [x1, x3]
> >         ldr     q2, [x0, x3]
> >         fcmla   v0.4s, v1.4s, v2.4s, #0
> >         fcmla   v0.4s, v1.4s, v2.4s, #270
> >         str     q0, [x2, x3]
> 
> …I agree this looks good, so it seems like everything works out in the end.
> I guess it's just a question of clarifying the interface.
> 
> > I think at some point the documentation in got out of sync with the
> > implementation because it turned out to be easier to conj first and flip conj
> second.  I'll update the optab docs.
> 
> Thanks.
> 
> >> > +;; The complex mla/mls operations always need to expand to two
> >> instructions.
> >> > +;; The first operation does half the computation and the second
> >> > +does the ;; remainder.  Because of this, expand early.
> >> > +(define_expand "cml<fcmac1><rot_op><mode>4"
> >> > +  [(set (match_operand:VHSDF 0 "register_operand")
> >> > +	(plus:VHSDF (match_operand:VHSDF 1 "register_operand")
> >> > +		    (unspec:VHSDF [(match_operand:VHSDF 2
> >> "register_operand")
> >> > +				   (match_operand:VHSDF 3
> >> "register_operand")]
> >> > +				   FCMLA_OP)))]
> >> > +  "TARGET_COMPLEX"
> >> > +{
> >> > +  emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (operands[0],
> >> operands[1],
> >> > +						 operands[2], operands[3]));
> >> > +  emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0],
> >> operands[0],
> >> > +						 operands[2], operands[3]));
> >> > +  DONE;
> >> > +})
> >>
> >> The interface doesn't guarantee that operands[0] is distinct from the
> >> inputs, so I think the result of the first instruction should be a temporary
> register.
> >
> > Hmm sorry I'm not sure I follow.. operands[0] doesn't have to be
> > distinct from the inputs, and in most cases it wouldn't be.
> >
> > In c = c + (a * b)
> >
> > operands[0] should be the same as operands[1].
> >
> > I'm just trying to figure out why It needs to be different :) since it's an
> accumulation instruction..
> 
> Consider:
> 
>   c = c + (a * c)
> 
> With the expansion above, the first instruction would clobber the operands[3]
> input to the second instruction.

AHHH! I had never considered that.. Thanks! I'll update all the target patches.

Regards,
Tamar

> 
> Thanks,
> Richard
Richard Biener Nov. 16, 2020, 1:58 p.m. UTC | #7
On Mon, Nov 16, 2020 at 1:48 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> > +(define_int_attr rot_op [(UNSPEC_FCMLS "")
> >> > +                   (UNSPEC_FCMLS180 "_conj")
> >> > +                   (UNSPEC_FCMLA "")
> >> > +                   (UNSPEC_FCMLA180 "_conj")
> >> > +                   (UNSPEC_FCMUL "")
> >> > +                   (UNSPEC_FCMUL180 "_conj")
> >> > +                   (UNSPEC_CMLS "")
> >> > +                   (UNSPEC_CMLA "")
> >> > +                   (UNSPEC_CMLA180 "_conj")
> >> > +                   (UNSPEC_CMUL "")
> >> > +                   (UNSPEC_CMUL180 "_conj")])
> >> > +
> >> > +(define_int_attr rotsplit1 [(UNSPEC_FCMLA "0")
> >> > +                      (UNSPEC_FCMLA180 "0")
> >> > +                      (UNSPEC_FCMUL "0")
> >> > +                      (UNSPEC_FCMUL180 "0")
> >> > +                      (UNSPEC_FCMLS "270")
> >> > +                      (UNSPEC_FCMLS180 "90")])
> >> > +
> >> > +(define_int_attr rotsplit2 [(UNSPEC_FCMLA "90")
> >> > +                      (UNSPEC_FCMLA180 "270")
> >> > +                      (UNSPEC_FCMUL "90")
> >> > +                      (UNSPEC_FCMUL180 "270")
> >> > +                      (UNSPEC_FCMLS "180")
> >> > +                      (UNSPEC_FCMLS180 "180")])
> >>
> >> I think it would be worth clarifying what the patterns do.  AIUI, the effect of
> >> the MUL180 and MLA180 “rot_op” cases as written in the patch is to multiply
> >> the conjugate of the first operand by the second operand, whereas the
> >> documentation in the earlier patches implied that they should multiply the
> >> conjugate of the second operand by the first.
> >> But maybe I've got this wrong.
> >>
> >>   operands to the pattern: a + bi, c + di
> >>
> >>   conjugate of first operand * second operand [A]:
> >>
> >>     (a - bi) * (c + di) = (ac + bd) + (ad - bc)i
> >>
> >>   first operand * conjugate of second operand [B]:
> >>
> >>     (a + bi) * (c - di) = (ac + bd) + (bc - ad)i
> >>
> >>   the FCMUL180 sequence chosen by the patch:
> >>
> >>     0: a * (c + di) = ac + adi
> >>   270: b * (d - ci) = bd - bci
> >>
> >>   which gives A rather than B.
> >
> > Correct, it expects the conjucate In the second operand and corrects the argument order to account
> > for it during vectorization if it's on the first operand.
>
> What's “it” here though?  The point of my comment above is that the
> AArch64 expansion seems to conjugate the first operand rather than the
> second.  When I read the iterator part of the patch originally, I was
> wondering whether the AArch64 define_expand would swap the inputs to ensure
> that the second operand is conjugated instead, but the define_expand
> seems to preserve the original order.
>
> So it seemed like it was the other way around from what you said above:
> the optabs interface treats conjugating the first operand as the
> “native” choice and the vectoriser would need to correct the argument
> order if the scalar gimple code conjugated the second operand instead.
>
> TBH I'm not sure which is the “natural” order in gimple.  Guess that's
> one for Richi.

Not sure but most remotely related ops like
WIDEN_MULT_PLUS_EXPR have the "unmodified" operand
last.  But this is an internal function, so ...

I'd say make it consistent within the group.

And for all means please somewhere write down in complex
components what the ops are supposed to do ;)  Guess most
people know how to conjugate but rotations ... I only realized
_way_ late that x86 has vector operations for all of this
as well just because there it's all called 'plusminus' rather
than based on complex numbers and 'rotates' ...

Richard.

> Still…
>
> > i.e. conjucate first returns
> >
> >         mov     v0.16b, v3.16b
> >         ldr     q1, [x0, x3]
> >         ldr     q2, [x1, x3]
> >         fcmla   v0.4s, v1.4s, v2.4s, #0
> >         fcmla   v0.4s, v1.4s, v2.4s, #270
> >         str     q0, [x2, x3]
> >
> > and conjucate second returns
> >
> >         mov     v0.16b, v3.16b
> >         ldr     q1, [x1, x3]
> >         ldr     q2, [x0, x3]
> >         fcmla   v0.4s, v1.4s, v2.4s, #0
> >         fcmla   v0.4s, v1.4s, v2.4s, #270
> >         str     q0, [x2, x3]
>
> …I agree this looks good, so it seems like everything works out in the end.
> I guess it's just a question of clarifying the interface.
>
> > I think at some point the documentation in got out of sync with the implementation because it turned out to be easier to conj first
> > and flip conj second.  I'll update the optab docs.
>
> Thanks.
>
> >> > +;; The complex mla/mls operations always need to expand to two
> >> instructions.
> >> > +;; The first operation does half the computation and the second does
> >> > +the ;; remainder.  Because of this, expand early.
> >> > +(define_expand "cml<fcmac1><rot_op><mode>4"
> >> > +  [(set (match_operand:VHSDF 0 "register_operand")
> >> > +  (plus:VHSDF (match_operand:VHSDF 1 "register_operand")
> >> > +              (unspec:VHSDF [(match_operand:VHSDF 2
> >> "register_operand")
> >> > +                             (match_operand:VHSDF 3
> >> "register_operand")]
> >> > +                             FCMLA_OP)))]
> >> > +  "TARGET_COMPLEX"
> >> > +{
> >> > +  emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (operands[0],
> >> operands[1],
> >> > +                                           operands[2], operands[3]));
> >> > +  emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0],
> >> operands[0],
> >> > +                                           operands[2], operands[3]));
> >> > +  DONE;
> >> > +})
> >>
> >> The interface doesn't guarantee that operands[0] is distinct from the inputs,
> >> so I think the result of the first instruction should be a temporary register.
> >
> > Hmm sorry I'm not sure I follow.. operands[0] doesn't have to be distinct from the inputs,
> > and in most cases it wouldn't be.
> >
> > In c = c + (a * b)
> >
> > operands[0] should be the same as operands[1].
> >
> > I'm just trying to figure out why It needs to be different :) since it's an accumulation instruction..
>
> Consider:
>
>   c = c + (a * c)
>
> With the expansion above, the first instruction would clobber the
> operands[3] input to the second instruction.
>
> Thanks,
> Richard
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 381a702eba003520d2e83e91065d2a808b9c6493..c2ddef19e4e433f7ca055e42d1222d9dad6bd6c2 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -449,6 +449,14 @@  (define_insn "aarch64_fcadd<rot><mode>"
   [(set_attr "type" "neon_fcadd")]
 )
 
+(define_expand "cadd<rot><mode>3"
+  [(set (match_operand:VHSDF 0 "register_operand")
+	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")
+		       (match_operand:VHSDF 2 "register_operand")]
+		       FCADD))]
+  "TARGET_COMPLEX"
+)
+
 (define_insn "aarch64_fcmla<rot><mode>"
   [(set (match_operand:VHSDF 0 "register_operand" "=w")
 	(plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0")
@@ -508,6 +516,45 @@  (define_insn "aarch64_fcmlaq_lane<rot><mode>"
   [(set_attr "type" "neon_fcmla")]
 )
 
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cml<fcmac1><rot_op><mode>4"
+  [(set (match_operand:VHSDF 0 "register_operand")
+	(plus:VHSDF (match_operand:VHSDF 1 "register_operand")
+		    (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand")
+				   (match_operand:VHSDF 3 "register_operand")]
+				   FCMLA_OP)))]
+  "TARGET_COMPLEX"
+{
+  emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (operands[0], operands[1],
+						 operands[2], operands[3]));
+  emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0], operands[0],
+						 operands[2], operands[3]));
+  DONE;
+})
+
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cmul<rot_op><mode>3"
+  [(set (match_operand:VHSDF 0 "register_operand")
+	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")
+		       (match_operand:VHSDF 2 "register_operand")]
+		       FCMUL_OP))]
+  "TARGET_COMPLEX"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_move_insn (tmp, CONST0_RTX (<MODE>mode));
+  emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (operands[0], tmp,
+						 operands[1], operands[2]));
+  emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0], operands[0],
+						 operands[1], operands[2]));
+  DONE;
+})
+
+
+
 ;; These instructions map to the __builtins for the Dot Product operations.
 (define_insn "aarch64_<sur>dot<vsi2qi>"
   [(set (match_operand:VS 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 054fd8515c6ebf136da699e2993f6ebb348c3b1a..98217c9fd3ee2b6063f7564193e400e9ef71c6ac 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -182,6 +182,11 @@  (define_mode_iterator V2F [V2SF V2DF])
 ;; All Advanced SIMD modes on which we support any arithmetic operations.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for performing arithmetics.
+(define_mode_iterator VALL_ARITH [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+				  (V4HF "TARGET_SIMD_F16INST") (V8HF "TARGET_SIMD_F16INST")
+				  V2SF V4SF V2DF])
+
 ;; All Advanced SIMD modes suitable for moving, loading, and storing.
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
@@ -705,6 +710,10 @@  (define_c_enum "unspec"
     UNSPEC_FCMLA90	; Used in aarch64-simd.md.
     UNSPEC_FCMLA180	; Used in aarch64-simd.md.
     UNSPEC_FCMLA270	; Used in aarch64-simd.md.
+    UNSPEC_FCMUL	; Used in aarch64-simd.md.
+    UNSPEC_FCMUL180	; Used in aarch64-simd.md.
+    UNSPEC_FCMLS	; Used in aarch64-simd.md.
+    UNSPEC_FCMLS180	; Used in aarch64-simd.md.
     UNSPEC_ASRD		; Used in aarch64-sve.md.
     UNSPEC_ADCLB	; Used in aarch64-sve2.md.
     UNSPEC_ADCLT	; Used in aarch64-sve2.md.
@@ -723,6 +732,10 @@  (define_c_enum "unspec"
     UNSPEC_CMLA180	; Used in aarch64-sve2.md.
     UNSPEC_CMLA270	; Used in aarch64-sve2.md.
     UNSPEC_CMLA90	; Used in aarch64-sve2.md.
+    UNSPEC_CMLS		; Used in aarch64-sve2.md.
+    UNSPEC_CMLS180	; Used in aarch64-sve2.md.
+    UNSPEC_CMUL		; Used in aarch64-sve2.md.
+    UNSPEC_CMUL180	; Used in aarch64-sve2.md.
     UNSPEC_COND_FCVTLT	; Used in aarch64-sve2.md.
     UNSPEC_COND_FCVTNT	; Used in aarch64-sve2.md.
     UNSPEC_COND_FCVTX	; Used in aarch64-sve2.md.
@@ -2680,6 +2693,14 @@  (define_int_iterator FMMLA [UNSPEC_FMMLA])
 (define_int_iterator BF_MLA [UNSPEC_BFMLALB
 			     UNSPEC_BFMLALT])
 
+(define_int_iterator FCMLA_OP [UNSPEC_FCMLA
+			       UNSPEC_FCMLA180
+			       UNSPEC_FCMLS
+			       UNSPEC_FCMLS180])
+
+(define_int_iterator FCMUL_OP [UNSPEC_FCMUL
+			       UNSPEC_FCMUL180])
+
 ;; Iterators for atomic operations.
 
 (define_int_iterator ATOMIC_LDOP
@@ -3375,6 +3396,7 @@  (define_int_attr rot [(UNSPEC_CADD90 "90")
 		      (UNSPEC_CMLA270 "270")
 		      (UNSPEC_FCADD90 "90")
 		      (UNSPEC_FCADD270 "270")
+		      (UNSPEC_FCMLS "0")
 		      (UNSPEC_FCMLA "0")
 		      (UNSPEC_FCMLA90 "90")
 		      (UNSPEC_FCMLA180 "180")
@@ -3390,7 +3412,41 @@  (define_int_attr rot [(UNSPEC_CADD90 "90")
 		      (UNSPEC_COND_FCMLA "0")
 		      (UNSPEC_COND_FCMLA90 "90")
 		      (UNSPEC_COND_FCMLA180 "180")
-		      (UNSPEC_COND_FCMLA270 "270")])
+		      (UNSPEC_COND_FCMLA270 "270")
+		      (UNSPEC_FCMUL "0")
+		      (UNSPEC_FCMUL180 "180")])
+
+;; A conjucate is a rotation of 180* around the argand plane, or * I.
+(define_int_attr rot_op [(UNSPEC_FCMLS "")
+			 (UNSPEC_FCMLS180 "_conj")
+			 (UNSPEC_FCMLA "")
+			 (UNSPEC_FCMLA180 "_conj")
+			 (UNSPEC_FCMUL "")
+			 (UNSPEC_FCMUL180 "_conj")
+			 (UNSPEC_CMLS "")
+			 (UNSPEC_CMLA "")
+			 (UNSPEC_CMLA180 "_conj")
+			 (UNSPEC_CMUL "")
+			 (UNSPEC_CMUL180 "_conj")])
+
+(define_int_attr rotsplit1 [(UNSPEC_FCMLA "0")
+			    (UNSPEC_FCMLA180 "0")
+			    (UNSPEC_FCMUL "0")
+			    (UNSPEC_FCMUL180 "0")
+			    (UNSPEC_FCMLS "270")
+			    (UNSPEC_FCMLS180 "90")])
+
+(define_int_attr rotsplit2 [(UNSPEC_FCMLA "90")
+			    (UNSPEC_FCMLA180 "270")
+			    (UNSPEC_FCMUL "90")
+			    (UNSPEC_FCMUL180 "270")
+			    (UNSPEC_FCMLS "180")
+			    (UNSPEC_FCMLS180 "180")])
+
+(define_int_attr fcmac1 [(UNSPEC_FCMLA "a") (UNSPEC_FCMLA180 "a")
+			 (UNSPEC_FCMLS "s") (UNSPEC_FCMLS180 "s")
+			 (UNSPEC_CMLA "a") (UNSPEC_CMLA180 "a")
+			 (UNSPEC_CMLS "s") (UNSPEC_CMLS180 "s")])
 
 (define_int_attr sve_fmla_op [(UNSPEC_COND_FMLA "fmla")
 			      (UNSPEC_COND_FMLS "fmls")