diff mbox series

[v2,12/16] AArch64: Add SVE2 Integer RTL patterns for Complex Addition, Multiply and FMA.

Message ID 20200925143037.GA26815@arm.com
State New
Headers show
Series middle-end Add support for SLP vectorization of complex number instructions. | expand

Commit Message

Tamar Christina Sept. 25, 2020, 2:30 p.m. UTC
Hi All,

This adds an implementation of the optabs for complex operations.  With this,
the following C code:

  void f90 (int _Complex a[restrict N], int _Complex b[restrict N],
	    int _Complex c[restrict N])
  {
    for (int i=0; i < N; i++)
      c[i] = a[i] + (b[i] * I);
  }

generates

  f90:
	  mov     x3, 0
	  mov     x4, 200
	  whilelo p0.s, xzr, x4
	  .p2align 3,,7
  .L2:
	  ld1w    z0.s, p0/z, [x0, x3, lsl 2]
	  ld1w    z1.s, p0/z, [x1, x3, lsl 2]
	  cadd    z0.s, z0.s, z1.s, #90
	  st1w    z0.s, p0, [x2, x3, lsl 2]
	  incw    x3
	  whilelo p0.s, x3, x4
	  b.any   .L2
	  ret

instead of

  f90:
	  mov     x3, 0
	  mov     x4, 0
	  mov     w5, 100
	  whilelo p0.s, wzr, w5
	  .p2align 3,,7
  .L2:
	  ld2w    {z4.s - z5.s}, p0/z, [x0, x3, lsl 2]
	  ld2w    {z2.s - z3.s}, p0/z, [x1, x3, lsl 2]
	  sub     z0.s, z4.s, z3.s
	  add     z1.s, z5.s, z2.s
	  st2w    {z0.s - z1.s}, p0, [x2, x3, lsl 2]
	  incw    x4
	  inch    x3
	  whilelo p0.s, w4, w5
	  b.any   .L2
	  ret

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-sve2.md (cadd<rot><mode>3,
	cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New.
	* config/aarch64/iterators.md (SVE2_INT_CMLA_OP, SVE2_INT_CMUL_OP,
	SVE2_INT_CADD_OP): New.

--

Comments

Tamar Christina Nov. 14, 2020, 3:12 p.m. UTC | #1
ping

> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of Tamar
> Christina
> Sent: Friday, September 25, 2020 3:31 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Richard Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>
> Subject: [PATCH v2 12/16]AArch64: Add SVE2 Integer RTL patterns for
> Complex Addition, Multiply and FMA.
> 
> Hi All,
> 
> This adds an implementation of the optabs for complex operations.  With this,
> the following C code:
> 
>   void f90 (int _Complex a[restrict N], int _Complex b[restrict N],
> 	    int _Complex c[restrict N])
>   {
>     for (int i=0; i < N; i++)
>       c[i] = a[i] + (b[i] * I);
>   }
> 
> generates
> 
>   f90:
> 	  mov     x3, 0
> 	  mov     x4, 200
> 	  whilelo p0.s, xzr, x4
> 	  .p2align 3,,7
>   .L2:
> 	  ld1w    z0.s, p0/z, [x0, x3, lsl 2]
> 	  ld1w    z1.s, p0/z, [x1, x3, lsl 2]
> 	  cadd    z0.s, z0.s, z1.s, #90
> 	  st1w    z0.s, p0, [x2, x3, lsl 2]
> 	  incw    x3
> 	  whilelo p0.s, x3, x4
> 	  b.any   .L2
> 	  ret
> 
> instead of
> 
>   f90:
> 	  mov     x3, 0
> 	  mov     x4, 0
> 	  mov     w5, 100
> 	  whilelo p0.s, wzr, w5
> 	  .p2align 3,,7
>   .L2:
> 	  ld2w    {z4.s - z5.s}, p0/z, [x0, x3, lsl 2]
> 	  ld2w    {z2.s - z3.s}, p0/z, [x1, x3, lsl 2]
> 	  sub     z0.s, z4.s, z3.s
> 	  add     z1.s, z5.s, z2.s
> 	  st2w    {z0.s - z1.s}, p0, [x2, x3, lsl 2]
> 	  incw    x4
> 	  inch    x3
> 	  whilelo p0.s, w4, w5
> 	  b.any   .L2
> 	  ret
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64-sve2.md (cadd<rot><mode>3,
> 	cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New.
> 	* config/aarch64/iterators.md (SVE2_INT_CMLA_OP, SVE2_INT_CMUL_OP,
> 	SVE2_INT_CADD_OP): New.
> 
> --
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index e18b9fef16e72496588fb5850e362da4ae42898a..e601c6a4586e3ed1e11aedf047f56d556a99a302 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -1774,6 +1774,16 @@  (define_insn "@aarch64_sve_<optab><mode>"
   [(set_attr "movprfx" "*,yes")]
 )
 
+;; Unpredicated CADD optab (rotations #90/#270) used by the auto-vectorizer.
+(define_expand "cadd<rot><mode>3"
+  [(set (match_operand:SVE_FULL_I 0 "register_operand")
+	(unspec:SVE_FULL_I
+	  [(match_operand:SVE_FULL_I 1 "register_operand")
+	   (match_operand:SVE_FULL_I 2 "register_operand")]
+	  SVE2_INT_CADD_OP))]
+  "TARGET_SVE2"
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Complex ternary operations
 ;; -------------------------------------------------------------------------
@@ -1813,6 +1823,55 @@  (define_insn "@aarch64_<optab>_lane_<mode>"
  [(set_attr "movprfx" "*,yes")]
 )
 
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+;; The partial result is kept in a fresh pseudo rather than in operand 0:
+;; operand 0 may be the same register as operand 2 or 3, and writing it in
+;; the first instruction would corrupt the inputs of the second.
+(define_expand "cml<fcmac1><rot_op><mode>4"
+  [(set (match_operand:SVE_FULL_I 0 "register_operand")
+	(plus:SVE_FULL_I (match_operand:SVE_FULL_I 1 "register_operand")
+	  (unspec:SVE_FULL_I
+	    [(match_operand:SVE_FULL_I 2 "register_operand")
+	     (match_operand:SVE_FULL_I 3 "register_operand")]
+	    SVE2_INT_CMLA_OP)))]
+  "TARGET_SVE2"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_sve_cmla<sve_rot1><mode> (tmp, operands[1],
+						   operands[2], operands[3]));
+  emit_insn (gen_aarch64_sve_cmla<sve_rot2><mode> (operands[0], tmp,
+						   operands[2], operands[3]));
+  DONE;
+})
+
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+;; Operand 3 is a zeroed accumulator created here.  The partial result is
+;; kept in a fresh pseudo so that operand 0, which may be the same register
+;; as operand 1 or 2, is written only by the final instruction.
+(define_expand "cmul<rot_op><mode>3"
+  [(set (match_operand:SVE_FULL_I 0 "register_operand")
+	(unspec:SVE_FULL_I
+	  [(match_operand:SVE_FULL_I 1 "register_operand")
+	   (match_operand:SVE_FULL_I 2 "register_operand")
+	   (match_dup 3)]
+	  SVE2_INT_CMUL_OP))]
+  "TARGET_SVE2"
+{
+  operands[3] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_sve_cmla<sve_rot1><mode> (tmp, operands[3],
+						   operands[1], operands[2]));
+  emit_insn (gen_aarch64_sve_cmla<sve_rot2><mode> (operands[0], tmp,
+						   operands[1], operands[2]));
+  DONE;
+})
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Complex dot product
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 7662b929e2c4f6c103cc06e051eb574247320809..c11e976237d30771a7bd7c7fb56922f9c5c785de 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2583,6 +2583,23 @@  (define_int_iterator SVE2_INT_CMLA [UNSPEC_CMLA
 				    UNSPEC_SQRDCMLAH180
 				    UNSPEC_SQRDCMLAH270])
 
+;; Unlike the normal CMLA instructions these represent the actual operation
+;; to be performed.  They will always need to be expanded into a sequence
+;; of multiple CMLA instructions.
+(define_int_iterator SVE2_INT_CMLA_OP [UNSPEC_CMLA
+				       UNSPEC_CMLA180
+				       UNSPEC_CMLS])
+
+;; Unlike the normal CMLA instructions these represent the actual operation
+;; to be performed.  They will always need to be expanded into a sequence
+;; of multiple CMLA instructions.
+(define_int_iterator SVE2_INT_CMUL_OP [UNSPEC_CMUL
+				       UNSPEC_CMUL180])
+
+;; Same as SVE2_INT_CADD but excluding the saturating instructions.
+(define_int_iterator SVE2_INT_CADD_OP [UNSPEC_CADD90
+				       UNSPEC_CADD270])
+
 (define_int_iterator SVE2_INT_CDOT [UNSPEC_CDOT
 				    UNSPEC_CDOT90
 				    UNSPEC_CDOT180