diff mbox series

[2/2] AArch64 Add implementation for vector cbranch.

Message ID Y2KCrKb019Z1/HgC@arm.com
State New
Headers show
Series [1/2] middle-end: Support early break/return auto-vectorization. | expand

Commit Message

Tamar Christina Nov. 2, 2022, 2:46 p.m. UTC
Hi All,

This adds an implementation for conditional branch optab for AArch64.

For 128-bit vectors we generate:

        cmhi    v1.4s, v1.4s, v0.4s
        umaxp   v1.4s, v1.4s, v1.4s
        fmov    x3, d1
        cbnz    x3, .L8

and for 64-bit vectors we can omit the compression:

        cmhi    v1.2s, v1.2s, v0.2s
        fmov    x2, d1
        cbz     x2, .L13

I did also want to provide a version that mixes SVE and NEON so I can use the
SVE CMHI instructions with a NEON register.

So concretely for a 128-bit vector you'd get:

        ptrue   p0.s, vl4
.L3:
        ...
        cmplo   p2.s, p0/z, z0.s, z2.s
        b.any   .L6
        ...
        cmp     w2, 200
        bne     .L3

However, I ran into an issue: the cbranch pattern is not what performs the
comparison.  And if I use combine to merge the two, the resulting ptrue is not
hoisted out of the loop.

Is there currently a way to do this, or does a mid-end pass need to be changed
for this?

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (cbranch<mode>4): New.

gcc/testsuite/ChangeLog:

	* lib/target-supports.exp: Enable AArch64 generically.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 5386043739a9b2e328bfb2fc9067da8feeac1a92..e53d339ea20492812a3faa7c20ed945255321b11 100644




--
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 5386043739a9b2e328bfb2fc9067da8feeac1a92..e53d339ea20492812a3faa7c20ed945255321b11 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3795,6 +3795,57 @@ (define_expand "vcond_mask_<mode><v_int_equiv>"
   DONE;
 })
 
+;; Patterns comparing two vectors and conditionally jumping.
+;;
+;; Operand 0 is an equality comparison (as accepted by
+;; aarch64_equality_operator) of vector operands 1 and 2, and operand 3 is
+;; the label to branch to.  Two vectors are equal iff every bit of their XOR
+;; is zero, so the test is reduced to a 64-bit scalar compared against zero.
+
+(define_expand "cbranch<mode>4"
+  [(set (pc)
+        (if_then_else
+          (match_operator 0 "aarch64_equality_operator"
+            [(match_operand:VDQ_BHSI 1 "register_operand")
+             (match_operand:VDQ_BHSI 2 "aarch64_simd_reg_or_zero")])
+          (label_ref (match_operand 3 ""))
+          (pc)))]
+  "TARGET_SIMD"
+{
+  /* Honour the comparison requested in operand 0 instead of assuming NE.  */
+  auto code = GET_CODE (operands[0]);
+  rtx tmp = operands[1];
+
+  /* If comparing against a non-zero vector, XOR the inputs first so that
+     what remains is always a comparison of the result against zero.  */
+  if (operands[2] != CONST0_RTX (<MODE>mode))
+    {
+      tmp = gen_reg_rtx (<MODE>mode);
+      emit_insn (gen_xor<mode>3 (tmp, operands[1], operands[2]));
+    }
+
+  rtx val = gen_reg_rtx (DImode);
+
+  /* For 64-bit vectors we need no reduction.  */
+  if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+    {
+      /* Always reduce using a V4SI: after the pairwise unsigned maximum the
+         low 64 bits are non-zero iff any bit of the 128-bit input was set.  */
+      rtx reduc = simplify_gen_subreg (V4SImode, tmp, <MODE>mode, 0);
+      rtx res = gen_reg_rtx (V4SImode);
+      emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
+      emit_move_insn (val, simplify_gen_subreg (DImode, res, V4SImode, 0));
+    }
+  else
+    emit_move_insn (val, simplify_gen_subreg (DImode, tmp, <MODE>mode, 0));
+
+  /* Compare the reduced scalar against zero and branch on the result.  */
+  rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
+  rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
+  emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
+  DONE;
+})
+
 ;; Patterns comparing two vectors to produce a mask.
 
 (define_expand "vec_cmp<mode><mode>"
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 5cbf54bd2a23dfdc5dc7b148b0dc6ed4c63814ae..8964cbd6610a718711546d312e89cee937d210e8 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3653,8 +3653,7 @@ proc check_effective_target_vect_int { } {
 proc check_effective_target_vect_early_break { } {
     return [check_cached_effective_target_indexed vect_early_break {
       expr {
-	([istarget aarch64*-*-*]
-	 && [check_effective_target_aarch64_sve])
+	[istarget aarch64*-*-*]
 	}}]
 }
 # Return 1 if the target supports hardware vectorization of complex additions of
diff mbox series

Patch

--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3795,6 +3795,57 @@  (define_expand "vcond_mask_<mode><v_int_equiv>"
   DONE;
 })
 
+;; Patterns comparing two vectors and conditionally jumping.
+;;
+;; Operand 0 is an equality comparison (as accepted by
+;; aarch64_equality_operator) of vector operands 1 and 2, and operand 3 is
+;; the label to branch to.  Two vectors are equal iff every bit of their XOR
+;; is zero, so the test is reduced to a 64-bit scalar compared against zero.
+
+(define_expand "cbranch<mode>4"
+  [(set (pc)
+        (if_then_else
+          (match_operator 0 "aarch64_equality_operator"
+            [(match_operand:VDQ_BHSI 1 "register_operand")
+             (match_operand:VDQ_BHSI 2 "aarch64_simd_reg_or_zero")])
+          (label_ref (match_operand 3 ""))
+          (pc)))]
+  "TARGET_SIMD"
+{
+  /* Honour the comparison requested in operand 0 instead of assuming NE.  */
+  auto code = GET_CODE (operands[0]);
+  rtx tmp = operands[1];
+
+  /* If comparing against a non-zero vector, XOR the inputs first so that
+     what remains is always a comparison of the result against zero.  */
+  if (operands[2] != CONST0_RTX (<MODE>mode))
+    {
+      tmp = gen_reg_rtx (<MODE>mode);
+      emit_insn (gen_xor<mode>3 (tmp, operands[1], operands[2]));
+    }
+
+  rtx val = gen_reg_rtx (DImode);
+
+  /* For 64-bit vectors we need no reduction.  */
+  if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+    {
+      /* Always reduce using a V4SI: after the pairwise unsigned maximum the
+         low 64 bits are non-zero iff any bit of the 128-bit input was set.  */
+      rtx reduc = simplify_gen_subreg (V4SImode, tmp, <MODE>mode, 0);
+      rtx res = gen_reg_rtx (V4SImode);
+      emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
+      emit_move_insn (val, simplify_gen_subreg (DImode, res, V4SImode, 0));
+    }
+  else
+    emit_move_insn (val, simplify_gen_subreg (DImode, tmp, <MODE>mode, 0));
+
+  /* Compare the reduced scalar against zero and branch on the result.  */
+  rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
+  rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
+  emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
+  DONE;
+})
+
 ;; Patterns comparing two vectors to produce a mask.
 
 (define_expand "vec_cmp<mode><mode>"
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 5cbf54bd2a23dfdc5dc7b148b0dc6ed4c63814ae..8964cbd6610a718711546d312e89cee937d210e8 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3653,8 +3653,7 @@  proc check_effective_target_vect_int { } {
 proc check_effective_target_vect_early_break { } {
     return [check_cached_effective_target_indexed vect_early_break {
       expr {
-	([istarget aarch64*-*-*]
-	 && [check_effective_target_aarch64_sve])
+	[istarget aarch64*-*-*]
 	}}]
 }
 # Return 1 if the target supports hardware vectorization of complex additions of