diff mbox series

[19/19] Arm: Add MVE cbranch implementation

Message ID ZJw6i/lre3vnMev/@arm.com
State New
Headers show
Series Support early break/return auto-vectorization | expand

Commit Message

Tamar Christina June 28, 2023, 1:50 p.m. UTC
Hi All,

This adds an implementation for conditional branch optab for MVE.

Unfortunately MVE has rather limited operations on VPT.P0, we are missing the
ability to do P0 comparisons and logical OR on P0.

For that reason we can only support cbranch with 0, as for comparing to a 0
predicate we don't need to actually do a comparison, we only have to check that
any bit is set within P0.

Because we can only do P0 comparisons with 0, the costing of the comparison was
reduced in order for the compiler not to try to push 0 to a register thinking
it's too expensive.  For the cbranch implementation to be safe we must see the
constant 0 vector.

For the lack of logical OR on P0 we can't really work around.  This means MVE
can't support cases where the sizes of operands in the comparison don't match,
i.e. when one operand has been unpacked.

For e.g.

void f1 ()
{
  for (int i = 0; i < N; i++)
    {
      b[i] += a[i];
      if (a[i] > 0)
	break;
    }
}

For 128-bit vectors we generate:

        vcmp.s32        gt, q3, q1
        vmrs    r3, p0  @ movhi
        cbnz    r3, .L2

MVE does not have 64-bit vector comparisons, as such that is also not supported.

Bootstrapped arm-none-linux-gnueabihf and regtested with
-march=armv8.1-m.main+mve -mfpu=auto and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/arm/arm.cc (arm_rtx_costs_internal): Update costs for pred 0
	compares.
	* config/arm/mve.md (cbranch<mode>4): New.

gcc/testsuite/ChangeLog:

	* lib/target-supports.exp (vect_early_break): Add MVE.
	* gcc.target/arm/mve/vect-early-break-cbranch.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 38f0839de1c75547c259ac3d655fcfc14e7208a2..15e65c15cb3cb6f70161787e84b255a24eb51e32 100644




--
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 38f0839de1c75547c259ac3d655fcfc14e7208a2..15e65c15cb3cb6f70161787e84b255a24eb51e32 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -11883,6 +11883,15 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code,
 	   || TARGET_HAVE_MVE)
 	  && simd_immediate_valid_for_move (x, mode, NULL, NULL))
 	*cost = COSTS_N_INSNS (1);
+      else if (TARGET_HAVE_MVE
+	       && outer_code == COMPARE
+	       && VALID_MVE_PRED_MODE (mode))
+	/* MVE allows very limited instructions on VPT.P0,  however comparisons
+	   to 0 do not require us to materialze this constant or require a
+	   predicate comparison as we can go through SImode.  For that reason
+	   allow P0 CMP 0 as a cheap operation such that the 0 isn't forced to
+	   registers as we can't compare two predicates.  */
+	*cost = COSTS_N_INSNS (1);
       else
 	*cost = COSTS_N_INSNS (4);
       return true;
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 74909ce47e132c22a94f7d9cd3a0921b38e33051..95d40770ecc25f9eb251eba38306dd43cbebfb3f 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -6880,6 +6880,21 @@ (define_expand "vcond_mask_<mode><MVE_vpred>"
   DONE;
 })
 
+(define_expand "cbranch<mode>4"
+  [(set (pc) (if_then_else
+	      (match_operator 0 "expandable_comparison_operator"
+	       [(match_operand:MVE_7 1 "register_operand")
+	        (match_operand:MVE_7 2 "zero_operand")])
+	      (label_ref (match_operand 3 "" ""))
+	      (pc)))]
+  "TARGET_HAVE_MVE"
+{
+  rtx val = gen_reg_rtx (SImode);
+  emit_move_insn (val, gen_lowpart (SImode, operands[1]));
+  emit_jump_insn (gen_cbranchsi4 (operands[0], val, const0_rtx, operands[3]));
+  DONE;
+})
+
 ;; Reinterpret operand 1 in operand 0's mode, without changing its contents.
 (define_expand "@arm_mve_reinterpret<mode>"
   [(set (match_operand:MVE_vecs 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/arm/mve/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/arm/mve/vect-early-break-cbranch.c
new file mode 100644
index 0000000000000000000000000000000000000000..c3b8506dca0b2b044e6869a6c8259d663c1ff930
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/vect-early-break-cbranch.c
@@ -0,0 +1,117 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+/*
+** f1:
+**	...
+**	vcmp.s32	gt, q[0-9]+, q[0-9]+
+**	vmrs	r[0-9]+, p0	@ movhi
+**	cbnz	r[0-9]+, \.L[0-9]+
+**	...
+*/
+void f1 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] > 0)
+	break;
+    }
+}
+
+/*
+** f2:
+**	...
+**	vcmp.s32	ge, q[0-9]+, q[0-9]+
+**	vmrs	r[0-9]+, p0	@ movhi
+**	cbnz	r[0-9]+, \.L[0-9]+
+**	...
+*/
+void f2 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] >= 0)
+	break;
+    }
+}
+
+/*
+** f3:
+**	...
+**	vcmp.i32	eq, q[0-9]+, q[0-9]+
+**	vmrs	r[0-9]+, p0	@ movhi
+**	cbnz	r[0-9]+, \.L[0-9]+
+**	...
+*/
+void f3 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] == 0)
+	break;
+    }
+}
+
+/*
+** f4:
+**	...
+**	vcmp.i32	ne, q[0-9]+, q[0-9]+
+**	vmrs	r[0-9]+, p0	@ movhi
+**	cbnz	r[0-9]+, \.L[0-9]+
+**	...
+*/
+void f4 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] != 0)
+	break;
+    }
+}
+
+/*
+** f5:
+**	...
+**	vcmp.s32	lt, q[0-9]+, q[0-9]+
+**	vmrs	r[0-9]+, p0	@ movhi
+**	cbnz	r[0-9]+, \.L[0-9]+
+**	...
+*/
+void f5 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] < 0)
+	break;
+    }
+}
+
+/*
+** f6:
+**	...
+**	vcmp.s32	le, q[0-9]+, q[0-9]+
+**	vmrs	r[0-9]+, p0	@ movhi
+**	cbnz	r[0-9]+, \.L[0-9]+
+**	...
+*/
+void f6 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] <= 0)
+	break;
+    }
+}
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 8f58671e6cfd3546c6a98e40341fe31c6492594b..1eef764542a782786e27ed935a06243e319ae3fc 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3785,6 +3785,8 @@ proc check_effective_target_vect_early_break { } {
       expr {
 	[istarget aarch64*-*-*]
 	|| [check_effective_target_arm_neon_ok]
+	|| ([check_effective_target_arm_v8_1m_mve_fp_ok]
+	     && [check_effective_target_arm_little_endian])
 	}}]
 }
 # Return 1 if the target supports hardware vectorization of complex additions of
diff mbox series

Patch

--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -11883,6 +11883,15 @@  arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code,
 	   || TARGET_HAVE_MVE)
 	  && simd_immediate_valid_for_move (x, mode, NULL, NULL))
 	*cost = COSTS_N_INSNS (1);
+      else if (TARGET_HAVE_MVE
+	       && outer_code == COMPARE
+	       && VALID_MVE_PRED_MODE (mode))
+	/* MVE allows very limited instructions on VPT.P0,  however comparisons
+	   to 0 do not require us to materialze this constant or require a
+	   predicate comparison as we can go through SImode.  For that reason
+	   allow P0 CMP 0 as a cheap operation such that the 0 isn't forced to
+	   registers as we can't compare two predicates.  */
+	*cost = COSTS_N_INSNS (1);
       else
 	*cost = COSTS_N_INSNS (4);
       return true;
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 74909ce47e132c22a94f7d9cd3a0921b38e33051..95d40770ecc25f9eb251eba38306dd43cbebfb3f 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -6880,6 +6880,21 @@  (define_expand "vcond_mask_<mode><MVE_vpred>"
   DONE;
 })
 
+(define_expand "cbranch<mode>4"
+  [(set (pc) (if_then_else
+	      (match_operator 0 "expandable_comparison_operator"
+	       [(match_operand:MVE_7 1 "register_operand")
+	        (match_operand:MVE_7 2 "zero_operand")])
+	      (label_ref (match_operand 3 "" ""))
+	      (pc)))]
+  "TARGET_HAVE_MVE"
+{
+  rtx val = gen_reg_rtx (SImode);
+  emit_move_insn (val, gen_lowpart (SImode, operands[1]));
+  emit_jump_insn (gen_cbranchsi4 (operands[0], val, const0_rtx, operands[3]));
+  DONE;
+})
+
 ;; Reinterpret operand 1 in operand 0's mode, without changing its contents.
 (define_expand "@arm_mve_reinterpret<mode>"
   [(set (match_operand:MVE_vecs 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/arm/mve/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/arm/mve/vect-early-break-cbranch.c
new file mode 100644
index 0000000000000000000000000000000000000000..c3b8506dca0b2b044e6869a6c8259d663c1ff930
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/vect-early-break-cbranch.c
@@ -0,0 +1,117 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+/*
+** f1:
+**	...
+**	vcmp.s32	gt, q[0-9]+, q[0-9]+
+**	vmrs	r[0-9]+, p0	@ movhi
+**	cbnz	r[0-9]+, \.L[0-9]+
+**	...
+*/
+void f1 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] > 0)
+	break;
+    }
+}
+
+/*
+** f2:
+**	...
+**	vcmp.s32	ge, q[0-9]+, q[0-9]+
+**	vmrs	r[0-9]+, p0	@ movhi
+**	cbnz	r[0-9]+, \.L[0-9]+
+**	...
+*/
+void f2 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] >= 0)
+	break;
+    }
+}
+
+/*
+** f3:
+**	...
+**	vcmp.i32	eq, q[0-9]+, q[0-9]+
+**	vmrs	r[0-9]+, p0	@ movhi
+**	cbnz	r[0-9]+, \.L[0-9]+
+**	...
+*/
+void f3 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] == 0)
+	break;
+    }
+}
+
+/*
+** f4:
+**	...
+**	vcmp.i32	ne, q[0-9]+, q[0-9]+
+**	vmrs	r[0-9]+, p0	@ movhi
+**	cbnz	r[0-9]+, \.L[0-9]+
+**	...
+*/
+void f4 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] != 0)
+	break;
+    }
+}
+
+/*
+** f5:
+**	...
+**	vcmp.s32	lt, q[0-9]+, q[0-9]+
+**	vmrs	r[0-9]+, p0	@ movhi
+**	cbnz	r[0-9]+, \.L[0-9]+
+**	...
+*/
+void f5 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] < 0)
+	break;
+    }
+}
+
+/*
+** f6:
+**	...
+**	vcmp.s32	le, q[0-9]+, q[0-9]+
+**	vmrs	r[0-9]+, p0	@ movhi
+**	cbnz	r[0-9]+, \.L[0-9]+
+**	...
+*/
+void f6 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] <= 0)
+	break;
+    }
+}
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 8f58671e6cfd3546c6a98e40341fe31c6492594b..1eef764542a782786e27ed935a06243e319ae3fc 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3785,6 +3785,8 @@  proc check_effective_target_vect_early_break { } {
       expr {
 	[istarget aarch64*-*-*]
 	|| [check_effective_target_arm_neon_ok]
+	|| ([check_effective_target_arm_v8_1m_mve_fp_ok]
+	     && [check_effective_target_arm_little_endian])
 	}}]
 }
 # Return 1 if the target supports hardware vectorization of complex additions of