diff mbox series

AArch64: Fix codegen regressions around tbz.

Message ID patch-16829-tamar@arm.com
State New
Headers show
Series AArch64: Fix codegen regressions around tbz. | expand

Commit Message

Tamar Christina Jan. 27, 2023, 10:39 a.m. UTC
Hi All,

We were analyzing code quality after recent changes and have noticed that the
tbz support somehow managed to increase the number of branches overall rather
than decrease them.

While investigating this we figured out that the problem is that when an
existing & <constant> exists in gimple, and the instruction is generated
because of the range information obtained from the ANDed constant, we end up
with the situation that you get a NOP AND in the RTL expansion.

This is not a problem as CSE will take care of it normally.   The issue is when
this original AND was done in a location where PRE or FRE "lift" the AND to a
different basic block.  This triggers a problem when the resulting value is not
single use.  Instead of having an AND and tbz, we end up generating an
AND + TST + BR if the mode is HI or QI.

This CSE across BB was a problem before but this change made it worse. Our
branch patterns rely on combine being able to fold AND or zero_extends into the
instructions.

To work around this (since a proper fix is outside of the scope of stage-4) we
are limiting the new tbranch optab to only HI and QI mode values.  This isn't a
problem because these two modes are modes for which we don't have CBZ support,
so they are the problematic cases to begin with.  Additionally booleans are QI.

The second thing we're doing is limiting the only legal bitpos to position 0,
i.e. only the bottom bit.  This is so that we prevent the double ANDs as much
as possible.

Now most other cases, i.e. where we had an explicit & in the source code are
still handled correctly by the anonymous (*tb<optab><ALLI:mode><GPI:mode>1)
pattern that was added along with tbranch support.

This means we don't expand the superfluous AND here, and while it doesn't fix the
problem that in the cross-BB case we lose tbz, it also doesn't make things worse.

With these tweaks we've now reduced the number of insns uniformly as originally
expected.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.md (tbranch_<code><mode>3): Restrict to SHORT
	and bottom bit only.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/tbz_2.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 2c1367977a68fc8e4289118e07bb61398856791e..aa09e93d85e9628e8944e03498697eb9597ef867 100644




--
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 2c1367977a68fc8e4289118e07bb61398856791e..aa09e93d85e9628e8944e03498697eb9597ef867 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -949,8 +949,8 @@ (define_insn "*cb<optab><mode>1"
 
 (define_expand "tbranch_<code><mode>3"
   [(set (pc) (if_then_else
-              (EQL (match_operand:ALLI 0 "register_operand")
-                   (match_operand 1 "aarch64_simd_shift_imm_<mode>"))
+              (EQL (match_operand:SHORT 0 "register_operand")
+                   (match_operand 1 "const0_operand"))
               (label_ref (match_operand 2 ""))
               (pc)))]
   ""
diff --git a/gcc/testsuite/gcc.target/aarch64/tbz_2.c b/gcc/testsuite/gcc.target/aarch64/tbz_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..ec128b58f35276a7c5452685a65c73f95f2d5f9a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/tbz_2.c
@@ -0,0 +1,130 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99  -fno-unwind-tables -fno-asynchronous-unwind-tables" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdbool.h>
+
+void h(void);
+
+/*
+** g1:
+** 	cbnz	w0, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g1(int x)
+{
+  if (__builtin_expect (x, 0))
+    h ();
+}
+
+/* 
+** g2:
+** 	tbnz	x0, 0, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g2(int x)
+{
+  if (__builtin_expect (x & 1, 0))
+    h ();
+}
+
+/* 
+** g3:
+** 	tbnz	x0, 3, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g3(int x)
+{
+  if (__builtin_expect (x & 8, 0))
+    h ();
+}
+
+/* 
+** g4:
+** 	tbnz	w0, #31, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g4(int x)
+{
+  if (__builtin_expect (x & (1 << 31), 0))
+    h ();
+}
+
+/* 
+** g5:
+** 	tst	w0, 255
+** 	bne	.L[0-9]+
+** 	ret
+** 	...
+*/
+void g5(char x)
+{
+  if (__builtin_expect (x, 0))
+    h ();
+}
+
+/* 
+** g6:
+** 	tbnz	w0, 0, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g6(char x)
+{
+  if (__builtin_expect (x & 1, 0))
+    h ();
+}
+
+/* 
+** g7:
+** 	tst	w0, 3
+** 	bne	.L[0-9]+
+** 	ret
+** 	...
+*/
+void g7(char x)
+{
+  if (__builtin_expect (x & 3, 0))
+    h ();
+}
+
+/* 
+** g8:
+** 	tbnz	w0, 7, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g8(char x)
+{
+  if (__builtin_expect (x & (1 << 7), 0))
+    h ();
+}
+
+/* 
+** g9:
+** 	tbnz	w0, 0, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g9(bool x)
+{
+  if (__builtin_expect (x, 0))
+    h ();
+}
+
+/* 
+** g10:
+** 	tbnz	w0, 0, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g10(bool x)
+{
+  if (__builtin_expect (x & 1, 0))
+    h ();
+}
+

Comments

Richard Sandiford Jan. 27, 2023, 12:25 p.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> We were analyzing code quality after recent changes and have noticed that the
> tbz support somehow managed to increase the number of branches overall rather
> than decreased them.
>
> While investigating this we figured out that the problem is that when an
> existing & <contants> exists in gimple and the instruction is generated because
> of the range information gotten from the ANDed constant that we end up with the
> situation that you get a NOP AND in the RTL expansion.
>
> This is not a problem as CSE will take care of it normally.   The issue is when
> this original AND was done in a location where PRE or FRE "lift" the AND to a
> different basic block.  This triggers a problem when the resulting value is not
> single use.  Instead of having an AND and tbz, we end up generating an
> AND + TST + BR if the mode is HI or QI.
>
> This CSE across BB was a problem before but this change made it worse. Our
> branch patterns rely on combine being able to fold AND or zero_extends into the
> instructions.
>
> To work around this (since a proper fix is outside of the scope of stage-4) we
> are limiting the new tbranch optab to only HI and QI mode values.  This isn't a
> problem because these two modes are modes for which we don't have CBZ support,
> so they are the problematic cases to begin with.  Additionally booleans are QI.
>
> The second thing we're doing is limiting the only legal bitpos to pos 0. i.e.
> only the bottom bit.  This such that we prevent the double ANDs as much as
> possible.
>
> Now most other cases, i.e. where we had an explicit & in the source code are
> still handled correctly by the anonymous (*tb<optab><ALLI:mode><GPI:mode>1)
> pattern that was added along with tbranch support.
>
> This means we don't expand the superflous AND here, and while it doesn't fix the
> problem that in the cross BB case we loss tbz, it also doesn't make things worse.
>
> With these tweaks we've now reduced the number of insn uniformly as originally
> expected.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.md (tbranch_<code><mode>3): Restrict to SHORT
> 	and bottom bit only.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/tbz_2.c: New test.

Agreed that reducing the scope of the new optimisation seems like a safe
compromise for GCC 13.  But could you add a testcase that shows the
effect of both changes (reducing the mode selection and the bit
selection)?  The test above passes even without the patch.

It would be good to have a PR tracking this too.

Personally, I think we should try to get to the stage where gimple
does the CSE optimisations we need, and where the tbranch optab can
generate a tbz directly (rather than splitting it apart and hoping
that combine will put it back together later).

Thanks,
Richard

> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 2c1367977a68fc8e4289118e07bb61398856791e..aa09e93d85e9628e8944e03498697eb9597ef867 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -949,8 +949,8 @@ (define_insn "*cb<optab><mode>1"
>  
>  (define_expand "tbranch_<code><mode>3"
>    [(set (pc) (if_then_else
> -              (EQL (match_operand:ALLI 0 "register_operand")
> -                   (match_operand 1 "aarch64_simd_shift_imm_<mode>"))
> +              (EQL (match_operand:SHORT 0 "register_operand")
> +                   (match_operand 1 "const0_operand"))
>                (label_ref (match_operand 2 ""))
>                (pc)))]
>    ""
> diff --git a/gcc/testsuite/gcc.target/aarch64/tbz_2.c b/gcc/testsuite/gcc.target/aarch64/tbz_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..ec128b58f35276a7c5452685a65c73f95f2d5f9a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/tbz_2.c
> @@ -0,0 +1,130 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2 -std=c99  -fno-unwind-tables -fno-asynchronous-unwind-tables" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <stdbool.h>
> +
> +void h(void);
> +
> +/*
> +** g1:
> +** 	cbnz	w0, .L[0-9]+
> +** 	ret
> +** 	...
> +*/
> +void g1(int x)
> +{
> +  if (__builtin_expect (x, 0))
> +    h ();
> +}
> +
> +/* 
> +** g2:
> +** 	tbnz	x0, 0, .L[0-9]+
> +** 	ret
> +** 	...
> +*/
> +void g2(int x)
> +{
> +  if (__builtin_expect (x & 1, 0))
> +    h ();
> +}
> +
> +/* 
> +** g3:
> +** 	tbnz	x0, 3, .L[0-9]+
> +** 	ret
> +** 	...
> +*/
> +void g3(int x)
> +{
> +  if (__builtin_expect (x & 8, 0))
> +    h ();
> +}
> +
> +/* 
> +** g4:
> +** 	tbnz	w0, #31, .L[0-9]+
> +** 	ret
> +** 	...
> +*/
> +void g4(int x)
> +{
> +  if (__builtin_expect (x & (1 << 31), 0))
> +    h ();
> +}
> +
> +/* 
> +** g5:
> +** 	tst	w0, 255
> +** 	bne	.L[0-9]+
> +** 	ret
> +** 	...
> +*/
> +void g5(char x)
> +{
> +  if (__builtin_expect (x, 0))
> +    h ();
> +}
> +
> +/* 
> +** g6:
> +** 	tbnz	w0, 0, .L[0-9]+
> +** 	ret
> +** 	...
> +*/
> +void g6(char x)
> +{
> +  if (__builtin_expect (x & 1, 0))
> +    h ();
> +}
> +
> +/* 
> +** g7:
> +** 	tst	w0, 3
> +** 	bne	.L[0-9]+
> +** 	ret
> +** 	...
> +*/
> +void g7(char x)
> +{
> +  if (__builtin_expect (x & 3, 0))
> +    h ();
> +}
> +
> +/* 
> +** g8:
> +** 	tbnz	w0, 7, .L[0-9]+
> +** 	ret
> +** 	...
> +*/
> +void g8(char x)
> +{
> +  if (__builtin_expect (x & (1 << 7), 0))
> +    h ();
> +}
> +
> +/* 
> +** g9:
> +** 	tbnz	w0, 0, .L[0-9]+
> +** 	ret
> +** 	...
> +*/
> +void g9(bool x)
> +{
> +  if (__builtin_expect (x, 0))
> +    h ();
> +}
> +
> +/* 
> +** g10:
> +** 	tbnz	w0, 0, .L[0-9]+
> +** 	ret
> +** 	...
> +*/
> +void g10(bool x)
> +{
> +  if (__builtin_expect (x & 1, 0))
> +    h ();
> +}
> +
Tamar Christina Jan. 30, 2023, 2:17 p.m. UTC | #2
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Friday, January 27, 2023 12:26 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH]AArch64: Fix codegen regressions around tbz.
> 
> Tamar Christina <tamar.christina@arm.com> writes:
> > Hi All,
> >
> > We were analyzing code quality after recent changes and have noticed
> > that the tbz support somehow managed to increase the number of
> > branches overall rather than decreased them.
> >
> > While investigating this we figured out that the problem is that when
> > an existing & <contants> exists in gimple and the instruction is
> > generated because of the range information gotten from the ANDed
> > constant that we end up with the situation that you get a NOP AND in the
> RTL expansion.
> >
> > This is not a problem as CSE will take care of it normally.   The issue is when
> > this original AND was done in a location where PRE or FRE "lift" the
> > AND to a different basic block.  This triggers a problem when the
> > resulting value is not single use.  Instead of having an AND and tbz,
> > we end up generating an AND + TST + BR if the mode is HI or QI.
> >
> > This CSE across BB was a problem before but this change made it worse.
> > Our branch patterns rely on combine being able to fold AND or
> > zero_extends into the instructions.
> >
> > To work around this (since a proper fix is outside of the scope of
> > stage-4) we are limiting the new tbranch optab to only HI and QI mode
> > values.  This isn't a problem because these two modes are modes for
> > which we don't have CBZ support, so they are the problematic cases to
> begin with.  Additionally booleans are QI.
> >
> > The second thing we're doing is limiting the only legal bitpos to pos 0. i.e.
> > only the bottom bit.  This such that we prevent the double ANDs as
> > much as possible.
> >
> > Now most other cases, i.e. where we had an explicit & in the source
> > code are still handled correctly by the anonymous
> > (*tb<optab><ALLI:mode><GPI:mode>1)
> > pattern that was added along with tbranch support.
> >
> > This means we don't expand the superflous AND here, and while it
> > doesn't fix the problem that in the cross BB case we loss tbz, it also doesn't
> make things worse.
> >
> > With these tweaks we've now reduced the number of insn uniformly as
> > originally expected.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* config/aarch64/aarch64.md (tbranch_<code><mode>3): Restrict to
> SHORT
> > 	and bottom bit only.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.target/aarch64/tbz_2.c: New test.
> 
> Agreed that reducing the scope of the new optimisation seems like a safe
> compromise for GCC 13.  But could you add a testcase that shows the effect
> of both changes (reducing the mode selection and the bit selection)?  The
> test above passes even without the patch.
> 
> It would be good to have a PR tracking this too.

I've been trying to isolate a small testcase to include, and it's not been trivial, as GCC
will do various transformations on smaller sequences to still do the right thing.

I have various testcases where GCC is doing the wrong thing for the branches, but as soon
as my repro for the cases this fixes gets too small, the problem gets hidden.

> 
> Personally, I think we should try to get to the stage where gimple does the
> CSE optimisations we need, and where the tbranch optab can generate a tbz
> directly (rather than splitting it apart and hoping that combine will put it back
> together later).

Agreed, but that doesn't solve all the problems though. GCC is in general quite bad at branch
layouts, especially w.r.t. branch distances. For instance BB rotation doesn't take distance into
account. And TBZ, CBZ, B have different ranges.  Since the distance isn't taken into account we
end up "optimizing" the branch and then at codegen doing an emergency distance enlargement
using a TST + B to replace whatever we optimized to.

LLVM does much better in all of these scenarios, so it's likely that the entire branch strategy needs
an overhaul.

https://godbolt.org/z/1dhf7T5he  shows some of the mini examples.  I can keep trying to make a
reproducer for what my patch changes if you'd like, but I don't think it will be small.  The general
problem is the same as in f2, but we introduce it in the cases of f3 as well (bool) — though adding & to a bool
will get it trivially optimized out.

Thanks,
Tamar

> 
> Thanks,
> Richard
> 
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64.md
> > b/gcc/config/aarch64/aarch64.md index
> >
> 2c1367977a68fc8e4289118e07bb61398856791e..aa09e93d85e9628e8944e0349
> 869
> > 7eb9597ef867 100644
> > --- a/gcc/config/aarch64/aarch64.md
> > +++ b/gcc/config/aarch64/aarch64.md
> > @@ -949,8 +949,8 @@ (define_insn "*cb<optab><mode>1"
> >
> >  (define_expand "tbranch_<code><mode>3"
> >    [(set (pc) (if_then_else
> > -              (EQL (match_operand:ALLI 0 "register_operand")
> > -                   (match_operand 1 "aarch64_simd_shift_imm_<mode>"))
> > +              (EQL (match_operand:SHORT 0 "register_operand")
> > +                   (match_operand 1 "const0_operand"))
> >                (label_ref (match_operand 2 ""))
> >                (pc)))]
> >    ""
> > diff --git a/gcc/testsuite/gcc.target/aarch64/tbz_2.c
> > b/gcc/testsuite/gcc.target/aarch64/tbz_2.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..ec128b58f35276a7c5452685a6
> 5c
> > 73f95f2d5f9a
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/tbz_2.c
> > @@ -0,0 +1,130 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O2 -std=c99  -fno-unwind-tables
> > +-fno-asynchronous-unwind-tables" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +#include <stdbool.h>
> > +
> > +void h(void);
> > +
> > +/*
> > +** g1:
> > +** 	cbnz	w0, .L[0-9]+
> > +** 	ret
> > +** 	...
> > +*/
> > +void g1(int x)
> > +{
> > +  if (__builtin_expect (x, 0))
> > +    h ();
> > +}
> > +
> > +/*
> > +** g2:
> > +** 	tbnz	x0, 0, .L[0-9]+
> > +** 	ret
> > +** 	...
> > +*/
> > +void g2(int x)
> > +{
> > +  if (__builtin_expect (x & 1, 0))
> > +    h ();
> > +}
> > +
> > +/*
> > +** g3:
> > +** 	tbnz	x0, 3, .L[0-9]+
> > +** 	ret
> > +** 	...
> > +*/
> > +void g3(int x)
> > +{
> > +  if (__builtin_expect (x & 8, 0))
> > +    h ();
> > +}
> > +
> > +/*
> > +** g4:
> > +** 	tbnz	w0, #31, .L[0-9]+
> > +** 	ret
> > +** 	...
> > +*/
> > +void g4(int x)
> > +{
> > +  if (__builtin_expect (x & (1 << 31), 0))
> > +    h ();
> > +}
> > +
> > +/*
> > +** g5:
> > +** 	tst	w0, 255
> > +** 	bne	.L[0-9]+
> > +** 	ret
> > +** 	...
> > +*/
> > +void g5(char x)
> > +{
> > +  if (__builtin_expect (x, 0))
> > +    h ();
> > +}
> > +
> > +/*
> > +** g6:
> > +** 	tbnz	w0, 0, .L[0-9]+
> > +** 	ret
> > +** 	...
> > +*/
> > +void g6(char x)
> > +{
> > +  if (__builtin_expect (x & 1, 0))
> > +    h ();
> > +}
> > +
> > +/*
> > +** g7:
> > +** 	tst	w0, 3
> > +** 	bne	.L[0-9]+
> > +** 	ret
> > +** 	...
> > +*/
> > +void g7(char x)
> > +{
> > +  if (__builtin_expect (x & 3, 0))
> > +    h ();
> > +}
> > +
> > +/*
> > +** g8:
> > +** 	tbnz	w0, 7, .L[0-9]+
> > +** 	ret
> > +** 	...
> > +*/
> > +void g8(char x)
> > +{
> > +  if (__builtin_expect (x & (1 << 7), 0))
> > +    h ();
> > +}
> > +
> > +/*
> > +** g9:
> > +** 	tbnz	w0, 0, .L[0-9]+
> > +** 	ret
> > +** 	...
> > +*/
> > +void g9(bool x)
> > +{
> > +  if (__builtin_expect (x, 0))
> > +    h ();
> > +}
> > +
> > +/*
> > +** g10:
> > +** 	tbnz	w0, 0, .L[0-9]+
> > +** 	ret
> > +** 	...
> > +*/
> > +void g10(bool x)
> > +{
> > +  if (__builtin_expect (x & 1, 0))
> > +    h ();
> > +}
> > +
Richard Sandiford March 6, 2023, 7:03 p.m. UTC | #3
Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Friday, January 27, 2023 12:26 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
>> Subject: Re: [PATCH]AArch64: Fix codegen regressions around tbz.
>> 
>> Tamar Christina <tamar.christina@arm.com> writes:
>> > Hi All,
>> >
>> > We were analyzing code quality after recent changes and have noticed
>> > that the tbz support somehow managed to increase the number of
>> > branches overall rather than decreased them.
>> >
>> > While investigating this we figured out that the problem is that when
>> > an existing & <contants> exists in gimple and the instruction is
>> > generated because of the range information gotten from the ANDed
>> > constant that we end up with the situation that you get a NOP AND in the
>> RTL expansion.
>> >
>> > This is not a problem as CSE will take care of it normally.   The issue is when
>> > this original AND was done in a location where PRE or FRE "lift" the
>> > AND to a different basic block.  This triggers a problem when the
>> > resulting value is not single use.  Instead of having an AND and tbz,
>> > we end up generating an AND + TST + BR if the mode is HI or QI.
>> >
>> > This CSE across BB was a problem before but this change made it worse.
>> > Our branch patterns rely on combine being able to fold AND or
>> > zero_extends into the instructions.
>> >
>> > To work around this (since a proper fix is outside of the scope of
>> > stage-4) we are limiting the new tbranch optab to only HI and QI mode
>> > values.  This isn't a problem because these two modes are modes for
>> > which we don't have CBZ support, so they are the problematic cases to
>> begin with.  Additionally booleans are QI.
>> >
>> > The second thing we're doing is limiting the only legal bitpos to pos 0. i.e.
>> > only the bottom bit.  This such that we prevent the double ANDs as
>> > much as possible.
>> >
>> > Now most other cases, i.e. where we had an explicit & in the source
>> > code are still handled correctly by the anonymous
>> > (*tb<optab><ALLI:mode><GPI:mode>1)
>> > pattern that was added along with tbranch support.
>> >
>> > This means we don't expand the superflous AND here, and while it
>> > doesn't fix the problem that in the cross BB case we loss tbz, it also doesn't
>> make things worse.
>> >
>> > With these tweaks we've now reduced the number of insn uniformly as
>> > originally expected.
>> >
>> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>> >
>> > Ok for master?
>> >
>> > Thanks,
>> > Tamar
>> >
>> > gcc/ChangeLog:
>> >
>> > 	* config/aarch64/aarch64.md (tbranch_<code><mode>3): Restrict to
>> SHORT
>> > 	and bottom bit only.
>> >
>> > gcc/testsuite/ChangeLog:
>> >
>> > 	* gcc.target/aarch64/tbz_2.c: New test.
>> 
>> Agreed that reducing the scope of the new optimisation seems like a safe
>> compromise for GCC 13.  But could you add a testcase that shows the effect
>> of both changes (reducing the mode selection and the bit selection)?  The
>> test above passes even without the patch.
>> 
>> It would be good to have a PR tracking this too.
>
> I've been trying to isolate a small testcase to include, and it's not been trivial as GCC
> will do various transformations on smaller sequences to still do the right thing.
>
> I have various testcase where GCC is doing the wrong thing for the branches but as soon
> As my repro for the cases this fixes gets too small problem gets hidden..

Thanks for sharing one of these off-list.  It's a bit of a contrived
example, but:

void g(int);

void
f (unsigned int x, _Bool y)
{
  for (int i = 0; i < 100; ++i)
    {
      if ((x >> 31) | y)
	g (1);
      for (int j = 0; j < 100; ++j)
	g (2);
    }
}

generates an extra AND without the patch, so a test for:

  { dg-final { scan-assembler-times {and\t} 1 } }

would cover it.  This seemed to be at least related to the problem
in one of the functions in the bigger test.

The patch is OK with a testcase along those lines in addition to the
original tbz_2.c.  (I agree we should keep the tbz_2.c one, for any
extra coverage it gives.)

>> Personally, I think we should try to get to the stage where gimple does the
>> CSE optimisations we need, and where the tbranch optab can generate a tbz
>> directly (rather than splitting it apart and hoping that combine will put it back
>> together later).
>
> Agreed, but that doesn't solve all the problems though. GCC is in general quite bad at branch
> layouts especially wrt. To branch distances. For instance BB rotation doesn't take distance into
> account. And TBZ, CBZ, B have different ranges.  Since the distance isn't taken into account we
> end up "optimizing" the branch and then at codegen doing an emergency distance enlargement
> using a TST + B to replace whatever we optimized too.

Hmm, true.

> LLVM does much better in all of these scenarios, so it's likely that the entire branch strategy needs
> an overhaul.

I suppose I'd better not look for fear of contamination, but it'd
be interesting to know whether they handle it by using heuristics
to predict distance early, or whether they're better at late fixups.

I also notice that LLVM prefers an AND with 1 over an AND with 255
when extending a bool to a wider mode.  I wonder how much that would
help, if at all.  There are obviously advantages and disadvantages
both ways: using AND with 1 encourages TBZs, whereas using AND with
255 is a natural zero extension (and so can be elided if the source
is spilled).

It might also help to have better nonzero bits tracking in RTL (as a
service to all RTL passes, rather than something internal to combine).
ISTR Segher was looking at that at one point.

Thanks,
Richard

> https://godbolt.org/z/1dhf7T5he  shows some of the mini examples.  I can keep trying to make a
> reproducer for what my patch changes if you'd like, but I don't think it will be small..  The general
> problem is the same as in f2 but we introduce it in the cases of f3 as well (bool), but adding & to a bool
> will get it trivially optimized out.
>
> Thanks,
> Tamar
>
>> 
>> Thanks,
>> Richard
>> 
>> > --- inline copy of patch --
>> > diff --git a/gcc/config/aarch64/aarch64.md
>> > b/gcc/config/aarch64/aarch64.md index
>> >
>> 2c1367977a68fc8e4289118e07bb61398856791e..aa09e93d85e9628e8944e0349
>> 869
>> > 7eb9597ef867 100644
>> > --- a/gcc/config/aarch64/aarch64.md
>> > +++ b/gcc/config/aarch64/aarch64.md
>> > @@ -949,8 +949,8 @@ (define_insn "*cb<optab><mode>1"
>> >
>> >  (define_expand "tbranch_<code><mode>3"
>> >    [(set (pc) (if_then_else
>> > -              (EQL (match_operand:ALLI 0 "register_operand")
>> > -                   (match_operand 1 "aarch64_simd_shift_imm_<mode>"))
>> > +              (EQL (match_operand:SHORT 0 "register_operand")
>> > +                   (match_operand 1 "const0_operand"))
>> >                (label_ref (match_operand 2 ""))
>> >                (pc)))]
>> >    ""
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/tbz_2.c
>> > b/gcc/testsuite/gcc.target/aarch64/tbz_2.c
>> > new file mode 100644
>> > index
>> >
>> 0000000000000000000000000000000000000000..ec128b58f35276a7c5452685a6
>> 5c
>> > 73f95f2d5f9a
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/tbz_2.c
>> > @@ -0,0 +1,130 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-additional-options "-O2 -std=c99  -fno-unwind-tables
>> > +-fno-asynchronous-unwind-tables" } */
>> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
>> > +} */
>> > +
>> > +#include <stdbool.h>
>> > +
>> > +void h(void);
>> > +
>> > +/*
>> > +** g1:
>> > +** 	cbnz	w0, .L[0-9]+
>> > +** 	ret
>> > +** 	...
>> > +*/
>> > +void g1(int x)
>> > +{
>> > +  if (__builtin_expect (x, 0))
>> > +    h ();
>> > +}
>> > +
>> > +/*
>> > +** g2:
>> > +** 	tbnz	x0, 0, .L[0-9]+
>> > +** 	ret
>> > +** 	...
>> > +*/
>> > +void g2(int x)
>> > +{
>> > +  if (__builtin_expect (x & 1, 0))
>> > +    h ();
>> > +}
>> > +
>> > +/*
>> > +** g3:
>> > +** 	tbnz	x0, 3, .L[0-9]+
>> > +** 	ret
>> > +** 	...
>> > +*/
>> > +void g3(int x)
>> > +{
>> > +  if (__builtin_expect (x & 8, 0))
>> > +    h ();
>> > +}
>> > +
>> > +/*
>> > +** g4:
>> > +** 	tbnz	w0, #31, .L[0-9]+
>> > +** 	ret
>> > +** 	...
>> > +*/
>> > +void g4(int x)
>> > +{
>> > +  if (__builtin_expect (x & (1 << 31), 0))
>> > +    h ();
>> > +}
>> > +
>> > +/*
>> > +** g5:
>> > +** 	tst	w0, 255
>> > +** 	bne	.L[0-9]+
>> > +** 	ret
>> > +** 	...
>> > +*/
>> > +void g5(char x)
>> > +{
>> > +  if (__builtin_expect (x, 0))
>> > +    h ();
>> > +}
>> > +
>> > +/*
>> > +** g6:
>> > +** 	tbnz	w0, 0, .L[0-9]+
>> > +** 	ret
>> > +** 	...
>> > +*/
>> > +void g6(char x)
>> > +{
>> > +  if (__builtin_expect (x & 1, 0))
>> > +    h ();
>> > +}
>> > +
>> > +/*
>> > +** g7:
>> > +** 	tst	w0, 3
>> > +** 	bne	.L[0-9]+
>> > +** 	ret
>> > +** 	...
>> > +*/
>> > +void g7(char x)
>> > +{
>> > +  if (__builtin_expect (x & 3, 0))
>> > +    h ();
>> > +}
>> > +
>> > +/*
>> > +** g8:
>> > +** 	tbnz	w0, 7, .L[0-9]+
>> > +** 	ret
>> > +** 	...
>> > +*/
>> > +void g8(char x)
>> > +{
>> > +  if (__builtin_expect (x & (1 << 7), 0))
>> > +    h ();
>> > +}
>> > +
>> > +/*
>> > +** g9:
>> > +** 	tbnz	w0, 0, .L[0-9]+
>> > +** 	ret
>> > +** 	...
>> > +*/
>> > +void g9(bool x)
>> > +{
>> > +  if (__builtin_expect (x, 0))
>> > +    h ();
>> > +}
>> > +
>> > +/*
>> > +** g10:
>> > +** 	tbnz	w0, 0, .L[0-9]+
>> > +** 	ret
>> > +** 	...
>> > +*/
>> > +void g10(bool x)
>> > +{
>> > +  if (__builtin_expect (x & 1, 0))
>> > +    h ();
>> > +}
>> > +
diff mbox series

Patch

--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -949,8 +949,8 @@  (define_insn "*cb<optab><mode>1"
 
 (define_expand "tbranch_<code><mode>3"
   [(set (pc) (if_then_else
-              (EQL (match_operand:ALLI 0 "register_operand")
-                   (match_operand 1 "aarch64_simd_shift_imm_<mode>"))
+              (EQL (match_operand:SHORT 0 "register_operand")
+                   (match_operand 1 "const0_operand"))
               (label_ref (match_operand 2 ""))
               (pc)))]
   ""
diff --git a/gcc/testsuite/gcc.target/aarch64/tbz_2.c b/gcc/testsuite/gcc.target/aarch64/tbz_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..ec128b58f35276a7c5452685a65c73f95f2d5f9a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/tbz_2.c
@@ -0,0 +1,130 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99  -fno-unwind-tables -fno-asynchronous-unwind-tables" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdbool.h>
+
+void h(void);
+
+/*
+** g1:
+** 	cbnz	w0, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g1(int x)
+{
+  if (__builtin_expect (x, 0))
+    h ();
+}
+
+/* 
+** g2:
+** 	tbnz	x0, 0, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g2(int x)
+{
+  if (__builtin_expect (x & 1, 0))
+    h ();
+}
+
+/* 
+** g3:
+** 	tbnz	x0, 3, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g3(int x)
+{
+  if (__builtin_expect (x & 8, 0))
+    h ();
+}
+
+/* 
+** g4:
+** 	tbnz	w0, #31, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g4(int x)
+{
+  if (__builtin_expect (x & (1 << 31), 0))
+    h ();
+}
+
+/* 
+** g5:
+** 	tst	w0, 255
+** 	bne	.L[0-9]+
+** 	ret
+** 	...
+*/
+void g5(char x)
+{
+  if (__builtin_expect (x, 0))
+    h ();
+}
+
+/* 
+** g6:
+** 	tbnz	w0, 0, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g6(char x)
+{
+  if (__builtin_expect (x & 1, 0))
+    h ();
+}
+
+/* 
+** g7:
+** 	tst	w0, 3
+** 	bne	.L[0-9]+
+** 	ret
+** 	...
+*/
+void g7(char x)
+{
+  if (__builtin_expect (x & 3, 0))
+    h ();
+}
+
+/* 
+** g8:
+** 	tbnz	w0, 7, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g8(char x)
+{
+  if (__builtin_expect (x & (1 << 7), 0))
+    h ();
+}
+
+/* 
+** g9:
+** 	tbnz	w0, 0, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g9(bool x)
+{
+  if (__builtin_expect (x, 0))
+    h ();
+}
+
+/* 
+** g10:
+** 	tbnz	w0, 0, .L[0-9]+
+** 	ret
+** 	...
+*/
+void g10(bool x)
+{
+  if (__builtin_expect (x & 1, 0))
+    h ();
+}
+