diff mbox

[AArch64] Improve Cortex-A53 shift bypass

Message ID AM5PR0802MB26102C1AB3F6B5364538F4E883100@AM5PR0802MB2610.eurprd08.prod.outlook.com
State New
Headers show

Commit Message

Wilco Dijkstra April 27, 2017, 5:38 p.m. UTC
The aarch_forward_to_shift_is_not_shifted_reg bypass always returns true
on AArch64 shifted instructions.  This causes the bypass to activate in
too many cases, resulting in slower execution on Cortex-A53 like reported
in PR79665.

This patch uses the arm_no_early_alu_shift_dep condition instead which
improves the example in PR79665 by ~7%.  Given it is no longer used,
remove aarch_forward_to_shift_is_not_shifted_reg.

Passes AArch64 bootstrap and regress. OK for commit?

ChangeLog:
2017-04-27  Wilco Dijkstra  <wdijkstr@arm.com>

	PR target/79665
	* config/arm/aarch-common.c (arm_no_early_alu_shift_dep):
	Remove redundant if.
	(aarch_forward_to_shift_is_not_shifted_reg): Remove.
	* config/arm/aarch-common-protos.h
	(aarch_forward_to_shift_is_not_shifted_re): Remove.
	* config/arm/cortex-a53.md: Use arm_no_early_alu_shift_dep in bypass.

--

Comments

Richard Earnshaw (lists) May 5, 2017, 3:34 p.m. UTC | #1
On 27/04/17 18:38, Wilco Dijkstra wrote:
> The aarch_forward_to_shift_is_not_shifted_reg bypass always returns true
> on AArch64 shifted instructions.  This causes the bypass to activate in
> too many cases, resulting in slower execution on Cortex-A53 like reported
> in PR79665.
> 
> This patch uses the arm_no_early_alu_shift_dep condition instead which
> improves the example in PR79665 by ~7%.  Given it is no longer used,
> remove aarch_forward_to_shift_is_not_shifted_reg.
> 
> Passes AArch64 bootstrap and regress. OK for commit?
> 
> ChangeLog:
> 2017-04-27  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	PR target/79665
> 	* config/arm/aarch-common.c (arm_no_early_alu_shift_dep):
> 	Remove redundant if.
> 	(aarch_forward_to_shift_is_not_shifted_reg): Remove.
> 	* config/arm/aarch-common-protos.h
> 	(aarch_forward_to_shift_is_not_shifted_re): Remove.
> 	* config/arm/cortex-a53.md: Use arm_no_early_alu_shift_dep in bypass.
> 
> --
> 
> diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
> index 7c2bb4c2ed93728efcbd9e2811c09dddd04b37fe..4350d975abbbbd2cda55ac31e0d47971b40fcde5 100644
> --- a/gcc/config/arm/aarch-common-protos.h
> +++ b/gcc/config/arm/aarch-common-protos.h
> @@ -25,7 +25,6 @@
>  
>  extern int aarch_accumulator_forwarding (rtx_insn *, rtx_insn *);
>  extern int aarch_crypto_can_dual_issue (rtx_insn *, rtx_insn *);
> -extern int aarch_forward_to_shift_is_not_shifted_reg (rtx_insn *, rtx_insn *);
>  extern bool aarch_rev16_p (rtx);
>  extern bool aarch_rev16_shleft_mask_imm_p (rtx, machine_mode);
>  extern bool aarch_rev16_shright_mask_imm_p (rtx, machine_mode);
> diff --git a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c
> index 742d2ff4c7b779ae07b92f8a800e4667e32c44fb..9da2e382b2a1ecabd56acccc57997dbf626da513 100644
> --- a/gcc/config/arm/aarch-common.c
> +++ b/gcc/config/arm/aarch-common.c
> @@ -254,12 +254,7 @@ arm_no_early_alu_shift_dep (rtx producer, rtx consumer)
>      return 0;
>  
>    if ((early_op = arm_find_shift_sub_rtx (op)))
> -    {
> -      if (REG_P (early_op))
> -	early_op = op;
> -
> -      return !reg_overlap_mentioned_p (value, early_op);
> -    }
> +    return !reg_overlap_mentioned_p (value, early_op);
>  
>    return 0;
>  }

This function is used by several aarch32 pipeline description models.
What testing have you given it there.  Are the changes appropriate for
those cores as well?

R.

> @@ -472,38 +467,6 @@ aarch_accumulator_forwarding (rtx_insn *producer, rtx_insn *consumer)
>    return (REGNO (dest) == REGNO (accumulator));
>  }
>  
> -/* Return nonzero if the CONSUMER instruction is some sort of
> -   arithmetic or logic + shift operation, and the register we are
> -   writing in PRODUCER is not used in a register shift by register
> -   operation.  */
> -
> -int
> -aarch_forward_to_shift_is_not_shifted_reg (rtx_insn *producer,
> -					   rtx_insn *consumer)
> -{
> -  rtx value, op;
> -  rtx early_op;
> -
> -  if (!arm_get_set_operands (producer, consumer, &value, &op))
> -    return 0;
> -
> -  if ((early_op = arm_find_shift_sub_rtx (op)))
> -    {
> -      if (REG_P (early_op))
> -	early_op = op;
> -
> -      /* Any other canonicalisation of a shift is a shift-by-constant
> -	 so we don't care.  */
> -      if (GET_CODE (early_op) == ASHIFT)
> -	return (!REG_P (XEXP (early_op, 0))
> -		|| !REG_P (XEXP (early_op, 1)));
> -      else
> -	return 1;
> -    }
> -
> -  return 0;
> -}
> -
>  /* Return non-zero if the consumer (a multiply-accumulate instruction)
>     has an accumulator dependency on the result of the producer (a
>     multiplication instruction) and no other dependency on that result.  */
> diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md
> index 7cf5fc5a0cd1d59efd0be3310b78303018138547..5bd0e62a108241ca56b01315908426e3f095fa81 100644
> --- a/gcc/config/arm/cortex-a53.md
> +++ b/gcc/config/arm/cortex-a53.md
> @@ -211,7 +211,7 @@ (define_bypass 1 "cortex_a53_alu*"
>  
>  (define_bypass 1 "cortex_a53_alu*"
>  		 "cortex_a53_alu_shift*"
> -		 "aarch_forward_to_shift_is_not_shifted_reg")
> +		 "arm_no_early_alu_shift_dep")
>  
>  (define_bypass 2 "cortex_a53_alu*"
>  		 "cortex_a53_alu_*,cortex_a53_shift*")
>
Wilco Dijkstra May 5, 2017, 4:02 p.m. UTC | #2
Richard Earnshaw (lists) wrote:

> --- a/gcc/config/arm/aarch-common.c
> +++ b/gcc/config/arm/aarch-common.c
> @@ -254,12 +254,7 @@ arm_no_early_alu_shift_dep (rtx producer, rtx consumer)
>      return 0;

>    if ((early_op = arm_find_shift_sub_rtx (op)))
> -    {
> -      if (REG_P (early_op))
> -     early_op = op;
> -
> -      return !reg_overlap_mentioned_p (value, early_op);
> -    }
> +    return !reg_overlap_mentioned_p (value, early_op);

>    return 0;
>  }

> This function is used by several aarch32 pipeline description models.
> What testing have you given it there.  Are the changes appropriate for
> those cores as well?

arm_find_shift_sub_rtx can only ever return NULL_RTX or a shift rtx, so the
check for REG_P is dead code. Bootstrap passes on ARM too of course.

Wilco
Wilco Dijkstra June 13, 2017, 1:57 p.m. UTC | #3
ping
    
Richard Earnshaw (lists) wrote:

> --- a/gcc/config/arm/aarch-common.c
> +++ b/gcc/config/arm/aarch-common.c
> @@ -254,12 +254,7 @@ arm_no_early_alu_shift_dep (rtx producer, rtx consumer)
>      return 0;

>    if ((early_op = arm_find_shift_sub_rtx (op)))
> -    {
> -      if (REG_P (early_op))
> -     early_op = op;
> -
> -      return !reg_overlap_mentioned_p (value, early_op);
> -    }
> +    return !reg_overlap_mentioned_p (value, early_op);

>    return 0;
>  }

> This function is used by several aarch32 pipeline description models.
> What testing have you given it there.  Are the changes appropriate for
> those cores as well?

arm_find_shift_sub_rtx can only ever return NULL_RTX or a shift rtx, so the
check for REG_P is dead code. Bootstrap passes on ARM too of course.

Wilco
James Greenhalgh June 14, 2017, 1:55 p.m. UTC | #4
On Fri, May 05, 2017 at 05:02:46PM +0100, Wilco Dijkstra wrote:
> Richard Earnshaw (lists) wrote:
> 
> > --- a/gcc/config/arm/aarch-common.c
> > +++ b/gcc/config/arm/aarch-common.c
> > @@ -254,12 +254,7 @@ arm_no_early_alu_shift_dep (rtx producer, rtx consumer)
> >      return 0;
> >  
> >    if ((early_op = arm_find_shift_sub_rtx (op)))
> > -    {
> > -      if (REG_P (early_op))
> > -     early_op = op;
> > -
> > -      return !reg_overlap_mentioned_p (value, early_op);
> > -    }
> > +    return !reg_overlap_mentioned_p (value, early_op);
> >  
> >    return 0;
> >  }
> 
> > This function is used by several aarch32 pipeline description models.
> > What testing have you given it there.  Are the changes appropriate for
> > those cores as well?
> 
> arm_find_shift_sub_rtx can only ever return NULL_RTX or a shift rtx, so the
> check for REG_P is dead code. Bootstrap passes on ARM too of course.

This took me a bit of head-scratching to get right...

arm_find_shift_sub_rtx calls arm_find_sub_rtx_with_code, looking for
ASHIFT, with find_any_shift set to TRUE. There, we're going to run
through the subRTX of pattern, if the code of the subrtx is one of the
shift-like patterns, we return X, otherwise we return NULL_RTX.

Thus 

> > -      if (REG_P (early_op))
> > -     early_op = op;

is not needed, and the code can be reduced to:

  if ((early_op = arm_find_shift_sub_rtx (op)))
    return !reg_overlap_mentioned_p (value, early_op);
  return 0;

So, this looks fine to me from an AArch64 perspective - but you'll need an
ARM OK too as this is shared code.

Cheers,
James
Wilco Dijkstra June 27, 2017, 3:40 p.m. UTC | #5
ping

    
On Fri, May 05, 2017 at 05:02:46PM +0100, Wilco Dijkstra wrote:
> Richard Earnshaw (lists) wrote:
> 
> > --- a/gcc/config/arm/aarch-common.c
> > +++ b/gcc/config/arm/aarch-common.c
> > @@ -254,12 +254,7 @@ arm_no_early_alu_shift_dep (rtx producer, rtx consumer)
> >      return 0;
> >  
> >    if ((early_op = arm_find_shift_sub_rtx (op)))
> > -    {
> > -      if (REG_P (early_op))
> > -     early_op = op;
> > -
> > -      return !reg_overlap_mentioned_p (value, early_op);
> > -    }
> > +    return !reg_overlap_mentioned_p (value, early_op);
> >  
> >    return 0;
> >  }
> 
> > This function is used by several aarch32 pipeline description models.
> > What testing have you given it there.  Are the changes appropriate for
> > those cores as well?
> 
> arm_find_shift_sub_rtx can only ever return NULL_RTX or a shift rtx, so the
> check for REG_P is dead code. Bootstrap passes on ARM too of course.

This took me a bit of head-scratching to get right...

arm_find_shift_sub_rtx calls arm_find_sub_rtx_with_code, looking for
ASHIFT, with find_any_shift set to TRUE. There, we're going to run
through the subRTX of pattern, if the code of the subrtx is one of the
shift-like patterns, we return X, otherwise we return NULL_RTX.

Thus 

> > -      if (REG_P (early_op))
> > -     early_op = op;

is not needed, and the code can be reduced to:

  if ((early_op = arm_find_shift_sub_rtx (op)))
    return !reg_overlap_mentioned_p (value, early_op);
  return 0;

So, this looks fine to me from an AArch64 perspective - but you'll need an
ARM OK too as this is shared code.

Cheers,
James
Ramana Radhakrishnan June 27, 2017, 4:33 p.m. UTC | #6
On Wed, Jun 14, 2017 at 2:55 PM, James Greenhalgh
<james.greenhalgh@arm.com> wrote:
> On Fri, May 05, 2017 at 05:02:46PM +0100, Wilco Dijkstra wrote:
>> Richard Earnshaw (lists) wrote:
>>
>> > --- a/gcc/config/arm/aarch-common.c
>> > +++ b/gcc/config/arm/aarch-common.c
>> > @@ -254,12 +254,7 @@ arm_no_early_alu_shift_dep (rtx producer, rtx consumer)
>> >      return 0;
>> >
>> >    if ((early_op = arm_find_shift_sub_rtx (op)))
>> > -    {
>> > -      if (REG_P (early_op))
>> > -     early_op = op;
>> > -
>> > -      return !reg_overlap_mentioned_p (value, early_op);
>> > -    }
>> > +    return !reg_overlap_mentioned_p (value, early_op);
>> >
>> >    return 0;
>> >  }
>>
>> > This function is used by several aarch32 pipeline description models.
>> > What testing have you given it there.  Are the changes appropriate for
>> > those cores as well?
>>
>> arm_find_shift_sub_rtx can only ever return NULL_RTX or a shift rtx, so the
>> check for REG_P is dead code. Bootstrap passes on ARM too of course.
>
> This took me a bit of head-scratching to get right...
>
> arm_find_shift_sub_rtx calls arm_find_sub_rtx_with_code, looking for
> ASHIFT, with find_any_shift set to TRUE. There, we're going to run
> through the subRTX of pattern, if the code of the subrtx is one of the
> shift-like patterns, we return X, otherwise we return NULL_RTX.
>
> Thus
>
>> > -      if (REG_P (early_op))
>> > -     early_op = op;
>
> is not needed, and the code can be reduced to:
>
>   if ((early_op = arm_find_shift_sub_rtx (op)))
>     return !reg_overlap_mentioned_p (value, early_op);
>   return 0;
>
> So, this looks fine to me from an AArch64 perspective - but you'll need an
> ARM OK too as this is shared code.


I'm about to run home for the day but this came in from
https://gcc.gnu.org/ml/gcc-patches/2013-09/msg02109.html and James
said in that email that this was put in to ensure no segfaults on
cortex-a15 / cortex-a7 tuning.

I'll try and look at it later this week.




Ramana

>
> Cheers,
> James
>
Wilco Dijkstra June 28, 2017, 12:49 p.m. UTC | #7
Ramana Radhakrishnan wrote:

> I'm about to run home for the day but this came in from
> https://gcc.gnu.org/ml/gcc-patches/2013-09/msg02109.html and James
> said in that email that this was put in to ensure no segfaults on
> cortex-a15 / cortex-a7 tuning.

The code is historical - an older version didn't specifically look just for 
shifts. Given we can only return shift rtx's now it's completely redundant.
A bootstrap on armhf with --with-cpu=cortex-a15 passed.

Wilco
Ramana Radhakrishnan June 28, 2017, 1:03 p.m. UTC | #8
On 6/28/17 1:49 PM, Wilco Dijkstra wrote:
> Ramana Radhakrishnan wrote:
>>      
>> I'm about to run home for the day but this came in from
>> https://gcc.gnu.org/ml/gcc-patches/2013-09/msg02109.html and James
>> said in that email that this was put in to ensure no segfaults on
>> cortex-a15 / cortex-a7 tuning.
> 
> The code is historical - an older version didn't specifically look just for
> shifts. Given we can only return shift rtx's now it's completely redundant.
> A bootstrap on armhf with --with-cpu=cortex-a15 passed.
> 

In which case - ok . thanks,

regards
Ramana

> Wilco
>      
>
James Greenhalgh June 28, 2017, 1:21 p.m. UTC | #9
On Wed, Jun 28, 2017 at 01:49:26PM +0100, Wilco Dijkstra wrote:
> Ramana Radhakrishnan wrote:
> >    
> > I'm about to run home for the day but this came in from
> > https://gcc.gnu.org/ml/gcc-patches/2013-09/msg02109.html and James
> > said in that email that this was put in to ensure no segfaults on
> > cortex-a15 / cortex-a7 tuning.
> 
> The code is historical - an older version didn't specifically look just for 
> shifts. Given we can only return shift rtx's now it's completely redundant.
> A bootstrap on armhf with --with-cpu=cortex-a15 passed.

The final commit was:

  https://gcc.gnu.org/ml/gcc-patches/2013-11/msg00533.html

That ends up being a more involved rewrite of the code in
arm_no_early_alu_shift_dep .

The check Wilco deletes was in their before the clean-up. Previosuly we
were manually walking the RTX for the consumer, assuming that after
stripping COND_EXEC and PARALLEL we'd have either:

  OP = (some_shift (register) (*))

or

  OP = (some_arithmetic_code (some_shift (register) (*)) (*))

That is, we're either looking at a shift, or an arithmetic operation that
contains a shift.

The early_op = XEXP (op, 0); would give you either:

  EARLY_OP = (regiser)

or

  EARLY_OP = (some_shift (register) (*))

But, we're interested in getting to the whole shift. So, we check if we
are now looking at a register, and if we are, use op instead:

 if (REG_P (early_op))
   early_op = op;

So we end up with either:

  EARLY_OP = (some_shift (register) (*))

or

  EARLY_OP = (some_shift (register) (*))

Which is exactly what we wanted.

After the refactoring, arm_find_shift_sub_rtx will always return you
(some_shift (*) (*)) - that's good we're now more resilient, and always
operate on a full shift, but it does mean the check for REG_P can never
match. I wrote this a fair while ago, but it looks like a simple oversight.

So, this code is safely dead and can be cleaned up as Wilco suggests with
no issue.

Hope that helps!

Thanks,
James
diff mbox

Patch

diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 7c2bb4c2ed93728efcbd9e2811c09dddd04b37fe..4350d975abbbbd2cda55ac31e0d47971b40fcde5 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -25,7 +25,6 @@ 
 
 extern int aarch_accumulator_forwarding (rtx_insn *, rtx_insn *);
 extern int aarch_crypto_can_dual_issue (rtx_insn *, rtx_insn *);
-extern int aarch_forward_to_shift_is_not_shifted_reg (rtx_insn *, rtx_insn *);
 extern bool aarch_rev16_p (rtx);
 extern bool aarch_rev16_shleft_mask_imm_p (rtx, machine_mode);
 extern bool aarch_rev16_shright_mask_imm_p (rtx, machine_mode);
diff --git a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c
index 742d2ff4c7b779ae07b92f8a800e4667e32c44fb..9da2e382b2a1ecabd56acccc57997dbf626da513 100644
--- a/gcc/config/arm/aarch-common.c
+++ b/gcc/config/arm/aarch-common.c
@@ -254,12 +254,7 @@  arm_no_early_alu_shift_dep (rtx producer, rtx consumer)
     return 0;
 
   if ((early_op = arm_find_shift_sub_rtx (op)))
-    {
-      if (REG_P (early_op))
-	early_op = op;
-
-      return !reg_overlap_mentioned_p (value, early_op);
-    }
+    return !reg_overlap_mentioned_p (value, early_op);
 
   return 0;
 }
@@ -472,38 +467,6 @@  aarch_accumulator_forwarding (rtx_insn *producer, rtx_insn *consumer)
   return (REGNO (dest) == REGNO (accumulator));
 }
 
-/* Return nonzero if the CONSUMER instruction is some sort of
-   arithmetic or logic + shift operation, and the register we are
-   writing in PRODUCER is not used in a register shift by register
-   operation.  */
-
-int
-aarch_forward_to_shift_is_not_shifted_reg (rtx_insn *producer,
-					   rtx_insn *consumer)
-{
-  rtx value, op;
-  rtx early_op;
-
-  if (!arm_get_set_operands (producer, consumer, &value, &op))
-    return 0;
-
-  if ((early_op = arm_find_shift_sub_rtx (op)))
-    {
-      if (REG_P (early_op))
-	early_op = op;
-
-      /* Any other canonicalisation of a shift is a shift-by-constant
-	 so we don't care.  */
-      if (GET_CODE (early_op) == ASHIFT)
-	return (!REG_P (XEXP (early_op, 0))
-		|| !REG_P (XEXP (early_op, 1)));
-      else
-	return 1;
-    }
-
-  return 0;
-}
-
 /* Return non-zero if the consumer (a multiply-accumulate instruction)
    has an accumulator dependency on the result of the producer (a
    multiplication instruction) and no other dependency on that result.  */
diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md
index 7cf5fc5a0cd1d59efd0be3310b78303018138547..5bd0e62a108241ca56b01315908426e3f095fa81 100644
--- a/gcc/config/arm/cortex-a53.md
+++ b/gcc/config/arm/cortex-a53.md
@@ -211,7 +211,7 @@  (define_bypass 1 "cortex_a53_alu*"
 
 (define_bypass 1 "cortex_a53_alu*"
 		 "cortex_a53_alu_shift*"
-		 "aarch_forward_to_shift_is_not_shifted_reg")
+		 "arm_no_early_alu_shift_dep")
 
 (define_bypass 2 "cortex_a53_alu*"
 		 "cortex_a53_alu_*,cortex_a53_shift*")