diff mbox series

aarch64: Improve PERM<{0}, a, ...> (64bit) by adding whole vector shift right [PR113872]

Message ID 20240211221556.3121555-1-quic_apinski@quicinc.com
State New
Headers show
Series aarch64: Improve PERM<{0}, a, ...> (64bit) by adding whole vector shift right [PR113872] | expand

Commit Message

Andrew Pinski Feb. 11, 2024, 10:15 p.m. UTC
The backend currently defines a whole vector shift left for 64bit vectors, adding the
shift right can also improve code for some PERMs too. So this adds that pattern.

I added a testcase for the shift left also. I also fixed the instruction template
there which was using a space instead of a tab after the instruction.

Built and tested on aarch64-linux-gnu.

	PR target/113872

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (vec_shr_<mode><vczle><vczbe>): Use tab instead of space after
	the instruction in the template.
	(vec_shl_<mode><vczle><vczbe>): New pattern
	* config/aarch64/iterators.md (unspec): Add UNSPEC_VEC_SHL

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/perm_zero-1.c: New test.
	* gcc.target/aarch64/perm_zero-2.c: New test.

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
---
 gcc/config/aarch64/aarch64-simd.md             | 18 ++++++++++++++++--
 gcc/config/aarch64/iterators.md                |  1 +
 gcc/testsuite/gcc.target/aarch64/perm_zero-1.c | 15 +++++++++++++++
 gcc/testsuite/gcc.target/aarch64/perm_zero-2.c | 15 +++++++++++++++
 4 files changed, 47 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/perm_zero-2.c

Comments

Richard Sandiford Feb. 15, 2024, 2:55 p.m. UTC | #1
Andrew Pinski <quic_apinski@quicinc.com> writes:
> The backend currently defines a whole vector shift left for 64bit vectors, adding the
> shift right can also improve code for some PERMs too. So this adds that pattern.

Is this reversed?  It looks like we have the shift right and the patch is
adding the shift left (at least in GCC internal and little-endian terms).

But on many Arm cores, EXT has a higher throughput than SHL, so I don't think
we should do this unconditionally.

Thanks,
Richard

>
> I added a testcase for the shift left also. I also fixed the instruction template
> there which was using a space instead of a tab after the instruction.
>
> Built and tested on aarch64-linux-gnu.
>
> 	PR target/113872
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-simd.md (vec_shr_<mode><vczle><vczbe>): Use tab instead of space after
> 	the instruction in the template.
> 	(vec_shl_<mode><vczle><vczbe>): New pattern
> 	* config/aarch64/iterators.md (unspec): Add UNSPEC_VEC_SHL
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/perm_zero-1.c: New test.
> 	* gcc.target/aarch64/perm_zero-2.c: New test.
>
> Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64-simd.md             | 18 ++++++++++++++++--
>  gcc/config/aarch64/iterators.md                |  1 +
>  gcc/testsuite/gcc.target/aarch64/perm_zero-1.c | 15 +++++++++++++++
>  gcc/testsuite/gcc.target/aarch64/perm_zero-2.c | 15 +++++++++++++++
>  4 files changed, 47 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/perm_zero-2.c
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index f8bb973a278..0d2f1ea3902 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -1592,9 +1592,23 @@ (define_insn "vec_shr_<mode><vczle><vczbe>"
>    "TARGET_SIMD"
>    {
>      if (BYTES_BIG_ENDIAN)
> -      return "shl %d0, %d1, %2";
> +      return "shl\t%d0, %d1, %2";
>      else
> -      return "ushr %d0, %d1, %2";
> +      return "ushr\t%d0, %d1, %2";
> +  }
> +  [(set_attr "type" "neon_shift_imm")]
> +)
> +(define_insn "vec_shl_<mode><vczle><vczbe>"
> +  [(set (match_operand:VD 0 "register_operand" "=w")
> +        (unspec:VD [(match_operand:VD 1 "register_operand" "w")
> +		    (match_operand:SI 2 "immediate_operand" "i")]
> +		   UNSPEC_VEC_SHL))]
> +  "TARGET_SIMD"
> +  {
> +    if (BYTES_BIG_ENDIAN)
> +      return "ushr\t%d0, %d1, %2";
> +    else
> +      return "shl\t%d0, %d1, %2";
>    }
>    [(set_attr "type" "neon_shift_imm")]
>  )
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 99cde46f1ba..3aebe9cf18a 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -758,6 +758,7 @@ (define_c_enum "unspec"
>      UNSPEC_PMULL        ; Used in aarch64-simd.md.
>      UNSPEC_PMULL2       ; Used in aarch64-simd.md.
>      UNSPEC_REV_REGLIST  ; Used in aarch64-simd.md.
> +    UNSPEC_VEC_SHL      ; Used in aarch64-simd.md.
>      UNSPEC_VEC_SHR      ; Used in aarch64-simd.md.
>      UNSPEC_SQRDMLAH     ; Used in aarch64-simd.md.
>      UNSPEC_SQRDMLSH     ; Used in aarch64-simd.md.
> diff --git a/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c b/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
> new file mode 100644
> index 00000000000..3c8f0591a2f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2"  } */
> +/* PR target/113872 */
> +/* For 64bit vectors, PERM with a constant 0 should produce a shift instead of the ext instruction. */
> +
> +#define vect64 __attribute__((vector_size(8)))
> +
> +void f(vect64  unsigned short *a)
> +{
> +  *a = __builtin_shufflevector((vect64 unsigned short){0},*a, 3,4,5,6);
> +}
> +
> +/* { dg-final { scan-assembler-times "ushr\t" 1 { target aarch64_big_endian } } } */
> +/* { dg-final { scan-assembler-times "shl\t" 1 { target aarch64_little_endian } } } */
> +/* { dg-final { scan-assembler-not "ext\t"  } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/perm_zero-2.c b/gcc/testsuite/gcc.target/aarch64/perm_zero-2.c
> new file mode 100644
> index 00000000000..970e428f832
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/perm_zero-2.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2"  } */
> +/* PR target/113872 */
> +/* For 64bit vectors, PERM with a constant 0 should produce a shift instead of the ext instruction. */
> +
> +#define vect64 __attribute__((vector_size(8)))
> +
> +void f(vect64  unsigned short *a)
> +{
> +  *a = __builtin_shufflevector(*a, (vect64 unsigned short){0},3,4,5,6);
> +}
> +
> +/* { dg-final { scan-assembler-times "shl\t" 1 { target aarch64_big_endian } } } */
> +/* { dg-final { scan-assembler-times "ushr\t" 1 { target aarch64_little_endian } } } */
> +/* { dg-final { scan-assembler-not "ext\t"  } } */
Tamar Christina Feb. 15, 2024, 4:27 p.m. UTC | #2
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Thursday, February 15, 2024 2:56 PM
> To: Andrew Pinski <quic_apinski@quicinc.com>
> Cc: gcc-patches@gcc.gnu.org; Tamar Christina <Tamar.Christina@arm.com>
> Subject: Re: [PATCH] aarch64: Improve PERM<{0}, a, ...> (64bit) by adding whole
> vector shift right [PR113872]
> 
> Andrew Pinski <quic_apinski@quicinc.com> writes:
> > The backend currently defines a whole vector shift left for 64bit vectors, adding
> the
> > shift right can also improve code for some PERMs too. So this adds that pattern.
> 
> Is this reversed?  It looks like we have the shift right and the patch is
> adding the shift left (at least in GCC internal and little-endian terms).
> 
> But on many Arm cores, EXT has a higher throughput than SHL, so I don't think
> we should do this unconditionally.

Yeah, on most (if not all) all Arm cores the EXT has higher throughput than SHL
and on Cortex-A75 the EXT has both higher throughput and lower latency.

I guess the expected gain here is that we wouldn't need to create the zero vector,
However on modern Arm cores the zero vector creation is free using movi and EXT
being three operand also means we only need one copy if e.g in a loop.

Kind Regards,
Tamar

> 
> Thanks,
> Richard
> 
> >
> > I added a testcase for the shift left also. I also fixed the instruction template
> > there which was using a space instead of a tab after the instruction.
> >
> > Built and tested on aarch64-linux-gnu.
> >
> > 	PR target/113872
> >
> > gcc/ChangeLog:
> >
> > 	* config/aarch64/aarch64-simd.md (vec_shr_<mode><vczle><vczbe>):
> Use tab instead of space after
> > 	the instruction in the template.
> > 	(vec_shl_<mode><vczle><vczbe>): New pattern
> > 	* config/aarch64/iterators.md (unspec): Add UNSPEC_VEC_SHL
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.target/aarch64/perm_zero-1.c: New test.
> > 	* gcc.target/aarch64/perm_zero-2.c: New test.
> >
> > Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
> > ---
> >  gcc/config/aarch64/aarch64-simd.md             | 18 ++++++++++++++++--
> >  gcc/config/aarch64/iterators.md                |  1 +
> >  gcc/testsuite/gcc.target/aarch64/perm_zero-1.c | 15 +++++++++++++++
> >  gcc/testsuite/gcc.target/aarch64/perm_zero-2.c | 15 +++++++++++++++
> >  4 files changed, 47 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/perm_zero-2.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> > index f8bb973a278..0d2f1ea3902 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -1592,9 +1592,23 @@ (define_insn "vec_shr_<mode><vczle><vczbe>"
> >    "TARGET_SIMD"
> >    {
> >      if (BYTES_BIG_ENDIAN)
> > -      return "shl %d0, %d1, %2";
> > +      return "shl\t%d0, %d1, %2";
> >      else
> > -      return "ushr %d0, %d1, %2";
> > +      return "ushr\t%d0, %d1, %2";
> > +  }
> > +  [(set_attr "type" "neon_shift_imm")]
> > +)
> > +(define_insn "vec_shl_<mode><vczle><vczbe>"
> > +  [(set (match_operand:VD 0 "register_operand" "=w")
> > +        (unspec:VD [(match_operand:VD 1 "register_operand" "w")
> > +		    (match_operand:SI 2 "immediate_operand" "i")]
> > +		   UNSPEC_VEC_SHL))]
> > +  "TARGET_SIMD"
> > +  {
> > +    if (BYTES_BIG_ENDIAN)
> > +      return "ushr\t%d0, %d1, %2";
> > +    else
> > +      return "shl\t%d0, %d1, %2";
> >    }
> >    [(set_attr "type" "neon_shift_imm")]
> >  )
> > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> > index 99cde46f1ba..3aebe9cf18a 100644
> > --- a/gcc/config/aarch64/iterators.md
> > +++ b/gcc/config/aarch64/iterators.md
> > @@ -758,6 +758,7 @@ (define_c_enum "unspec"
> >      UNSPEC_PMULL        ; Used in aarch64-simd.md.
> >      UNSPEC_PMULL2       ; Used in aarch64-simd.md.
> >      UNSPEC_REV_REGLIST  ; Used in aarch64-simd.md.
> > +    UNSPEC_VEC_SHL      ; Used in aarch64-simd.md.
> >      UNSPEC_VEC_SHR      ; Used in aarch64-simd.md.
> >      UNSPEC_SQRDMLAH     ; Used in aarch64-simd.md.
> >      UNSPEC_SQRDMLSH     ; Used in aarch64-simd.md.
> > diff --git a/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
> b/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
> > new file mode 100644
> > index 00000000000..3c8f0591a2f
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
> > @@ -0,0 +1,15 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2"  } */
> > +/* PR target/113872 */
> > +/* For 64bit vectors, PERM with a constant 0 should produce a shift instead of
> the ext instruction. */
> > +
> > +#define vect64 __attribute__((vector_size(8)))
> > +
> > +void f(vect64  unsigned short *a)
> > +{
> > +  *a = __builtin_shufflevector((vect64 unsigned short){0},*a, 3,4,5,6);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "ushr\t" 1 { target aarch64_big_endian } } }
> */
> > +/* { dg-final { scan-assembler-times "shl\t" 1 { target aarch64_little_endian } } }
> */
> > +/* { dg-final { scan-assembler-not "ext\t"  } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/perm_zero-2.c
> b/gcc/testsuite/gcc.target/aarch64/perm_zero-2.c
> > new file mode 100644
> > index 00000000000..970e428f832
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/perm_zero-2.c
> > @@ -0,0 +1,15 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2"  } */
> > +/* PR target/113872 */
> > +/* For 64bit vectors, PERM with a constant 0 should produce a shift instead of
> the ext instruction. */
> > +
> > +#define vect64 __attribute__((vector_size(8)))
> > +
> > +void f(vect64  unsigned short *a)
> > +{
> > +  *a = __builtin_shufflevector(*a, (vect64 unsigned short){0},3,4,5,6);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "shl\t" 1 { target aarch64_big_endian } } } */
> > +/* { dg-final { scan-assembler-times "ushr\t" 1 { target aarch64_little_endian } }
> } */
> > +/* { dg-final { scan-assembler-not "ext\t"  } } */
Andrew Pinski Feb. 15, 2024, 5:05 p.m. UTC | #3
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Thursday, February 15, 2024 8:27 AM
> To: Richard Sandiford <Richard.Sandiford@arm.com>; Andrew Pinski (QUIC)
> <quic_apinski@quicinc.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: RE: [PATCH] aarch64: Improve PERM<{0}, a, ...> (64bit) by adding
> whole vector shift right [PR113872]
> 
> > -----Original Message-----
> > From: Richard Sandiford <richard.sandiford@arm.com>
> > Sent: Thursday, February 15, 2024 2:56 PM
> > To: Andrew Pinski <quic_apinski@quicinc.com>
> > Cc: gcc-patches@gcc.gnu.org; Tamar Christina <Tamar.Christina@arm.com>
> > Subject: Re: [PATCH] aarch64: Improve PERM<{0}, a, ...> (64bit) by
> > adding whole vector shift right [PR113872]
> >
> > Andrew Pinski <quic_apinski@quicinc.com> writes:
> > > The backend currently defines a whole vector shift left for 64bit
> > > vectors, adding
> > the
> > > shift right can also improve code for some PERMs too. So this adds that
> pattern.
> >
> > Is this reversed?  It looks like we have the shift right and the patch
> > is adding the shift left (at least in GCC internal and little-endian terms).
> >
> > But on many Arm cores, EXT has a higher throughput than SHL, so I
> > don't think we should do this unconditionally.
> 
> Yeah, on most (if not all) all Arm cores the EXT has higher throughput than SHL
> and on Cortex-A75 the EXT has both higher throughput and lower latency.
> 
> I guess the expected gain here is that we wouldn't need to create the zero
> vector, However on modern Arm cores the zero vector creation is free using
> movi and EXT being three operand also means we only need one copy if e.g in
> a loop.

That might be true on Arm's cores but that is not true on Quacom's cores which I am working with.
EXT and SHL have the same throughput and latency but movi is not free (in that it still not done in the renamer) and a register is definitely not free if there is huge register pressure.
So I think we will need to figure out a way to have this as a tuning mechanism.

Thanks,
Andrew Pinski

> 
> Kind Regards,
> Tamar
> 
> >
> > Thanks,
> > Richard
> >
> > >
> > > I added a testcase for the shift left also. I also fixed the
> > > instruction template there which was using a space instead of a tab after
> the instruction.
> > >
> > > Built and tested on aarch64-linux-gnu.
> > >
> > > 	PR target/113872
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* config/aarch64/aarch64-simd.md
> (vec_shr_<mode><vczle><vczbe>):
> > Use tab instead of space after
> > > 	the instruction in the template.
> > > 	(vec_shl_<mode><vczle><vczbe>): New pattern
> > > 	* config/aarch64/iterators.md (unspec): Add UNSPEC_VEC_SHL
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > 	* gcc.target/aarch64/perm_zero-1.c: New test.
> > > 	* gcc.target/aarch64/perm_zero-2.c: New test.
> > >
> > > Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
> > > ---
> > >  gcc/config/aarch64/aarch64-simd.md             | 18 ++++++++++++++++--
> > >  gcc/config/aarch64/iterators.md                |  1 +
> > >  gcc/testsuite/gcc.target/aarch64/perm_zero-1.c | 15 +++++++++++++++
> > > gcc/testsuite/gcc.target/aarch64/perm_zero-2.c | 15 +++++++++++++++
> > >  4 files changed, 47 insertions(+), 2 deletions(-)  create mode
> > > 100644 gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
> > >  create mode 100644 gcc/testsuite/gcc.target/aarch64/perm_zero-2.c
> > >
> > > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > > index f8bb973a278..0d2f1ea3902 100644
> > > --- a/gcc/config/aarch64/aarch64-simd.md
> > > +++ b/gcc/config/aarch64/aarch64-simd.md
> > > @@ -1592,9 +1592,23 @@ (define_insn
> "vec_shr_<mode><vczle><vczbe>"
> > >    "TARGET_SIMD"
> > >    {
> > >      if (BYTES_BIG_ENDIAN)
> > > -      return "shl %d0, %d1, %2";
> > > +      return "shl\t%d0, %d1, %2";
> > >      else
> > > -      return "ushr %d0, %d1, %2";
> > > +      return "ushr\t%d0, %d1, %2";
> > > +  }
> > > +  [(set_attr "type" "neon_shift_imm")]
> > > +)
> > > +(define_insn "vec_shl_<mode><vczle><vczbe>"
> > > +  [(set (match_operand:VD 0 "register_operand" "=w")
> > > +        (unspec:VD [(match_operand:VD 1 "register_operand" "w")
> > > +		    (match_operand:SI 2 "immediate_operand" "i")]
> > > +		   UNSPEC_VEC_SHL))]
> > > +  "TARGET_SIMD"
> > > +  {
> > > +    if (BYTES_BIG_ENDIAN)
> > > +      return "ushr\t%d0, %d1, %2";
> > > +    else
> > > +      return "shl\t%d0, %d1, %2";
> > >    }
> > >    [(set_attr "type" "neon_shift_imm")]
> > >  )
> > > diff --git a/gcc/config/aarch64/iterators.md
> > > b/gcc/config/aarch64/iterators.md index 99cde46f1ba..3aebe9cf18a
> > > 100644
> > > --- a/gcc/config/aarch64/iterators.md
> > > +++ b/gcc/config/aarch64/iterators.md
> > > @@ -758,6 +758,7 @@ (define_c_enum "unspec"
> > >      UNSPEC_PMULL        ; Used in aarch64-simd.md.
> > >      UNSPEC_PMULL2       ; Used in aarch64-simd.md.
> > >      UNSPEC_REV_REGLIST  ; Used in aarch64-simd.md.
> > > +    UNSPEC_VEC_SHL      ; Used in aarch64-simd.md.
> > >      UNSPEC_VEC_SHR      ; Used in aarch64-simd.md.
> > >      UNSPEC_SQRDMLAH     ; Used in aarch64-simd.md.
> > >      UNSPEC_SQRDMLSH     ; Used in aarch64-simd.md.
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
> > b/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
> > > new file mode 100644
> > > index 00000000000..3c8f0591a2f
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
> > > @@ -0,0 +1,15 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2"  } */
> > > +/* PR target/113872 */
> > > +/* For 64bit vectors, PERM with a constant 0 should produce a shift
> > > +instead of
> > the ext instruction. */
> > > +
> > > +#define vect64 __attribute__((vector_size(8)))
> > > +
> > > +void f(vect64  unsigned short *a)
> > > +{
> > > +  *a = __builtin_shufflevector((vect64 unsigned short){0},*a,
> > > +3,4,5,6); }
> > > +
> > > +/* { dg-final { scan-assembler-times "ushr\t" 1 { target
> > > +aarch64_big_endian } } }
> > */
> > > +/* { dg-final { scan-assembler-times "shl\t" 1 { target
> > > +aarch64_little_endian } } }
> > */
> > > +/* { dg-final { scan-assembler-not "ext\t"  } } */
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/perm_zero-2.c
> > b/gcc/testsuite/gcc.target/aarch64/perm_zero-2.c
> > > new file mode 100644
> > > index 00000000000..970e428f832
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/perm_zero-2.c
> > > @@ -0,0 +1,15 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2"  } */
> > > +/* PR target/113872 */
> > > +/* For 64bit vectors, PERM with a constant 0 should produce a shift
> > > +instead of
> > the ext instruction. */
> > > +
> > > +#define vect64 __attribute__((vector_size(8)))
> > > +
> > > +void f(vect64  unsigned short *a)
> > > +{
> > > +  *a = __builtin_shufflevector(*a, (vect64 unsigned
> > > +short){0},3,4,5,6); }
> > > +
> > > +/* { dg-final { scan-assembler-times "shl\t" 1 { target
> > > +aarch64_big_endian } } } */
> > > +/* { dg-final { scan-assembler-times "ushr\t" 1 { target
> > > +aarch64_little_endian } }
> > } */
> > > +/* { dg-final { scan-assembler-not "ext\t"  } } */
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f8bb973a278..0d2f1ea3902 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1592,9 +1592,23 @@  (define_insn "vec_shr_<mode><vczle><vczbe>"
   "TARGET_SIMD"
   {
     if (BYTES_BIG_ENDIAN)
-      return "shl %d0, %d1, %2";
+      return "shl\t%d0, %d1, %2";
     else
-      return "ushr %d0, %d1, %2";
+      return "ushr\t%d0, %d1, %2";
+  }
+  [(set_attr "type" "neon_shift_imm")]
+)
+(define_insn "vec_shl_<mode><vczle><vczbe>"
+  [(set (match_operand:VD 0 "register_operand" "=w")
+        (unspec:VD [(match_operand:VD 1 "register_operand" "w")
+		    (match_operand:SI 2 "immediate_operand" "i")]
+		   UNSPEC_VEC_SHL))]
+  "TARGET_SIMD"
+  {
+    if (BYTES_BIG_ENDIAN)
+      return "ushr\t%d0, %d1, %2";
+    else
+      return "shl\t%d0, %d1, %2";
   }
   [(set_attr "type" "neon_shift_imm")]
 )
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 99cde46f1ba..3aebe9cf18a 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -758,6 +758,7 @@  (define_c_enum "unspec"
     UNSPEC_PMULL        ; Used in aarch64-simd.md.
     UNSPEC_PMULL2       ; Used in aarch64-simd.md.
     UNSPEC_REV_REGLIST  ; Used in aarch64-simd.md.
+    UNSPEC_VEC_SHL      ; Used in aarch64-simd.md.
     UNSPEC_VEC_SHR      ; Used in aarch64-simd.md.
     UNSPEC_SQRDMLAH     ; Used in aarch64-simd.md.
     UNSPEC_SQRDMLSH     ; Used in aarch64-simd.md.
diff --git a/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c b/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
new file mode 100644
index 00000000000..3c8f0591a2f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2"  } */
+/* PR target/113872 */
+/* For 64bit vectors, PERM with a constant 0 should produce a shift instead of the ext instruction. */
+
+#define vect64 __attribute__((vector_size(8)))
+
+void f(vect64  unsigned short *a)
+{
+  *a = __builtin_shufflevector((vect64 unsigned short){0},*a, 3,4,5,6);
+}
+
+/* { dg-final { scan-assembler-times "ushr\t" 1 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times "shl\t" 1 { target aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-not "ext\t"  } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/perm_zero-2.c b/gcc/testsuite/gcc.target/aarch64/perm_zero-2.c
new file mode 100644
index 00000000000..970e428f832
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/perm_zero-2.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2"  } */
+/* PR target/113872 */
+/* For 64bit vectors, PERM with a constant 0 should produce a shift instead of the ext instruction. */
+
+#define vect64 __attribute__((vector_size(8)))
+
+void f(vect64  unsigned short *a)
+{
+  *a = __builtin_shufflevector(*a, (vect64 unsigned short){0},3,4,5,6);
+}
+
+/* { dg-final { scan-assembler-times "shl\t" 1 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times "ushr\t" 1 { target aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-not "ext\t"  } } */