diff mbox series

AArch64: Use UZP1 instead of INS

Message ID PAWPR08MB8982BC625437D0489D1D83A083EC2@PAWPR08MB8982.eurprd08.prod.outlook.com
State New
Headers show
Series AArch64: Use UZP1 instead of INS | expand

Commit Message

Wilco Dijkstra May 15, 2024, 10:09 a.m. UTC
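Use UZP1 instead of INS when combining low and high halves of vectors.
UZP1 is a three-operand instruction, so the destination no longer has to
be tied to the first source register; this improves register allocation.
UZP1 is also faster than INS on some microarchitectures.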

Passes regression testing and bootstrap.  OK for commit?

gcc:
        * config/aarch64/aarch64-simd.md (aarch64_combine_internal<mode>):
        Use UZP1 instead of INS.
        (aarch64_combine_internal_be<mode>): Likewise.

gcc/testsuite:
        * gcc.target/aarch64/ldp_stp_16.c: Update to check for UZP1.
        * gcc.target/aarch64/pr109072_1.c: Likewise.
        * gcc.target/aarch64/vec-init-14.c: Likewise.
        * gcc.target/aarch64/vec-init-9.c: Likewise.

---

Comments

Richard Sandiford May 15, 2024, 10:22 a.m. UTC | #1
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Use UZP1 instead of INS when combining low and high halves of vectors.
> UZP1 has 3 operands which improves register allocation, and is faster on
> some microarchitectures.
>
> Passes regress & bootstrap, OK for commit?

OK, thanks.  We can add core-specific tuning later if a supported core
strongly prefers INS for some reason, but I agree that the three-address
nature of UZP1 makes it the better default choice.

Richard

>
> gcc:
>         * config/aarch64/aarch64-simd.md (aarch64_combine_internal<mode>):
>         Use UZP1 instead of INS.
>         (aarch64_combine_internal_be<mode>): Likewise.
>
> gcc/testsuite:	
>         * gcc.target/aarch64/ldp_stp_16.c: Update to check for UZP1.	
>         * gcc.target/aarch64/pr109072_1.c: Likewise.
>         * gcc.target/aarch64/vec-init-14.c: Likewise.
>         * gcc.target/aarch64/vec-init-9.c: Likewise.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index f8bb973a278c7964f3e3a4f7154a0ab62214b7cf..16b7445d9f72f77a98ab262e21fd24e6cc97eba0 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4388,7 +4388,7 @@
>     && (register_operand (operands[0], <VDBL>mode)
>         || register_operand (operands[2], <MODE>mode))"
>    {@ [ cons: =0 , 1  , 2   ; attrs: type               , arch  ]
> -     [ w        , 0  , w   ; neon_ins<dblq>            , simd  ] ins\t%0.<single_type>[1], %2.<single_type>[0]
> +     [ w        , w  , w   ; neon_permute<dblq>        , simd  ] uzp1\t%0.2<single_type>, %1.2<single_type>, %2.2<single_type>
>       [ w        , 0  , ?r  ; neon_from_gp<dblq>        , simd  ] ins\t%0.<single_type>[1], %<single_wx>2
>       [ w        , 0  , ?r  ; f_mcr                     , *     ] fmov\t%0.d[1], %2
>       [ w        , 0  , Utv ; neon_load1_one_lane<dblq> , simd  ] ld1\t{%0.<single_type>}[1], %2
> @@ -4407,7 +4407,7 @@
>     && (register_operand (operands[0], <VDBL>mode)
>         || register_operand (operands[2], <MODE>mode))"
>    {@ [ cons: =0 , 1  , 2   ; attrs: type               , arch  ]
> -     [ w        , 0  , w   ; neon_ins<dblq>            , simd  ] ins\t%0.<single_type>[1], %2.<single_type>[0]
> +     [ w        , w  , w   ; neon_permute<dblq>        , simd  ] uzp1\t%0.2<single_type>, %1.2<single_type>, %2.2<single_type>
>       [ w        , 0  , ?r  ; neon_from_gp<dblq>        , simd  ] ins\t%0.<single_type>[1], %<single_wx>2
>       [ w        , 0  , ?r  ; f_mcr                     , *     ] fmov\t%0.d[1], %2
>       [ w        , 0  , Utv ; neon_load1_one_lane<dblq> , simd  ] ld1\t{%0.<single_type>}[1], %2
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
> index f1f46e051a86d160a7f7f14872108da87b444ca1..95835aa2eb41c289e7b74f19bb56cf6fa23a3045 100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
> @@ -80,16 +80,16 @@ CONS2_FN (2, float);
>  
>  /*
>  ** cons2_4_float:	{ target aarch64_little_endian }
> -**	ins	v0.s\[1\], v1.s\[0\]
> -**	stp	d0, d0, \[x0\]
> -**	stp	d0, d0, \[x0, #?16\]
> +**	uzp1	v([0-9])\.2s, v0\.2s, v1\.2s
> +**	stp	d\1, d\1, \[x0\]
> +**	stp	d\1, d\1, \[x0, #?16\]
>  **	ret
>  */
>  /*
>  ** cons2_4_float:	{ target aarch64_big_endian }
> -**	ins	v1.s\[1\], v0.s\[0\]
> -**	stp	d1, d1, \[x0\]
> -**	stp	d1, d1, \[x0, #?16\]
> +**	uzp1	v([0-9])\.2s, v1\.2s, v0\.2s
> +**	stp	d\1, d\1, \[x0\]
> +**	stp	d\1, d\1, \[x0, #?16\]
>  **	ret
>  */
>  CONS2_FN (4, float);
> @@ -125,8 +125,8 @@ CONS4_FN (2, float);
>  
>  /*
>  ** cons4_4_float:
> -**	ins	v[0-9]+\.s[^\n]+
> -**	ins	v[0-9]+\.s[^\n]+
> +**	uzp1	v[0-9]+\.2s[^\n]+
> +**	uzp1	v[0-9]+\.2s[^\n]+
>  **	zip1	v([0-9]+).4s, [^\n]+
>  **	stp	q\1, q\1, \[x0\]
>  **	stp	q\1, q\1, \[x0, #?32\]
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
> index 6c1d2b0bdccfb74b80d938a0d94413f0f9dda5ab..0fc195a598f3b82ff188b3151e77e1272254b78c 100644
> --- a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
> @@ -54,7 +54,7 @@ f32x2_1 (float32_t x)
>  
>  /*
>  ** f32x2_2:
> -**	ins	v0\.s\[1\], v1.s\[0\]
> +**	uzp1	v0\.2s, v0\.2s, v1\.2s
>  **	ret
>  */
>  float32x2_t
> @@ -165,7 +165,7 @@ f64x2_1 (float64_t x)
>  
>  /*
>  ** f64x2_2:
> -**	ins	v0\.d\[1\], v1.d\[0\]
> +**	uzp1	v0\.2d, v0\.2d, v1\.2d
>  **	ret
>  */
>  float64x2_t
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-14.c b/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
> index 02875088cd98833882cdf15b14dcb426951e428f..1a2cc9fbf473ad0de2d8ef97d7efdbe40d959866 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
> @@ -67,7 +67,7 @@ int32x2_t s32_6(int32_t a0, int32_t a1) {
>  
>  /*
>  ** f32_1:
> -**	ins	v0\.s\[1\], v1\.s\[0\]
> +**	uzp1	v0\.2s, v0\.2s, v1\.2s
>  **	ret
>  */
>  float32x2_t f32_1(float32_t a0, float32_t a1) {
> @@ -90,7 +90,7 @@ float32x2_t f32_2(float32_t a0, float32_t *ptr) {
>  /*
>  ** f32_3:
>  **	ldr	s0, \[x0\]
> -**	ins	v0\.s\[1\], v1\.s\[0\]
> +**	uzp1	v0\.2s, v0\.2s, v1\.2s
>  **	ret
>  */
>  float32x2_t f32_3(float32_t a0, float32_t a1, float32_t *ptr) {
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-9.c b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
> index 8f68e06a55925b973a87723c7b5924264382e4b0..3cf05cf865e21fad482e5ffc8c769d0f15a57e74 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
> @@ -75,7 +75,7 @@ int64x2_t s64q_6(int64_t a0, int64_t a1) {
>  
>  /*
>  ** f64q_1:
> -**	ins	v0\.d\[1\], v1\.d\[0\]
> +**	uzp1	v0\.2d, v0\.2d, v1\.2d
>  **	ret
>  */
>  float64x2_t f64q_1(float64_t a0, float64_t a1) {
> @@ -98,7 +98,7 @@ float64x2_t f64q_2(float64_t a0, float64_t *ptr) {
>  /*
>  ** f64q_3:
>  **	ldr	d0, \[x0\]
> -**	ins	v0\.d\[1\], v1\.d\[0\]
> +**	uzp1	v0\.2d, v0\.2d, v1\.2d
>  **	ret
>  */
>  float64x2_t f64q_3(float64_t a0, float64_t a1, float64_t *ptr) {
> @@ -140,7 +140,7 @@ float64x2_t f64q_6(float64_t a0, float64_t a1) {
>  
>  /*
>  ** s32q_1:
> -**	ins	v0\.d\[1\], v1\.d\[0\]
> +**	uzp1	v0\.2d, v0\.2d, v1\.2d
>  **	ret
>  */
>  int32x4_t s32q_1(int32x2_t a0, int32x2_t a1) {
> @@ -157,7 +157,7 @@ int32x4_t s32q_2(int32x2_t a0, int32x2_t *ptr) {
>  /*
>  ** s32q_3:
>  **	ldr	d0, \[x0\]
> -**	ins	v0\.d\[1\], v1\.d\[0\]
> +**	uzp1	v0\.2d, v0\.2d, v1\.2d
>  **	ret
>  */
>  int32x4_t s32q_3(int32x2_t a0, int32x2_t a1, int32x2_t *ptr) {
> @@ -204,7 +204,7 @@ int32x4_t s32q_6(int32x2_t a0, int32x2_t a1) {
>  
>  /*
>  ** f32q_1:
> -**	ins	v0\.d\[1\], v1\.d\[0\]
> +**	uzp1	v0\.2d, v0\.2d, v1\.2d
>  **	ret
>  */
>  float32x4_t f32q_1(float32x2_t a0, float32x2_t a1) {
> @@ -221,7 +221,7 @@ float32x4_t f32q_2(float32x2_t a0, float32x2_t *ptr) {
>  /*
>  ** f32q_3:
>  **	ldr	d0, \[x0\]
> -**	ins	v0\.d\[1\], v1\.d\[0\]
> +**	uzp1	v0\.2d, v0\.2d, v1\.2d
>  **	ret
>  */
>  float32x4_t f32q_3(float32x2_t a0, float32x2_t a1, float32x2_t *ptr) {
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f8bb973a278c7964f3e3a4f7154a0ab62214b7cf..16b7445d9f72f77a98ab262e21fd24e6cc97eba0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4388,7 +4388,7 @@ 
    && (register_operand (operands[0], <VDBL>mode)
        || register_operand (operands[2], <MODE>mode))"
   {@ [ cons: =0 , 1  , 2   ; attrs: type               , arch  ]
-     [ w        , 0  , w   ; neon_ins<dblq>            , simd  ] ins\t%0.<single_type>[1], %2.<single_type>[0]
+     [ w        , w  , w   ; neon_permute<dblq>        , simd  ] uzp1\t%0.2<single_type>, %1.2<single_type>, %2.2<single_type>
      [ w        , 0  , ?r  ; neon_from_gp<dblq>        , simd  ] ins\t%0.<single_type>[1], %<single_wx>2
      [ w        , 0  , ?r  ; f_mcr                     , *     ] fmov\t%0.d[1], %2
      [ w        , 0  , Utv ; neon_load1_one_lane<dblq> , simd  ] ld1\t{%0.<single_type>}[1], %2
@@ -4407,7 +4407,7 @@ 
    && (register_operand (operands[0], <VDBL>mode)
        || register_operand (operands[2], <MODE>mode))"
   {@ [ cons: =0 , 1  , 2   ; attrs: type               , arch  ]
-     [ w        , 0  , w   ; neon_ins<dblq>            , simd  ] ins\t%0.<single_type>[1], %2.<single_type>[0]
+     [ w        , w  , w   ; neon_permute<dblq>        , simd  ] uzp1\t%0.2<single_type>, %1.2<single_type>, %2.2<single_type>
      [ w        , 0  , ?r  ; neon_from_gp<dblq>        , simd  ] ins\t%0.<single_type>[1], %<single_wx>2
      [ w        , 0  , ?r  ; f_mcr                     , *     ] fmov\t%0.d[1], %2
      [ w        , 0  , Utv ; neon_load1_one_lane<dblq> , simd  ] ld1\t{%0.<single_type>}[1], %2
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
index f1f46e051a86d160a7f7f14872108da87b444ca1..95835aa2eb41c289e7b74f19bb56cf6fa23a3045 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
@@ -80,16 +80,16 @@  CONS2_FN (2, float);
 
 /*
 ** cons2_4_float:	{ target aarch64_little_endian }
-**	ins	v0.s\[1\], v1.s\[0\]
-**	stp	d0, d0, \[x0\]
-**	stp	d0, d0, \[x0, #?16\]
+**	uzp1	v([0-9])\.2s, v0\.2s, v1\.2s
+**	stp	d\1, d\1, \[x0\]
+**	stp	d\1, d\1, \[x0, #?16\]
 **	ret
 */
 /*
 ** cons2_4_float:	{ target aarch64_big_endian }
-**	ins	v1.s\[1\], v0.s\[0\]
-**	stp	d1, d1, \[x0\]
-**	stp	d1, d1, \[x0, #?16\]
+**	uzp1	v([0-9])\.2s, v1\.2s, v0\.2s
+**	stp	d\1, d\1, \[x0\]
+**	stp	d\1, d\1, \[x0, #?16\]
 **	ret
 */
 CONS2_FN (4, float);
@@ -125,8 +125,8 @@  CONS4_FN (2, float);
 
 /*
 ** cons4_4_float:
-**	ins	v[0-9]+\.s[^\n]+
-**	ins	v[0-9]+\.s[^\n]+
+**	uzp1	v[0-9]+\.2s[^\n]+
+**	uzp1	v[0-9]+\.2s[^\n]+
 **	zip1	v([0-9]+).4s, [^\n]+
 **	stp	q\1, q\1, \[x0\]
 **	stp	q\1, q\1, \[x0, #?32\]
diff --git a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
index 6c1d2b0bdccfb74b80d938a0d94413f0f9dda5ab..0fc195a598f3b82ff188b3151e77e1272254b78c 100644
--- a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
@@ -54,7 +54,7 @@  f32x2_1 (float32_t x)
 
 /*
 ** f32x2_2:
-**	ins	v0\.s\[1\], v1.s\[0\]
+**	uzp1	v0\.2s, v0\.2s, v1\.2s
 **	ret
 */
 float32x2_t
@@ -165,7 +165,7 @@  f64x2_1 (float64_t x)
 
 /*
 ** f64x2_2:
-**	ins	v0\.d\[1\], v1.d\[0\]
+**	uzp1	v0\.2d, v0\.2d, v1\.2d
 **	ret
 */
 float64x2_t
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-14.c b/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
index 02875088cd98833882cdf15b14dcb426951e428f..1a2cc9fbf473ad0de2d8ef97d7efdbe40d959866 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
@@ -67,7 +67,7 @@  int32x2_t s32_6(int32_t a0, int32_t a1) {
 
 /*
 ** f32_1:
-**	ins	v0\.s\[1\], v1\.s\[0\]
+**	uzp1	v0\.2s, v0\.2s, v1\.2s
 **	ret
 */
 float32x2_t f32_1(float32_t a0, float32_t a1) {
@@ -90,7 +90,7 @@  float32x2_t f32_2(float32_t a0, float32_t *ptr) {
 /*
 ** f32_3:
 **	ldr	s0, \[x0\]
-**	ins	v0\.s\[1\], v1\.s\[0\]
+**	uzp1	v0\.2s, v0\.2s, v1\.2s
 **	ret
 */
 float32x2_t f32_3(float32_t a0, float32_t a1, float32_t *ptr) {
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-9.c b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
index 8f68e06a55925b973a87723c7b5924264382e4b0..3cf05cf865e21fad482e5ffc8c769d0f15a57e74 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
@@ -75,7 +75,7 @@  int64x2_t s64q_6(int64_t a0, int64_t a1) {
 
 /*
 ** f64q_1:
-**	ins	v0\.d\[1\], v1\.d\[0\]
+**	uzp1	v0\.2d, v0\.2d, v1\.2d
 **	ret
 */
 float64x2_t f64q_1(float64_t a0, float64_t a1) {
@@ -98,7 +98,7 @@  float64x2_t f64q_2(float64_t a0, float64_t *ptr) {
 /*
 ** f64q_3:
 **	ldr	d0, \[x0\]
-**	ins	v0\.d\[1\], v1\.d\[0\]
+**	uzp1	v0\.2d, v0\.2d, v1\.2d
 **	ret
 */
 float64x2_t f64q_3(float64_t a0, float64_t a1, float64_t *ptr) {
@@ -140,7 +140,7 @@  float64x2_t f64q_6(float64_t a0, float64_t a1) {
 
 /*
 ** s32q_1:
-**	ins	v0\.d\[1\], v1\.d\[0\]
+**	uzp1	v0\.2d, v0\.2d, v1\.2d
 **	ret
 */
 int32x4_t s32q_1(int32x2_t a0, int32x2_t a1) {
@@ -157,7 +157,7 @@  int32x4_t s32q_2(int32x2_t a0, int32x2_t *ptr) {
 /*
 ** s32q_3:
 **	ldr	d0, \[x0\]
-**	ins	v0\.d\[1\], v1\.d\[0\]
+**	uzp1	v0\.2d, v0\.2d, v1\.2d
 **	ret
 */
 int32x4_t s32q_3(int32x2_t a0, int32x2_t a1, int32x2_t *ptr) {
@@ -204,7 +204,7 @@  int32x4_t s32q_6(int32x2_t a0, int32x2_t a1) {
 
 /*
 ** f32q_1:
-**	ins	v0\.d\[1\], v1\.d\[0\]
+**	uzp1	v0\.2d, v0\.2d, v1\.2d
 **	ret
 */
 float32x4_t f32q_1(float32x2_t a0, float32x2_t a1) {
@@ -221,7 +221,7 @@  float32x4_t f32q_2(float32x2_t a0, float32x2_t *ptr) {
 /*
 ** f32q_3:
 **	ldr	d0, \[x0\]
-**	ins	v0\.d\[1\], v1\.d\[0\]
+**	uzp1	v0\.2d, v0\.2d, v1\.2d
 **	ret
 */
 float32x4_t f32q_3(float32x2_t a0, float32x2_t a1, float32x2_t *ptr) {