[AArch64] Improve Neon store of zero

Message ID fc20ca5e-edf7-f092-f3e4-d4a61082485a@foss.arm.com
State New
Headers show

Commit Message

Jackson Woodruff Aug. 10, 2017, 1:12 p.m.
Hi all,

This patch changes patterns in aarch64-simd.md to replace

     movi    v0.4s, 0
     str    q0, [x0, 16]

With:

     stp xzr, xzr, [x0, 16]

When we are storing zeros to vectors like this:

     void f(uint32x4_t *p) {
       uint32x4_t x = { 0, 0, 0, 0};
       p[1] = x;
     }

Bootstrapped and regtested on aarch64 with no regressions.
OK for trunk?

Jackson

gcc/

2017-08-09  Jackson Woodruff  <jackson.woodruff@arm.com>

	* aarch64-simd.md (mov<mode>): No longer force zero
	immediate into register.
	(*aarch64_simd_mov<mode>): Add new case for stp
	using zero immediate.


gcc/testsuite

2017-08-09  Jackson Woodruff  <jackson.woodruff@arm.com>

	* gcc.target/aarch64/simd/neon_str_zero.c: New.

Comments

Richard Earnshaw (lists) Aug. 11, 2017, 1:56 p.m. | #1
On 10/08/17 14:12, Jackson Woodruff wrote:
> Hi all,
> 
> This patch changes patterns in aarch64-simd.md to replace
> 
>     movi    v0.4s, 0
>     str    q0, [x0, 16]
> 
> With:
> 
>     stp xzr, xzr, [x0, 16]
> 
> When we are storing zeros to vectors like this:
> 
>     void f(uint32x4_t *p) {
>       uint32x4_t x = { 0, 0, 0, 0};
>       p[1] = x;
>     }
> 
> Bootstrapped and regtested on aarch64 with no regressions.
> OK for trunk?
> 
> Jackson
> 
> gcc/
> 
> 2017-08-09  Jackson Woodruff  <jackson.woodruff@arm.com>
> 
>     * aarch64-simd.md (mov<mode>): No longer force zero
>     immediate into register.
>     (*aarch64_simd_mov<mode>): Add new case for stp
>     using zero immediate.
> 
> 
> gcc/testsuite
> 
> 2017-08-09  Jackson Woodruff  <jackson.woodruff@arm.com>
> 
>     * gcc.target/aarch64/simd/neon_str_zero.c: New.
> 
> 
> patchfile
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 74de9b8c89dd5e4e3d87504594c969de0e0128ce..0149a742d34ae4fd5b3fd705b03c845f94aa1d59 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -23,7 +23,10 @@
>  	(match_operand:VALL_F16 1 "general_operand" ""))]
>    "TARGET_SIMD"
>    "
> -    if (GET_CODE (operands[0]) == MEM)
> +    if (GET_CODE (operands[0]) == MEM
> +	    && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
> +		 && aarch64_legitimate_address_p (<MODE>mode, operands[0],
> +						  PARALLEL, 1)))
>        operands[1] = force_reg (<MODE>mode, operands[1]);
>    "
>  )
> @@ -94,63 +97,70 @@
>  
>  (define_insn "*aarch64_simd_mov<mode>"
>    [(set (match_operand:VD 0 "nonimmediate_operand"
> -		"=w, m,  w, ?r, ?w, ?r, w")
> +		"=w, m,  m,  w, ?r, ?w, ?r, w")
>  	(match_operand:VD 1 "general_operand"
> -		"m,  w,  w,  w,  r,  r, Dn"))]
> +		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
>    "TARGET_SIMD
> -   && (register_operand (operands[0], <MODE>mode)
> -       || register_operand (operands[1], <MODE>mode))"
> +   && ((register_operand (operands[0], <MODE>mode)
> +       || register_operand (operands[1], <MODE>mode))
> +      || (memory_operand (operands[0], <MODE>mode)
> +	  && immediate_operand (operands[1], <MODE>mode)))"

Allowing any immediate here seems too lax - it allows any immediate
value which then could cause reload operations to be inserted (that in
turn might cause register pressure calculations to be incorrect).
Wouldn't it be better to use something like aarch64_simd_reg_or_zero?
Similarly below.

R.

>  {
>     switch (which_alternative)
>       {
>       case 0: return "ldr\\t%d0, %1";
> -     case 1: return "str\\t%d1, %0";
> -     case 2: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
> -     case 3: return "umov\t%0, %1.d[0]";
> -     case 4: return "fmov\t%d0, %1";
> -     case 5: return "mov\t%0, %1";
> -     case 6:
> +     case 1: return "str\\txzr, %0";
> +     case 2: return "str\\t%d1, %0";
> +     case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
> +     case 4: return "umov\t%0, %1.d[0]";
> +     case 5: return "fmov\t%d0, %1";
> +     case 6: return "mov\t%0, %1";
> +     case 7:
>  	return aarch64_output_simd_mov_immediate (operands[1],
>  						  <MODE>mode, 64);
>       default: gcc_unreachable ();
>       }
>  }
> -  [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
> +  [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\
>  		     neon_logic<q>, neon_to_gp<q>, f_mcr,\
>  		     mov_reg, neon_move<q>")]
>  )
>  
>  (define_insn "*aarch64_simd_mov<mode>"
>    [(set (match_operand:VQ 0 "nonimmediate_operand"
> -		"=w, m,  w, ?r, ?w, ?r, w")
> +		"=w, Ump,  m,  w, ?r, ?w, ?r, w")
>  	(match_operand:VQ 1 "general_operand"
> -		"m,  w,  w,  w,  r,  r, Dn"))]
> +		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
>    "TARGET_SIMD
> -   && (register_operand (operands[0], <MODE>mode)
> -       || register_operand (operands[1], <MODE>mode))"
> +   && ((register_operand (operands[0], <MODE>mode)
> +	|| register_operand (operands[1], <MODE>mode))
> +       || (memory_operand (operands[0], <MODE>mode)
> +	   && immediate_operand (operands[1], <MODE>mode)))"
>  {
>    switch (which_alternative)
>      {
>      case 0:
>  	return "ldr\\t%q0, %1";
>      case 1:
> -	return "str\\t%q1, %0";
> +	return "stp\\txzr, xzr, %0";
>      case 2:
> -	return "mov\t%0.<Vbtype>, %1.<Vbtype>";
> +	return "str\\t%q1, %0";
>      case 3:
> +	return "mov\t%0.<Vbtype>, %1.<Vbtype>";
>      case 4:
>      case 5:
> -	return "#";
>      case 6:
> +	return "#";
> +    case 7:
>  	return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128);
>      default:
>  	gcc_unreachable ();
>      }
>  }
>    [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
> -                     neon_logic<q>, multiple, multiple, multiple,\
> -                     neon_move<q>")
> -   (set_attr "length" "4,4,4,8,8,8,4")]
> +		     neon_stp, neon_logic<q>, multiple, multiple,\
> +		     multiple, neon_move<q>")
> +   (set_attr "length" "4,4,4,4,8,8,8,4")]
>  )
>  
>  ;; When storing lane zero we can use the normal STR and its more permissive
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..07198de109432b530745cc540790303ae0245efb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1" } */
> +
> +#include <arm_neon.h>
> +
> +void
> +f (uint32x4_t *p)
> +{
> +  uint32x4_t x = { 0, 0, 0, 0};
> +  p[1] = x;
> +
> +  /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
> +}
> +
> +void
> +g (float32x2_t *p)
> +{
> +  float32x2_t x = {0.0, 0.0};
> +  p[0] = x;
> +
> +  /* { dg-final { scan-assembler "str\txzr, " } } */
> +}
>

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 74de9b8c89dd5e4e3d87504594c969de0e0128ce..0149a742d34ae4fd5b3fd705b03c845f94aa1d59 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -23,7 +23,10 @@ 
 	(match_operand:VALL_F16 1 "general_operand" ""))]
   "TARGET_SIMD"
   "
-    if (GET_CODE (operands[0]) == MEM)
+    if (GET_CODE (operands[0]) == MEM
+	    && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
+		 && aarch64_legitimate_address_p (<MODE>mode, operands[0],
+						  PARALLEL, 1)))
       operands[1] = force_reg (<MODE>mode, operands[1]);
   "
 )
@@ -94,63 +97,70 @@ 
 
 (define_insn "*aarch64_simd_mov<mode>"
   [(set (match_operand:VD 0 "nonimmediate_operand"
-		"=w, m,  w, ?r, ?w, ?r, w")
+		"=w, m,  m,  w, ?r, ?w, ?r, w")
 	(match_operand:VD 1 "general_operand"
-		"m,  w,  w,  w,  r,  r, Dn"))]
+		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
-   && (register_operand (operands[0], <MODE>mode)
-       || register_operand (operands[1], <MODE>mode))"
+   && ((register_operand (operands[0], <MODE>mode)
+       || register_operand (operands[1], <MODE>mode))
+      || (memory_operand (operands[0], <MODE>mode)
+	  && immediate_operand (operands[1], <MODE>mode)))"
 {
    switch (which_alternative)
      {
      case 0: return "ldr\\t%d0, %1";
-     case 1: return "str\\t%d1, %0";
-     case 2: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
-     case 3: return "umov\t%0, %1.d[0]";
-     case 4: return "fmov\t%d0, %1";
-     case 5: return "mov\t%0, %1";
-     case 6:
+     case 1: return "str\\txzr, %0";
+     case 2: return "str\\t%d1, %0";
+     case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+     case 4: return "umov\t%0, %1.d[0]";
+     case 5: return "fmov\t%d0, %1";
+     case 6: return "mov\t%0, %1";
+     case 7:
 	return aarch64_output_simd_mov_immediate (operands[1],
 						  <MODE>mode, 64);
      default: gcc_unreachable ();
      }
 }
-  [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
+  [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\
 		     neon_logic<q>, neon_to_gp<q>, f_mcr,\
 		     mov_reg, neon_move<q>")]
 )
 
 (define_insn "*aarch64_simd_mov<mode>"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-		"=w, m,  w, ?r, ?w, ?r, w")
+		"=w, Ump,  m,  w, ?r, ?w, ?r, w")
 	(match_operand:VQ 1 "general_operand"
-		"m,  w,  w,  w,  r,  r, Dn"))]
+		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
-   && (register_operand (operands[0], <MODE>mode)
-       || register_operand (operands[1], <MODE>mode))"
+   && ((register_operand (operands[0], <MODE>mode)
+	|| register_operand (operands[1], <MODE>mode))
+       || (memory_operand (operands[0], <MODE>mode)
+	   && immediate_operand (operands[1], <MODE>mode)))"
 {
   switch (which_alternative)
     {
     case 0:
 	return "ldr\\t%q0, %1";
     case 1:
-	return "str\\t%q1, %0";
+	return "stp\\txzr, xzr, %0";
     case 2:
-	return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+	return "str\\t%q1, %0";
     case 3:
+	return "mov\t%0.<Vbtype>, %1.<Vbtype>";
     case 4:
     case 5:
-	return "#";
     case 6:
+	return "#";
+    case 7:
 	return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128);
     default:
 	gcc_unreachable ();
     }
 }
   [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
-                     neon_logic<q>, multiple, multiple, multiple,\
-                     neon_move<q>")
-   (set_attr "length" "4,4,4,8,8,8,4")]
+		     neon_stp, neon_logic<q>, multiple, multiple,\
+		     multiple, neon_move<q>")
+   (set_attr "length" "4,4,4,4,8,8,8,4")]
 )
 
 ;; When storing lane zero we can use the normal STR and its more permissive
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
new file mode 100644
index 0000000000000000000000000000000000000000..07198de109432b530745cc540790303ae0245efb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+
+#include <arm_neon.h>
+
+void
+f (uint32x4_t *p)
+{
+  uint32x4_t x = { 0, 0, 0, 0};
+  p[1] = x;
+
+  /* { dg-final { scan-assembler "stp\txzr, xzr," } } */
+}
+
+void
+g (float32x2_t *p)
+{
+  float32x2_t x = {0.0, 0.0};
+  p[0] = x;
+
+  /* { dg-final { scan-assembler "str\txzr, " } } */
+}