diff mbox series

AArch64: Correct dot-product auto-vect optab RTL

Message ID patch-14499-tamar@arm.com
State New
Headers show
Series AArch64: Correct dot-product auto-vect optab RTL | expand

Commit Message

Tamar Christina May 25, 2021, 2:59 p.m. UTC
Hi All,

The current RTL for the vectorizer patterns for dot-product are incorrect.
Operand3 isn't an output parameter so we can't write to it.

This fixes this issue and reduces the number of RTL.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master? And backport to GCC 11, 10, 9?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def (udot, sdot): Rename to...
	(sdot_prod, udot_prod): ...These.
	* config/aarch64/aarch64-simd.md (<sur>dot_prod<vsi2qi>): Remove.
	(</vsi2qi>(aarch64_<sur>dot<vsi2qi>): Rename to...
	(<sur>dot_prod<vsi2qi>): ...This.
	* config/aarch64/arm_neon.h (vdot_u32, vdotq_u32, vdot_s32, vdotq_s32):
	Update builtins.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index c869ed9a6ab7d63f0e3d5fe393a93c1cc9142e78..fa3bb7b96710122957933b5c0b0b276256892a4c 100644


--

Comments

Richard Sandiford May 26, 2021, 8:52 a.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> The current RTL for the vectorizer patterns for dot-product are incorrect.
> Operand3 isn't an output parameter so we can't write to it.
>
> This fixes this issue and reduces the number of RTL.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master? And backport to GCC 11, 10, 9?

Yeah, OK for both master and backports, thanks.

Richard

> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-simd-builtins.def (udot, sdot): Rename to...
> 	(sdot_prod, udot_prod): ...These.
> 	* config/aarch64/aarch64-simd.md (<sur>dot_prod<vsi2qi>): Remove.
> 	(</vsi2qi>(aarch64_<sur>dot<vsi2qi>): Rename to...
> 	(<sur>dot_prod<vsi2qi>): ...This.
> 	* config/aarch64/arm_neon.h (vdot_u32, vdotq_u32, vdot_s32, vdotq_s32):
> 	Update builtins.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> index c869ed9a6ab7d63f0e3d5fe393a93c1cc9142e78..fa3bb7b96710122957933b5c0b0b276256892a4c 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -362,8 +362,8 @@
>    BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE)
>  
>    /* Implemented by <sur><dotprod>_prod<dot_mode>.  */
> -  BUILTIN_VB (TERNOP, sdot, 0, NONE)
> -  BUILTIN_VB (TERNOPU, udot, 0, NONE)
> +  BUILTIN_VB (TERNOP, sdot_prod, 10, NONE)
> +  BUILTIN_VB (TERNOPU, udot_prod, 10, NONE)
>    BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE)
>    /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>.  */
>    BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE)
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 253ddbe25d3a86af4b40b056132e6a86a0392ea6..638e2d103bcba0af2292b16efd02046d1195095b 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -587,8 +587,28 @@ (define_expand "cmul<conj_op><mode>3"
>    DONE;
>  })
>  
> -;; These instructions map to the __builtins for the Dot Product operations.
> -(define_insn "aarch64_<sur>dot<vsi2qi>"
> +;; These expands map to the Dot Product optab the vectorizer checks for
> +;; and to the intrinsics patttern.
> +;; The auto-vectorizer expects a dot product builtin that also does an
> +;; accumulation into the provided register.
> +;; Given the following pattern
> +;;
> +;; for (i=0; i<len; i++) {
> +;;     c = a[i] * b[i];
> +;;     r += c;
> +;; }
> +;; return result;
> +;;
> +;; This can be auto-vectorized to
> +;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
> +;;
> +;; given enough iterations.  However the vectorizer can keep unrolling the loop
> +;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
> +;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
> +;; ...
> +;;
> +;; and so the vectorizer provides r, in which the result has to be accumulated.
> +(define_insn "<sur>dot_prod<vsi2qi>"
>    [(set (match_operand:VS 0 "register_operand" "=w")
>  	(plus:VS (match_operand:VS 1 "register_operand" "0")
>  		(unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
> @@ -613,41 +633,6 @@ (define_insn "usdot_prod<vsi2qi>"
>    [(set_attr "type" "neon_dot<q>")]
>  )
>  
> -;; These expands map to the Dot Product optab the vectorizer checks for.
> -;; The auto-vectorizer expects a dot product builtin that also does an
> -;; accumulation into the provided register.
> -;; Given the following pattern
> -;;
> -;; for (i=0; i<len; i++) {
> -;;     c = a[i] * b[i];
> -;;     r += c;
> -;; }
> -;; return result;
> -;;
> -;; This can be auto-vectorized to
> -;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
> -;;
> -;; given enough iterations.  However the vectorizer can keep unrolling the loop
> -;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
> -;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
> -;; ...
> -;;
> -;; and so the vectorizer provides r, in which the result has to be accumulated.
> -(define_expand "<sur>dot_prod<vsi2qi>"
> -  [(set (match_operand:VS 0 "register_operand")
> -	(plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
> -			    (match_operand:<VSI2QI> 2 "register_operand")]
> -		 DOTPROD)
> -		(match_operand:VS 3 "register_operand")))]
> -  "TARGET_DOTPROD"
> -{
> -  emit_insn (
> -    gen_aarch64_<sur>dot<vsi2qi> (operands[3], operands[3], operands[1],
> -				    operands[2]));
> -  emit_insn (gen_rtx_SET (operands[0], operands[3]));
> -  DONE;
> -})
> -
>  ;; These instructions map to the __builtins for the Dot Product
>  ;; indexed operations.
>  (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
> @@ -944,8 +929,7 @@ (define_expand "<sur>sadv16qi"
>  	rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode));
>  	rtx abd = gen_reg_rtx (V16QImode);
>  	emit_insn (gen_aarch64_<sur>abdv16qi (abd, operands[1], operands[2]));
> -	emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3],
> -					  abd, ones));
> +	emit_insn (gen_udot_prodv16qi (operands[0], operands[3], abd, ones));
>  	DONE;
>        }
>      rtx reduc = gen_reg_rtx (V8HImode);
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 373f06a24ea6ce686d7e0cdf53dd364041c61092..90770411f177f05b4f1bdbd83890734612c31dc3 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -32112,28 +32112,28 @@ __extension__ extern __inline uint32x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b)
>  {
> -  return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b);
> +  return __builtin_aarch64_udot_prodv8qi_uuuu (__r, __a, __b);
>  }
>  
>  __extension__ extern __inline uint32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
>  {
> -  return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b);
> +  return __builtin_aarch64_udot_prodv16qi_uuuu (__r, __a, __b);
>  }
>  
>  __extension__ extern __inline int32x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b)
>  {
> -  return __builtin_aarch64_sdotv8qi (__r, __a, __b);
> +  return __builtin_aarch64_sdot_prodv8qi (__r, __a, __b);
>  }
>  
>  __extension__ extern __inline int32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
>  {
> -  return __builtin_aarch64_sdotv16qi (__r, __a, __b);
> +  return __builtin_aarch64_sdot_prodv16qi (__r, __a, __b);
>  }
>  
>  __extension__ extern __inline uint32x2_t
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index c869ed9a6ab7d63f0e3d5fe393a93c1cc9142e78..fa3bb7b96710122957933b5c0b0b276256892a4c 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -362,8 +362,8 @@ 
   BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE)
 
   /* Implemented by <sur><dotprod>_prod<dot_mode>.  */
-  BUILTIN_VB (TERNOP, sdot, 0, NONE)
-  BUILTIN_VB (TERNOPU, udot, 0, NONE)
+  BUILTIN_VB (TERNOP, sdot_prod, 10, NONE)
+  BUILTIN_VB (TERNOPU, udot_prod, 10, NONE)
   BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE)
   /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>.  */
   BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 253ddbe25d3a86af4b40b056132e6a86a0392ea6..638e2d103bcba0af2292b16efd02046d1195095b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -587,8 +587,28 @@  (define_expand "cmul<conj_op><mode>3"
   DONE;
 })
 
-;; These instructions map to the __builtins for the Dot Product operations.
-(define_insn "aarch64_<sur>dot<vsi2qi>"
+;; These expands map to the Dot Product optab the vectorizer checks for
+;; and to the intrinsics patttern.
+;; The auto-vectorizer expects a dot product builtin that also does an
+;; accumulation into the provided register.
+;; Given the following pattern
+;;
+;; for (i=0; i<len; i++) {
+;;     c = a[i] * b[i];
+;;     r += c;
+;; }
+;; return result;
+;;
+;; This can be auto-vectorized to
+;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
+;;
+;; given enough iterations.  However the vectorizer can keep unrolling the loop
+;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
+;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
+;; ...
+;;
+;; and so the vectorizer provides r, in which the result has to be accumulated.
+(define_insn "<sur>dot_prod<vsi2qi>"
   [(set (match_operand:VS 0 "register_operand" "=w")
 	(plus:VS (match_operand:VS 1 "register_operand" "0")
 		(unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
@@ -613,41 +633,6 @@  (define_insn "usdot_prod<vsi2qi>"
   [(set_attr "type" "neon_dot<q>")]
 )
 
-;; These expands map to the Dot Product optab the vectorizer checks for.
-;; The auto-vectorizer expects a dot product builtin that also does an
-;; accumulation into the provided register.
-;; Given the following pattern
-;;
-;; for (i=0; i<len; i++) {
-;;     c = a[i] * b[i];
-;;     r += c;
-;; }
-;; return result;
-;;
-;; This can be auto-vectorized to
-;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
-;;
-;; given enough iterations.  However the vectorizer can keep unrolling the loop
-;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
-;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
-;; ...
-;;
-;; and so the vectorizer provides r, in which the result has to be accumulated.
-(define_expand "<sur>dot_prod<vsi2qi>"
-  [(set (match_operand:VS 0 "register_operand")
-	(plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
-			    (match_operand:<VSI2QI> 2 "register_operand")]
-		 DOTPROD)
-		(match_operand:VS 3 "register_operand")))]
-  "TARGET_DOTPROD"
-{
-  emit_insn (
-    gen_aarch64_<sur>dot<vsi2qi> (operands[3], operands[3], operands[1],
-				    operands[2]));
-  emit_insn (gen_rtx_SET (operands[0], operands[3]));
-  DONE;
-})
-
 ;; These instructions map to the __builtins for the Dot Product
 ;; indexed operations.
 (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
@@ -944,8 +929,7 @@  (define_expand "<sur>sadv16qi"
 	rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode));
 	rtx abd = gen_reg_rtx (V16QImode);
 	emit_insn (gen_aarch64_<sur>abdv16qi (abd, operands[1], operands[2]));
-	emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3],
-					  abd, ones));
+	emit_insn (gen_udot_prodv16qi (operands[0], operands[3], abd, ones));
 	DONE;
       }
     rtx reduc = gen_reg_rtx (V8HImode);
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 373f06a24ea6ce686d7e0cdf53dd364041c61092..90770411f177f05b4f1bdbd83890734612c31dc3 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -32112,28 +32112,28 @@  __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b)
 {
-  return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b);
+  return __builtin_aarch64_udot_prodv8qi_uuuu (__r, __a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b);
+  return __builtin_aarch64_udot_prodv16qi_uuuu (__r, __a, __b);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b)
 {
-  return __builtin_aarch64_sdotv8qi (__r, __a, __b);
+  return __builtin_aarch64_sdot_prodv8qi (__r, __a, __b);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_aarch64_sdotv16qi (__r, __a, __b);
+  return __builtin_aarch64_sdot_prodv16qi (__r, __a, __b);
 }
 
 __extension__ extern __inline uint32x2_t