diff mbox series

[2/4] AArch64: correct usdot vectorizer and intrinsics optabs

Message ID 20210715163953.GA2861@arm.com
State New
Headers show
Series [1/4,committed] testsuite: Fix testisms in scalar tests PR101457 | expand

Commit Message

Tamar Christina July 15, 2021, 4:39 p.m. UTC
Hi All,

There's a slight mismatch between the vectorizer optabs and the intrinsics
patterns for NEON.  The vectorizer expects operands[3] and operands[0] to be
the same but the aarch64 intrinsics expanders expect operands[0] and
operands[1] to be the same.

This means we need different patterns here.  This adds a separate usdot
vectorizer pattern which just shuffles around the RTL params.

There's also an inconsistency between the usdot and (u|s)dot intrinsics RTL
patterns which is not corrected here.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): Rename to...
	(aarch64_usdot<vsi2qi>): ..This
	(usdot_prod<vsi2qi>): New.
	* config/aarch64/arm_neon.h (vusdot_s32, vusdotq_s32): Use
	aarch64_usdot<vsi2qi>.
	* config/aarch64/aarch64-simd-builtins.def: Likewise.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 063f503ebd96657f017dfaa067cb231991376bda..ac5d4fc7ff1e61d404e66193b629986382ee4ffd 100644


--

Comments

Richard Sandiford July 15, 2021, 7:34 p.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> There's a slight mismatch between the vectorizer optabs and the intrinsics
> patterns for NEON.  The vectorizer expects operands[3] and operands[0] to be
> the same but the aarch64 intrinsics expanders expect operands[0] and
> operands[1] to be the same.
>
> This means we need different patterns here.  This adds a separate usdot
> vectorizer pattern which just shuffles around the RTL params.
>
> There's also an inconsistency between the usdot and (u|s)dot intrinsics RTL
> patterns which is not corrected here.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?

Couldn't we just change:

> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 00d76ea937ace5763746478cbdfadf6479e0b15a..17e059efb80fa86a8a32127ace4fc7f43e2040a8 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -34039,14 +34039,14 @@ __extension__ extern __inline int32x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
>  {
> -  return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b);
> +  return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b);

…this to __builtin_aarch64_usdot_prodv8qi_ssus (__a, __b, __r) etc.?
I think that's an OK thing to do when the function is named after
an optab rather than an arm_neon.h intrinsic.

Thanks,
Richard

>  }
>  
>  __extension__ extern __inline int32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
>  {
> -  return __builtin_aarch64_usdot_prodv16qi_ssus (__r, __a, __b);
> +  return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b);
>  }
>  
>  __extension__ extern __inline int32x2_t
Tamar Christina July 20, 2021, 12:34 p.m. UTC | #2
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Thursday, July 15, 2021 8:35 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH 2/4]AArch64: correct usdot vectorizer and intrinsics
> optabs
> 
> Tamar Christina <tamar.christina@arm.com> writes:
> > Hi All,
> >
> > There's a slight mismatch between the vectorizer optabs and the
> > intrinsics patterns for NEON.  The vectorizer expects operands[3] and
> > operands[0] to be the same but the aarch64 intrinsics expanders expect
> > operands[0] and operands[1] to be the same.
> >
> > This means we need different patterns here.  This adds a separate
> > usdot vectorizer pattern which just shuffles around the RTL params.
> >
> > There's also an inconsistency between the usdot and (u|s)dot
> > intrinsics RTL patterns which is not corrected here.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> 
> Couldn't we just change:
> 
> > diff --git a/gcc/config/aarch64/arm_neon.h
> > b/gcc/config/aarch64/arm_neon.h index
> >
> 00d76ea937ace5763746478cbdfadf6479e0b15a..17e059efb80fa86a8a32127ac
> e4f
> > c7f43e2040a8 100644
> > --- a/gcc/config/aarch64/arm_neon.h
> > +++ b/gcc/config/aarch64/arm_neon.h
> > @@ -34039,14 +34039,14 @@ __extension__ extern __inline int32x2_t
> > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> >  vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)  {
> > -  return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b);
> > +  return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b);
> 
> …this to __builtin_aarch64_usdot_prodv8qi_ssus (__a, __b, __r) etc.?

Not easily, as I was mentioning before, Neon intrinsics have the assumption that
operands[0] and operands[1] are the same. And this goes much further than just
the header call.

The actual type is determined by the optabs and the C stubs that are generated.

aarch64_init_simd_builtins which creates the C function stubs starts processing
arguments from the end and on non-void functions assumes that the value at
operands[0] be the return type. So simply moving __r will get it to think that
the result type should be uint8x8_t.

I can bypass this but then have to write a custom expander in expand code to
handle this, but at point, is it really worth it..

Tamar

> I think that's an OK thing to do when the function is named after
> an optab rather than an arm_neon.h intrinsic.
> 
> Thanks,
> Richard
> 
> >  }
> >
> >  __extension__ extern __inline int32x4_t
> >  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> >  vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
> >  {
> > -  return __builtin_aarch64_usdot_prodv16qi_ssus (__r, __a, __b);
> > +  return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b);
> >  }
> >
> >  __extension__ extern __inline int32x2_t
Richard Sandiford July 20, 2021, 4:15 p.m. UTC | #3
Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Thursday, July 15, 2021 8:35 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
>> Subject: Re: [PATCH 2/4]AArch64: correct usdot vectorizer and intrinsics
>> optabs
>> 
>> Tamar Christina <tamar.christina@arm.com> writes:
>> > Hi All,
>> >
>> > There's a slight mismatch between the vectorizer optabs and the
>> > intrinsics patterns for NEON.  The vectorizer expects operands[3] and
>> > operands[0] to be the same but the aarch64 intrinsics expanders expect
>> > operands[0] and operands[1] to be the same.
>> >
>> > This means we need different patterns here.  This adds a separate
>> > usdot vectorizer pattern which just shuffles around the RTL params.
>> >
>> > There's also an inconsistency between the usdot and (u|s)dot
>> > intrinsics RTL patterns which is not corrected here.
>> >
>> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>> >
>> > Ok for master?
>> 
>> Couldn't we just change:
>> 
>> > diff --git a/gcc/config/aarch64/arm_neon.h
>> > b/gcc/config/aarch64/arm_neon.h index
>> >
>> 00d76ea937ace5763746478cbdfadf6479e0b15a..17e059efb80fa86a8a32127ac
>> e4f
>> > c7f43e2040a8 100644
>> > --- a/gcc/config/aarch64/arm_neon.h
>> > +++ b/gcc/config/aarch64/arm_neon.h
>> > @@ -34039,14 +34039,14 @@ __extension__ extern __inline int32x2_t
>> > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> >  vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)  {
>> > -  return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b);
>> > +  return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b);
>> 
>> …this to __builtin_aarch64_usdot_prodv8qi_ssus (__a, __b, __r) etc.?
>
> Not easily, as I was mentioning before, Neon intrinsics have the assumption that
> operands[0] and operands[1] are the same. And this goes much further than just
> the header call.
>
> The actual type is determined by the optabs and the C stubs that are generated.
>
> aarch64_init_simd_builtins which creates the C function stubs starts processing
> arguments from the end and on non-void functions assumes that the value at
> operands[0] be the return type. So simply moving __r will get it to think that
> the result type should be uint8x8_t.

Yeah, the mode of operand 0 (i.e. the output) determines the return type.
But that mode isn't changing, so the return type will be correct for both
input operand orders.  It works for me locally with:

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 88fa5ba5a44..5987d9af7c6 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -610,12 +610,12 @@ (define_expand "cmul<conj_op><mode>3"
 ;; and so the vectorizer provides r, in which the result has to be accumulated.
 (define_insn "<sur>dot_prod<vsi2qi>"
   [(set (match_operand:VS 0 "register_operand" "=w")
-	(plus:VS (match_operand:VS 1 "register_operand" "0")
-		(unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
-			    (match_operand:<VSI2QI> 3 "register_operand" "w")]
-		DOTPROD)))]
+	(plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
+			     (match_operand:<VSI2QI> 2 "register_operand" "w")]
+			    DOTPROD)
+		 (match_operand:VS 3 "register_operand" "0")))]
   "TARGET_DOTPROD"
-  "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
+  "<sur>dot\\t%0.<Vtype>, %1.<Vdottype>, %2.<Vdottype>"
   [(set_attr "type" "neon_dot<q>")]
 )
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 597f44ce106..64b6d43a1a0 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -31767,28 +31767,28 @@ __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b)
 {
-  return __builtin_aarch64_udot_prodv8qi_uuuu (__r, __a, __b);
+  return __builtin_aarch64_udot_prodv8qi_uuuu (__a, __b, __r);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_aarch64_udot_prodv16qi_uuuu (__r, __a, __b);
+  return __builtin_aarch64_udot_prodv16qi_uuuu (__a, __b, __r);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b)
 {
-  return __builtin_aarch64_sdot_prodv8qi (__r, __a, __b);
+  return __builtin_aarch64_sdot_prodv8qi (__a, __b, __r);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_aarch64_sdot_prodv16qi (__r, __a, __b);
+  return __builtin_aarch64_sdot_prodv16qi (__a, __b, __r);
 }
 
 __extension__ extern __inline uint32x2_t

Thanks,
Richard
Tamar Christina July 22, 2021, 11:50 a.m. UTC | #4
Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.c (TYPES_TERNOP_SUSS,
	aarch64_types_ternop_suss_qualifiers): New.
	* config/aarch64/aarch64-simd-builtins.def (usdot_prod): Use it.
	* config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): Re-organize RTL.
	* config/aarch64/arm_neon.h (vusdot_s32, vusdotq_s32): Use it.

--- inline copy of patch --

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 9ed4b72d005799b8984a858f96d4763e7fa5aa39..f6b41d9c200d6300dee65ba60ae94488231a8a38 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -209,6 +209,10 @@ static enum aarch64_type_qualifiers
 aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none };
 #define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_ternop_suss_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_unsigned, qualifier_none, qualifier_none };
+#define TYPES_TERNOP_SUSS (aarch64_types_ternop_suss_qualifiers)
 
 
 static enum aarch64_type_qualifiers
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index b7f1237b1ffd0d4ca283c853be1cc94b9fc35260..3bb45a82945b143497035ec30d35543b2dad55a3 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -377,7 +377,7 @@
   /* Implemented by <sur><dotprod>_prod<dot_mode>.  */
   BUILTIN_VB (TERNOP, sdot, 0, NONE)
   BUILTIN_VB (TERNOPU, udot, 0, NONE)
-  BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE)
+  BUILTIN_VB (TERNOP_SUSS, usdot_prod, 10, NONE)
   /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>.  */
   BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE)
   BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7332a735d35846e0d9375ad2686ed7ecdb09cd29..bf667b99944e3fcce618a21c77bd5b804b3a0b5d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -599,20 +599,6 @@ (define_insn "aarch64_<sur>dot<vsi2qi>"
   [(set_attr "type" "neon_dot<q>")]
 )
 
-;; These instructions map to the __builtins for the armv8.6a I8MM usdot
-;; (vector) Dot Product operation.
-(define_insn "usdot_prod<vsi2qi>"
-  [(set (match_operand:VS 0 "register_operand" "=w")
-	(plus:VS
-	  (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
-		      (match_operand:<VSI2QI> 3 "register_operand" "w")]
-	  UNSPEC_USDOT)
-	  (match_operand:VS 1 "register_operand" "0")))]
-  "TARGET_I8MM"
-  "usdot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
-  [(set_attr "type" "neon_dot<q>")]
-)
-
 ;; These expands map to the Dot Product optab the vectorizer checks for.
 ;; The auto-vectorizer expects a dot product builtin that also does an
 ;; accumulation into the provided register.
@@ -648,6 +634,20 @@ (define_expand "<sur>dot_prod<vsi2qi>"
   DONE;
 })
 
+;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
+;; (vector) Dot Product operation and the vectorized optab.
+(define_insn "usdot_prod<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand" "=w")
+	(plus:VS
+	  (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
+		      (match_operand:<VSI2QI> 2 "register_operand" "w")]
+	  UNSPEC_USDOT)
+	  (match_operand:VS 3 "register_operand" "0")))]
+  "TARGET_I8MM"
+  "usdot\\t%0.<Vtype>, %1.<Vdottype>, %2.<Vdottype>"
+  [(set_attr "type" "neon_dot<q>")]
+)
+
 ;; These instructions map to the __builtins for the Dot Product
 ;; indexed operations.
 (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 1048d7c7eaac14554142eaa7544159a50929b7f1..8396e872580bc9fb32b872f3915485b02ec2b334 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34021,14 +34021,14 @@ __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
 {
-  return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b);
+  return __builtin_aarch64_usdot_prodv8qi_suss (__a, __b, __r);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
 {
-  return __builtin_aarch64_usdot_prodv16qi_ssus (__r, __a, __b);
+  return __builtin_aarch64_usdot_prodv16qi_suss (__a, __b, __r);
 }
 
 __extension__ extern __inline int32x2_t

> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Tuesday, July 20, 2021 5:16 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH 2/4]AArch64: correct usdot vectorizer and intrinsics
> optabs
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> Sent: Thursday, July 15, 2021 8:35 PM
> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> >> Subject: Re: [PATCH 2/4]AArch64: correct usdot vectorizer and
> >> intrinsics optabs
> >>
> >> Tamar Christina <tamar.christina@arm.com> writes:
> >> > Hi All,
> >> >
> >> > There's a slight mismatch between the vectorizer optabs and the
> >> > intrinsics patterns for NEON.  The vectorizer expects operands[3]
> >> > and operands[0] to be the same but the aarch64 intrinsics expanders
> >> > expect operands[0] and operands[1] to be the same.
> >> >
> >> > This means we need different patterns here.  This adds a separate
> >> > usdot vectorizer pattern which just shuffles around the RTL params.
> >> >
> >> > There's also an inconsistency between the usdot and (u|s)dot
> >> > intrinsics RTL patterns which is not corrected here.
> >> >
> >> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >> >
> >> > Ok for master?
> >>
> >> Couldn't we just change:
> >>
> >> > diff --git a/gcc/config/aarch64/arm_neon.h
> >> > b/gcc/config/aarch64/arm_neon.h index
> >> >
> >>
> 00d76ea937ace5763746478cbdfadf6479e0b15a..17e059efb80fa86a8a32127ac
> >> e4f
> >> > c7f43e2040a8 100644
> >> > --- a/gcc/config/aarch64/arm_neon.h
> >> > +++ b/gcc/config/aarch64/arm_neon.h
> >> > @@ -34039,14 +34039,14 @@ __extension__ extern __inline int32x2_t
> >> > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> >> >  vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)  {
> >> > -  return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b);
> >> > +  return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b);
> >>
> >> …this to __builtin_aarch64_usdot_prodv8qi_ssus (__a, __b, __r) etc.?
> >
> > Not easily, as I was mentioning before, Neon intrinsics have the
> > assumption that operands[0] and operands[1] are the same. And this
> > goes much further than just the header call.
> >
> > The actual type is determined by the optabs and the C stubs that are
> generated.
> >
> > aarch64_init_simd_builtins which creates the C function stubs starts
> > processing arguments from the end and on non-void functions assumes
> > that the value at operands[0] be the return type. So simply moving __r
> > will get it to think that the result type should be uint8x8_t.
> 
> Yeah, the mode of operand 0 (i.e. the output) determines the return type.
> But that mode isn't changing, so the return type will be correct for both input
> operand orders.  It works for me locally with:
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index 88fa5ba5a44..5987d9af7c6 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -610,12 +610,12 @@ (define_expand "cmul<conj_op><mode>3"
>  ;; and so the vectorizer provides r, in which the result has to be accumulated.
>  (define_insn "<sur>dot_prod<vsi2qi>"
>    [(set (match_operand:VS 0 "register_operand" "=w")
> -	(plus:VS (match_operand:VS 1 "register_operand" "0")
> -		(unspec:VS [(match_operand:<VSI2QI> 2 "register_operand"
> "w")
> -			    (match_operand:<VSI2QI> 3 "register_operand"
> "w")]
> -		DOTPROD)))]
> +	(plus:VS (unspec:VS [(match_operand:<VSI2QI> 1
> "register_operand" "w")
> +			     (match_operand:<VSI2QI> 2 "register_operand"
> "w")]
> +			    DOTPROD)
> +		 (match_operand:VS 3 "register_operand" "0")))]
>    "TARGET_DOTPROD"
> -  "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
> +  "<sur>dot\\t%0.<Vtype>, %1.<Vdottype>, %2.<Vdottype>"
>    [(set_attr "type" "neon_dot<q>")]
>  )
> 
> diff --git a/gcc/config/aarch64/arm_neon.h
> b/gcc/config/aarch64/arm_neon.h index 597f44ce106..64b6d43a1a0 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -31767,28 +31767,28 @@ __extension__ extern __inline uint32x2_t
> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b)  {
> -  return __builtin_aarch64_udot_prodv8qi_uuuu (__r, __a, __b);
> +  return __builtin_aarch64_udot_prodv8qi_uuuu (__a, __b, __r);
>  }
> 
>  __extension__ extern __inline uint32x4_t  __attribute__
> ((__always_inline__, __gnu_inline__, __artificial__))
>  vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)  {
> -  return __builtin_aarch64_udot_prodv16qi_uuuu (__r, __a, __b);
> +  return __builtin_aarch64_udot_prodv16qi_uuuu (__a, __b, __r);
>  }
> 
>  __extension__ extern __inline int32x2_t  __attribute__
> ((__always_inline__, __gnu_inline__, __artificial__))
>  vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b)  {
> -  return __builtin_aarch64_sdot_prodv8qi (__r, __a, __b);
> +  return __builtin_aarch64_sdot_prodv8qi (__a, __b, __r);
>  }
> 
>  __extension__ extern __inline int32x4_t  __attribute__
> ((__always_inline__, __gnu_inline__, __artificial__))
>  vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)  {
> -  return __builtin_aarch64_sdot_prodv16qi (__r, __a, __b);
> +  return __builtin_aarch64_sdot_prodv16qi (__a, __b, __r);
>  }
> 
>  __extension__ extern __inline uint32x2_t
> 
> Thanks,
> Richard
Richard Sandiford July 22, 2021, 6:09 p.m. UTC | #5
Tamar Christina <Tamar.Christina@arm.com> writes:
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-builtins.c (TYPES_TERNOP_SUSS,
> 	aarch64_types_ternop_suss_qualifiers): New.
> 	* config/aarch64/aarch64-simd-builtins.def (usdot_prod): Use it.
> 	* config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): Re-organize RTL.
> 	* config/aarch64/arm_neon.h (vusdot_s32, vusdotq_s32): Use it.

OK, thanks.

Richard
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 063f503ebd96657f017dfaa067cb231991376bda..ac5d4fc7ff1e61d404e66193b629986382ee4ffd 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -374,11 +374,10 @@ 
   BUILTIN_VSDQ_I_DI (BINOP, srshl, 0, NONE)
   BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE)
 
-  /* Implemented by <sur><dotprod>_prod<dot_mode>.  */
+  /* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>.  */
   BUILTIN_VB (TERNOP, sdot, 0, NONE)
   BUILTIN_VB (TERNOPU, udot, 0, NONE)
-  BUILTIN_VB (TERNOP_SSUS, usdot_prod, 10, NONE)
-  /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>.  */
+  BUILTIN_VB (TERNOP_SSUS, usdot, 0, NONE)
   BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE)
   BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, NONE)
   BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 74890989cb3045798bf8d0241467eaaf72238297..7397f1ec5ca0cb9e3cdd5c46772f604e640666e4 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -601,7 +601,7 @@  (define_insn "aarch64_<sur>dot<vsi2qi>"
 
 ;; These instructions map to the __builtins for the armv8.6a I8MM usdot
 ;; (vector) Dot Product operation.
-(define_insn "usdot_prod<vsi2qi>"
+(define_insn "aarch64_usdot<vsi2qi>"
   [(set (match_operand:VS 0 "register_operand" "=w")
 	(plus:VS
 	  (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
@@ -648,6 +648,17 @@  (define_expand "<sur>dot_prod<vsi2qi>"
   DONE;
 })
 
+;; Auto-vectorizer pattern for usdot.  The operand[3] and operand[0] are the
+;; RMW parameters that when it comes to the vectorizer.
+(define_expand "usdot_prod<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand")
+	(plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
+			    (match_operand:<VSI2QI> 2 "register_operand")]
+		 UNSPEC_USDOT)
+		 (match_operand:VS 3 "register_operand")))]
+  "TARGET_I8MM"
+)
+
 ;; These instructions map to the __builtins for the Dot Product
 ;; indexed operations.
 (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 00d76ea937ace5763746478cbdfadf6479e0b15a..17e059efb80fa86a8a32127ace4fc7f43e2040a8 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34039,14 +34039,14 @@  __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
 {
-  return __builtin_aarch64_usdot_prodv8qi_ssus (__r, __a, __b);
+  return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
 {
-  return __builtin_aarch64_usdot_prodv16qi_ssus (__r, __a, __b);
+  return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b);
 }
 
 __extension__ extern __inline int32x2_t