diff mbox series

[3/4,AArch32] : Add support for sign differing dot-product usdot for NEON.

Message ID 20210505173923.GA20719@arm.com
State New
Headers show
Series [1/4] middle-end Vect: Add support for dot-product where the sign for the multiplicand changes. | expand

Commit Message

Tamar Christina May 5, 2021, 5:39 p.m. UTC
Hi All,

This adds optabs implementing usdot_prod.

The following testcase:

#define N 480
#define SIGNEDNESS_1 unsigned
#define SIGNEDNESS_2 signed
#define SIGNEDNESS_3 signed
#define SIGNEDNESS_4 unsigned

/* Mixed-sign dot-product reduction used as the example for this patch.

   With the SIGNEDNESS_* macros defined above, A points to signed chars,
   B points to unsigned chars, the per-element product is stored in a
   signed short, and the accumulator RES (and the return value) is an
   unsigned int.

   Returns RES plus the sum of A[i] * B[i] for i in [0, N).

   NOTE(review): noipa is presumably there to keep interprocedural
   optimisations from altering the function so that the loop is compiled
   as written -- confirm against the GCC attribute documentation.  */
SIGNEDNESS_1 int __attribute__ ((noipa))
f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
   SIGNEDNESS_4 char *restrict b)
{
  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
    {
      /* Widen both elements to int before multiplying; the two loads have
	 different signedness (signed A, unsigned B).  */
      int av = a[i];
      int bv = b[i];
      /* Assuming 8-bit char and 16-bit short, the product lies in
	 [-128 * 255, 127 * 255] and so fits in the short without loss.  */
      SIGNEDNESS_2 short mult = av * bv;
      /* The signed product is converted to the unsigned accumulator type
	 for the addition.  */
      res += mult;
    }
  return res;
}

Generates

f:
        vmov.i32        q8, #0  @ v4si
        add     r3, r2, #480
.L2:
        vld1.8  {q10}, [r2]!
        vld1.8  {q9}, [r1]!
        vusdot.s8       q8, q9, q10
        cmp     r3, r2
        bne     .L2
        vadd.i32        d16, d16, d17
        vpadd.i32       d16, d16, d16
        vmov.32 r3, d16[0]
        add     r0, r0, r3
        bx      lr

instead of

f:
        vmov.i32        q8, #0  @ v4si
        add     r3, r2, #480
.L2:
        vld1.8  {q9}, [r2]!
        vld1.8  {q11}, [r1]!
        cmp     r3, r2
        vmull.s8 q10, d18, d22
        vmull.s8 q9, d19, d23
        vaddw.s16       q8, q8, d20
        vaddw.s16       q8, q8, d21
        vaddw.s16       q8, q8, d18
        vaddw.s16       q8, q8, d19
        bne     .L2
        vadd.i32        d16, d16, d17
        vpadd.i32       d16, d16, d16
        vmov.32 r3, d16[0]
        add     r0, r0, r3
        bx      lr

for NEON.  I couldn't figure out whether the MVE instruction vmlaldav.s16 could
be used to emulate this.  Because it would require additional widening to work,
I left MVE out of this patch set, but perhaps someone should take a look.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/arm/neon.md (usdot_prod<vsi2qi>): New.

gcc/testsuite/ChangeLog:

	* gcc.target/arm/simd/vusdot-autovec.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index fec2cc91d24b6eff7b6fc8fdd54f39b3d646c468..23ad411178db77c5d19bee7452bc1070331c1aa0 100644


--

Comments

Christophe Lyon May 6, 2021, 9:23 a.m. UTC | #1
On Wed, 5 May 2021 at 19:39, Tamar Christina via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi All,
>
> This adds optabs implementing usdot_prod.
>
> The following testcase:
>
> #define N 480
> #define SIGNEDNESS_1 unsigned
> #define SIGNEDNESS_2 signed
> #define SIGNEDNESS_3 signed
> #define SIGNEDNESS_4 unsigned
>
> SIGNEDNESS_1 int __attribute__ ((noipa))
> f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
>    SIGNEDNESS_4 char *restrict b)
> {
>   for (__INTPTR_TYPE__ i = 0; i < N; ++i)
>     {
>       int av = a[i];
>       int bv = b[i];
>       SIGNEDNESS_2 short mult = av * bv;
>       res += mult;
>     }
>   return res;
> }
>
> Generates
>
> f:
>         vmov.i32        q8, #0  @ v4si
>         add     r3, r2, #480
> .L2:
>         vld1.8  {q10}, [r2]!
>         vld1.8  {q9}, [r1]!
>         vusdot.s8       q8, q9, q10
>         cmp     r3, r2
>         bne     .L2
>         vadd.i32        d16, d16, d17
>         vpadd.i32       d16, d16, d16
>         vmov.32 r3, d16[0]
>         add     r0, r0, r3
>         bx      lr
>
> instead of
>
> f:
>         vmov.i32        q8, #0  @ v4si
>         add     r3, r2, #480
> .L2:
>         vld1.8  {q9}, [r2]!
>         vld1.8  {q11}, [r1]!
>         cmp     r3, r2
>         vmull.s8 q10, d18, d22
>         vmull.s8 q9, d19, d23
>         vaddw.s16       q8, q8, d20
>         vaddw.s16       q8, q8, d21
>         vaddw.s16       q8, q8, d18
>         vaddw.s16       q8, q8, d19
>         bne     .L2
>         vadd.i32        d16, d16, d17
>         vpadd.i32       d16, d16, d16
>         vmov.32 r3, d16[0]
>         add     r0, r0, r3
>         bx      lr
>
> For NEON.  I couldn't figure out if the MVE instruction vmlaldav.s16 could be
> used to emulate this.  Because it would require additional widening to work I
> left MVE out of this patch set but perhaps someone should take a look.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

I guess you mean arm-linux-gnueabihf ?

>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>         * config/arm/neon.md (usdot_prod<vsi2qi>): New.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/arm/simd/vusdot-autovec.c: New test.
>
> --- inline copy of patch --
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index fec2cc91d24b6eff7b6fc8fdd54f39b3d646c468..23ad411178db77c5d19bee7452bc1070331c1aa0 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -3075,6 +3075,24 @@ (define_expand "<sup>dot_prod<vsi2qi>"
>    DONE;
>  })
>
> +;; Auto-vectorizer pattern for usdot
> +(define_expand "usdot_prod<vsi2qi>"
> +  [(set (match_operand:VCVTI 0 "register_operand")
> +       (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1
> +                                                       "register_operand")
> +                                  (match_operand:<VSI2QI> 2
> +                                                       "register_operand")]
> +                    UNSPEC_DOT_US)
> +                   (match_operand:VCVTI 3 "register_operand")))]
> +  "TARGET_I8MM"
> +{
> +  emit_insn (
> +    gen_neon_usdot<vsi2qi> (operands[3], operands[3], operands[1],
> +                           operands[2]));
> +  emit_insn (gen_rtx_SET (operands[0], operands[3]));
> +  DONE;
> +})
> +
>  (define_expand "neon_copysignf<mode>"
>    [(match_operand:VCVTF 0 "register_operand")
>     (match_operand:VCVTF 1 "register_operand")
> diff --git a/gcc/testsuite/gcc.target/arm/simd/vusdot-autovec.c b/gcc/testsuite/gcc.target/arm/simd/vusdot-autovec.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..7cc56f68817d77d6950df0ab372d6fbaad6b3813
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/vusdot-autovec.c
> @@ -0,0 +1,38 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=armv8.2-a+i8mm" } */
> +
> +#define N 480
> +#define SIGNEDNESS_1 unsigned
> +#define SIGNEDNESS_2 signed
> +#define SIGNEDNESS_3 signed
> +#define SIGNEDNESS_4 unsigned
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
> +   SIGNEDNESS_4 char *restrict b)
> +{
> +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> +    {
> +      int av = a[i];
> +      int bv = b[i];
> +      SIGNEDNESS_2 short mult = av * bv;
> +      res += mult;
> +    }
> +  return res;
> +}
> +
> +SIGNEDNESS_1 int __attribute__ ((noipa))
> +g (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict b,
> +   SIGNEDNESS_4 char *restrict a)
> +{
> +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> +    {
> +      int av = a[i];
> +      int bv = b[i];
> +      SIGNEDNESS_2 short mult = av * bv;
> +      res += mult;
> +    }
> +  return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {vusdot.s8} 2 { target { arm-*-*-gnueabihf } } } } */
>
>
> --
Tamar Christina May 6, 2021, 9:27 a.m. UTC | #2
> -----Original Message-----
> From: Christophe Lyon <christophe.lyon@linaro.org>
> Sent: Thursday, May 6, 2021 10:23 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc Patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>
> Subject: Re: [PATCH 3/4][AArch32]: Add support for sign differing dot-
> product usdot for NEON.
> 
> On Wed, 5 May 2021 at 19:39, Tamar Christina via Gcc-patches <gcc-
> patches@gcc.gnu.org> wrote:
> >
> > Hi All,
> >
> > This adds optabs implementing usdot_prod.
> >
> > The following testcase:
> >
> > #define N 480
> > #define SIGNEDNESS_1 unsigned
> > #define SIGNEDNESS_2 signed
> > #define SIGNEDNESS_3 signed
> > #define SIGNEDNESS_4 unsigned
> >
> > SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res,
> > SIGNEDNESS_3 char *restrict a,
> >    SIGNEDNESS_4 char *restrict b)
> > {
> >   for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> >     {
> >       int av = a[i];
> >       int bv = b[i];
> >       SIGNEDNESS_2 short mult = av * bv;
> >       res += mult;
> >     }
> >   return res;
> > }
> >
> > Generates
> >
> > f:
> >         vmov.i32        q8, #0  @ v4si
> >         add     r3, r2, #480
> > .L2:
> >         vld1.8  {q10}, [r2]!
> >         vld1.8  {q9}, [r1]!
> >         vusdot.s8       q8, q9, q10
> >         cmp     r3, r2
> >         bne     .L2
> >         vadd.i32        d16, d16, d17
> >         vpadd.i32       d16, d16, d16
> >         vmov.32 r3, d16[0]
> >         add     r0, r0, r3
> >         bx      lr
> >
> > instead of
> >
> > f:
> >         vmov.i32        q8, #0  @ v4si
> >         add     r3, r2, #480
> > .L2:
> >         vld1.8  {q9}, [r2]!
> >         vld1.8  {q11}, [r1]!
> >         cmp     r3, r2
> >         vmull.s8 q10, d18, d22
> >         vmull.s8 q9, d19, d23
> >         vaddw.s16       q8, q8, d20
> >         vaddw.s16       q8, q8, d21
> >         vaddw.s16       q8, q8, d18
> >         vaddw.s16       q8, q8, d19
> >         bne     .L2
> >         vadd.i32        d16, d16, d17
> >         vpadd.i32       d16, d16, d16
> >         vmov.32 r3, d16[0]
> >         add     r0, r0, r3
> >         bx      lr
> >
> > For NEON.  I couldn't figure out if the MVE instruction vmlaldav.s16
> > could be used to emulate this.  Because it would require additional
> > widening to work I left MVE out of this patch set but perhaps someone
> should take a look.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> I guess you mean arm-linux-gnueabihf ?
> 

Oops, yeah, I was on autopilot...

> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> >         * config/arm/neon.md (usdot_prod<vsi2qi>): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/arm/simd/vusdot-autovec.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index
> >
> fec2cc91d24b6eff7b6fc8fdd54f39b3d646c468..23ad411178db77c5d19bee7452
> bc
> > 1070331c1aa0 100644
> > --- a/gcc/config/arm/neon.md
> > +++ b/gcc/config/arm/neon.md
> > @@ -3075,6 +3075,24 @@ (define_expand "<sup>dot_prod<vsi2qi>"
> >    DONE;
> >  })
> >
> > +;; Auto-vectorizer pattern for usdot
> > +(define_expand "usdot_prod<vsi2qi>"
> > +  [(set (match_operand:VCVTI 0 "register_operand")
> > +       (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1
> > +                                                       "register_operand")
> > +                                  (match_operand:<VSI2QI> 2
> > +                                                       "register_operand")]
> > +                    UNSPEC_DOT_US)
> > +                   (match_operand:VCVTI 3 "register_operand")))]
> > +  "TARGET_I8MM"
> > +{
> > +  emit_insn (
> > +    gen_neon_usdot<vsi2qi> (operands[3], operands[3], operands[1],
> > +                           operands[2]));
> > +  emit_insn (gen_rtx_SET (operands[0], operands[3]));
> > +  DONE;
> > +})
> > +
> >  (define_expand "neon_copysignf<mode>"
> >    [(match_operand:VCVTF 0 "register_operand")
> >     (match_operand:VCVTF 1 "register_operand") diff --git
> > a/gcc/testsuite/gcc.target/arm/simd/vusdot-autovec.c
> > b/gcc/testsuite/gcc.target/arm/simd/vusdot-autovec.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..7cc56f68817d77d6950df0ab37
> 2d
> > 6fbaad6b3813
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/simd/vusdot-autovec.c
> > @@ -0,0 +1,38 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -march=armv8.2-a+i8mm" } */
> > +
> > +#define N 480
> > +#define SIGNEDNESS_1 unsigned
> > +#define SIGNEDNESS_2 signed
> > +#define SIGNEDNESS_3 signed
> > +#define SIGNEDNESS_4 unsigned
> > +
> > +SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res,
> > +SIGNEDNESS_3 char *restrict a,
> > +   SIGNEDNESS_4 char *restrict b)
> > +{
> > +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> > +    {
> > +      int av = a[i];
> > +      int bv = b[i];
> > +      SIGNEDNESS_2 short mult = av * bv;
> > +      res += mult;
> > +    }
> > +  return res;
> > +}
> > +
> > +SIGNEDNESS_1 int __attribute__ ((noipa)) g (SIGNEDNESS_1 int res,
> > +SIGNEDNESS_3 char *restrict b,
> > +   SIGNEDNESS_4 char *restrict a)
> > +{
> > +  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> > +    {
> > +      int av = a[i];
> > +      int bv = b[i];
> > +      SIGNEDNESS_2 short mult = av * bv;
> > +      res += mult;
> > +    }
> > +  return res;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {vusdot.s8} 2 { target {
> > +arm-*-*-gnueabihf } } } } */
> >
> >
> > --
diff mbox series

Patch

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index fec2cc91d24b6eff7b6fc8fdd54f39b3d646c468..23ad411178db77c5d19bee7452bc1070331c1aa0 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3075,6 +3075,24 @@  (define_expand "<sup>dot_prod<vsi2qi>"
   DONE;
 })
 
+;; Auto-vectorizer pattern for usdot
+(define_expand "usdot_prod<vsi2qi>"
+  [(set (match_operand:VCVTI 0 "register_operand")
+	(plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1
+							"register_operand")
+				   (match_operand:<VSI2QI> 2
+							"register_operand")]
+		     UNSPEC_DOT_US)
+		    (match_operand:VCVTI 3 "register_operand")))]
+  "TARGET_I8MM"
+{
+  emit_insn (
+    gen_neon_usdot<vsi2qi> (operands[3], operands[3], operands[1],
+			    operands[2]));
+  emit_insn (gen_rtx_SET (operands[0], operands[3]));
+  DONE;
+})
+
 (define_expand "neon_copysignf<mode>"
   [(match_operand:VCVTF 0 "register_operand")
    (match_operand:VCVTF 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/arm/simd/vusdot-autovec.c b/gcc/testsuite/gcc.target/arm/simd/vusdot-autovec.c
new file mode 100644
index 0000000000000000000000000000000000000000..7cc56f68817d77d6950df0ab372d6fbaad6b3813
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/vusdot-autovec.c
@@ -0,0 +1,38 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+i8mm" } */
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+   SIGNEDNESS_4 char *restrict b)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+g (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict b,
+   SIGNEDNESS_4 char *restrict a)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-times {vusdot.s8} 2 { target { arm-*-*-gnueabihf } } } } */