diff mbox series

[1/7] arm: Auto-vectorization for MVE: vand

Message ID 1606312469-1370-1-git-send-email-christophe.lyon@linaro.org
State New
Headers show
Series [1/7] arm: Auto-vectorization for MVE: vand | expand

Commit Message

Christophe Lyon Nov. 25, 2020, 1:54 p.m. UTC
This patch enables MVE vandq instructions for auto-vectorization.  MVE
vandq insns in mve.md are modified to use 'and' instead of unspec
expression to support and<mode>3.  The and<mode>3 expander is added to
vec-common.md

2020-11-12  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* gcc/config/arm/iterators.md (supf): Remove VANDQ_S and VANDQ_U.
	(VANQ): Remove.
	* config/arm/mve.md (mve_vandq_u<mode>): New entry for vand
	instruction using expression and.
	(mve_vandq_s<mode>): New expander.
	* config/arm/neon.md (and<mode>3): Renamed into and<mode>3_neon.
	* config/arm/unspecs.md (VANDQ_S, VANDQ_U): Remove.
	* config/arm/vec-common.md (and<mode>3): New expander.

	gcc/testsuite/
	* gcc.target/arm/simd/mve-vand.c: New test.
---
 gcc/config/arm/iterators.md                  |  4 +---
 gcc/config/arm/mve.md                        | 20 ++++++++++++----
 gcc/config/arm/neon.md                       |  2 +-
 gcc/config/arm/unspecs.md                    |  2 --
 gcc/config/arm/vec-common.md                 | 15 ++++++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-vand.c | 34 ++++++++++++++++++++++++++++
 6 files changed, 66 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vand.c

Comments

Andre Vieira (lists) Nov. 25, 2020, 5:17 p.m. UTC | #1
Hi Christophe,

Thanks for these! See some inline comments.

On 25/11/2020 13:54, Christophe Lyon via Gcc-patches wrote:
> This patch enables MVE vandq instructions for auto-vectorization.  MVE
> vandq insns in mve.md are modified to use 'and' instead of unspec
> expression to support and<mode>3.  The and<mode>3 expander is added to
> vec-common.md
>
> 2020-11-12  Christophe Lyon  <christophe.lyon@linaro.org>
>
> 	gcc/
> 	* gcc/config/arm/iterators.md (supf): Remove VANDQ_S and VANDQ_U.
> 	(VANQ): Remove.
> 	* config/arm/mve.md (mve_vandq_u<mode>): New entry for vand
> 	instruction using expression and.
> 	(mve_vandq_s<mode>): New expander.
> 	* config/arm/neon.md (and<mode>3): Renamed into and<mode>3_neon.
> 	* config/arm/unspecs.md (VANDQ_S, VANDQ_U): Remove.
> 	* config/arm/vec-common.md (and<mode>3): New expander.
>
> 	gcc/testsuite/
> 	* gcc.target/arm/simd/mve-vand.c: New test.
> ---
>   gcc/config/arm/iterators.md                  |  4 +---
>   gcc/config/arm/mve.md                        | 20 ++++++++++++----
>   gcc/config/arm/neon.md                       |  2 +-
>   gcc/config/arm/unspecs.md                    |  2 --
>   gcc/config/arm/vec-common.md                 | 15 ++++++++++++
>   gcc/testsuite/gcc.target/arm/simd/mve-vand.c | 34 ++++++++++++++++++++++++++++
>   6 files changed, 66 insertions(+), 11 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vand.c
>
> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> index 592af35..72039e4 100644
> --- a/gcc/config/arm/iterators.md
> +++ b/gcc/config/arm/iterators.md
> @@ -1232,8 +1232,7 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
>   		       (VADDLVQ_P_U "u") (VCMPNEQ_U "u") (VCMPNEQ_S "s")
>   		       (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
>   		       (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
> -		       (VADDVQ_P_S "s")	(VADDVQ_P_U "u") (VANDQ_S "s")
> -		       (VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u")
> +		       (VADDVQ_P_S "s")	(VADDVQ_P_U "u") (VBICQ_S "s") (VBICQ_U "u")
>   		       (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s")
>   		       (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s")
>   		       (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u")
> @@ -1501,7 +1500,6 @@ (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
>   (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
>   (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
>   (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
> -(define_int_iterator VANDQ [VANDQ_U VANDQ_S])
>   (define_int_iterator VBICQ [VBICQ_S VBICQ_U])
>   (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
>   (define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U])
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index ecbaaa9..975eb7d 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -894,17 +894,27 @@ (define_insn "mve_vaddvq_p_<supf><mode>"
>   ;;
>   ;; [vandq_u, vandq_s])
>   ;;
> -(define_insn "mve_vandq_<supf><mode>"
> +;; signed and unsigned versions are the same: define the unsigned
> +;; insn, and use an expander for the signed one as we still reference
> +;; both names from arm_mve.h.
> +(define_insn "mve_vandq_u<mode>"
>     [
>      (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> -	(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
> -		       (match_operand:MVE_2 2 "s_register_operand" "w")]
> -	 VANDQ))
> +	(and:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
> +		       (match_operand:MVE_2 2 "s_register_operand" "w")))
The predicate on the second operand is more restrictive than the one in 
expand 'neon_inv_logic_op2'. This should still work with immediates, or 
well I checked for integers, it generates a loop as such:

         vldrw.32        q3, [r0]
         vldr.64 d4, .L8
         vldr.64 d5, .L8+8
         vand    q3, q3, q2
         vstrw.32        q3, [r2]

MVE does support vand's with immediates, just like NEON, I suspect you 
could just copy the way Neon handles these, possibly worth the little 
extra effort. You can use dest[i] = a[i] & ~1 as a testcase.
If you don't it might still be worth expanding the test to make sure 
other immediates-types combinations don't trigger an ICE?

I'm not sure I understand why it loads it in two 64-bit chunks and not 
do a single load or not just do something like a vmov or vbic immediate. 
Anyhow that's a worry for another day I guess..
>     ]
>     "TARGET_HAVE_MVE"
> -  "vand %q0, %q1, %q2"
> +  "vand\t%q0, %q1, %q2"
>     [(set_attr "type" "mve_move")
>   ])
> +(define_expand "mve_vandq_s<mode>"
> +  [
> +   (set (match_operand:MVE_2 0 "s_register_operand")
> +	(and:MVE_2 (match_operand:MVE_2 1 "s_register_operand")
> +		       (match_operand:MVE_2 2 "s_register_operand")))
> +  ]
> +  "TARGET_HAVE_MVE"
> +)
>   
>   ;;
>   ;; [vbicq_s, vbicq_u])
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index 2d76769..dc4707d 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -712,7 +712,7 @@ (define_insn "ior<mode>3"
>   ;; corresponds to the canonical form the middle-end expects to use for
>   ;; immediate bitwise-ANDs.
>   
> -(define_insn "and<mode>3"
> +(define_insn "and<mode>3_neon"
>     [(set (match_operand:VDQ 0 "s_register_operand" "=w,w")
>   	(and:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0")
>   		 (match_operand:VDQ 2 "neon_inv_logic_op2" "w,DL")))]
> diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
> index a3844e9..e8bf68e 100644
> --- a/gcc/config/arm/unspecs.md
> +++ b/gcc/config/arm/unspecs.md
> @@ -601,7 +601,6 @@ (define_c_enum "unspec" [
>     VADDQ_N_S
>     VADDVAQ_S
>     VADDVQ_P_S
> -  VANDQ_S
>     VBICQ_S
>     VBRSRQ_N_S
>     VCADDQ_ROT270_S
> @@ -648,7 +647,6 @@ (define_c_enum "unspec" [
>     VADDQ_N_U
>     VADDVAQ_U
>     VADDVQ_P_U
> -  VANDQ_U
>     VBICQ_U
>     VBRSRQ_N_U
>     VCADDQ_ROT270_U
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> index 250e503..3dd694c 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -172,3 +172,18 @@ (define_expand "vec_set<mode>"
>   					       GEN_INT (elem), operands[0]));
>     DONE;
>   })
> +
> +(define_expand "and<mode>3"
> +  [(set (match_operand:VNIM1 0 "s_register_operand" "")
> +	(and:VNIM1 (match_operand:VNIM1 1 "s_register_operand" "")
> +		   (match_operand:VNIM1 2 "neon_inv_logic_op2" "")))]
> +  "TARGET_NEON
> +   || (TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))"
> +)
> +
> +(define_expand "and<mode>3"
> +  [(set (match_operand:VNINOTM1 0 "s_register_operand" "")
> +	(and:VNINOTM1 (match_operand:VNINOTM1 1 "s_register_operand" "")
> +		      (match_operand:VNINOTM1 2 "neon_inv_logic_op2" "")))]
> +  "TARGET_NEON"
> +)
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vand.c b/gcc/testsuite/gcc.target/arm/simd/mve-vand.c
> new file mode 100644
> index 0000000..2e30cd0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vand.c
> @@ -0,0 +1,34 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> +/* { dg-add-options arm_v8_1m_mve } */
> +/* { dg-additional-options "-O3" } */
> +
> +#include <stdint.h>
> +
> +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)				\
> +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> +    int i;								\
> +    for (i=0; i<NB; i++) {						\
> +      dest[i] = a[i] OP b[i];						\
> +    }									\
> +}
> +
> +/* 64-bit vectors.  */
> +FUNC(s, int, 32, 2, &, vand)
> +FUNC(u, uint, 32, 2, &, vand)
> +FUNC(s, int, 16, 4, &, vand)
> +FUNC(u, uint, 16, 4, &, vand)
> +FUNC(s, int, 8, 8, &, vand)
> +FUNC(u, uint, 8, 8, &, vand)
> +
> +/* 128-bit vectors.  */
> +FUNC(s, int, 32, 4, &, vand)
> +FUNC(u, uint, 32, 4, &, vand)
> +FUNC(s, int, 16, 8, &, vand)
> +FUNC(u, uint, 16, 8, &, vand)
> +FUNC(s, int, 8, 16, &, vand)
> +FUNC(u, uint, 8, 16, &, vand)
> +
> +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> +   functions above.  */
> +/* { dg-final { scan-assembler-times {vand\tq[0-9]+, q[0-9]+, q[0-9]+} 6 } } */
Christophe Lyon Nov. 26, 2020, 3:31 p.m. UTC | #2
Hi Andre,

Thanks for the quick feedback.

On Wed, 25 Nov 2020 at 18:17, Andre Simoes Dias Vieira
<andre.simoesdiasvieira@arm.com> wrote:
>
> Hi Christophe,
>
> Thanks for these! See some inline comments.
>
> On 25/11/2020 13:54, Christophe Lyon via Gcc-patches wrote:
> > This patch enables MVE vandq instructions for auto-vectorization.  MVE
> > vandq insns in mve.md are modified to use 'and' instead of unspec
> > expression to support and<mode>3.  The and<mode>3 expander is added to
> > vec-common.md
> >
> > 2020-11-12  Christophe Lyon  <christophe.lyon@linaro.org>
> >
> >       gcc/
> >       * gcc/config/arm/iterators.md (supf): Remove VANDQ_S and VANDQ_U.
> >       (VANQ): Remove.
> >       * config/arm/mve.md (mve_vandq_u<mode>): New entry for vand
> >       instruction using expression and.
> >       (mve_vandq_s<mode>): New expander.
> >       * config/arm/neon.md (and<mode>3): Renamed into and<mode>3_neon.
> >       * config/arm/unspecs.md (VANDQ_S, VANDQ_U): Remove.
> >       * config/arm/vec-common.md (and<mode>3): New expander.
> >
> >       gcc/testsuite/
> >       * gcc.target/arm/simd/mve-vand.c: New test.
> > ---
> >   gcc/config/arm/iterators.md                  |  4 +---
> >   gcc/config/arm/mve.md                        | 20 ++++++++++++----
> >   gcc/config/arm/neon.md                       |  2 +-
> >   gcc/config/arm/unspecs.md                    |  2 --
> >   gcc/config/arm/vec-common.md                 | 15 ++++++++++++
> >   gcc/testsuite/gcc.target/arm/simd/mve-vand.c | 34 ++++++++++++++++++++++++++++
> >   6 files changed, 66 insertions(+), 11 deletions(-)
> >   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vand.c
> >
> > diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> > index 592af35..72039e4 100644
> > --- a/gcc/config/arm/iterators.md
> > +++ b/gcc/config/arm/iterators.md
> > @@ -1232,8 +1232,7 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
> >                      (VADDLVQ_P_U "u") (VCMPNEQ_U "u") (VCMPNEQ_S "s")
> >                      (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
> >                      (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
> > -                    (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VANDQ_S "s")
> > -                    (VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u")
> > +                    (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VBICQ_S "s") (VBICQ_U "u")
> >                      (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s")
> >                      (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s")
> >                      (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u")
> > @@ -1501,7 +1500,6 @@ (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
> >   (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
> >   (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
> >   (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
> > -(define_int_iterator VANDQ [VANDQ_U VANDQ_S])
> >   (define_int_iterator VBICQ [VBICQ_S VBICQ_U])
> >   (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
> >   (define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U])
> > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> > index ecbaaa9..975eb7d 100644
> > --- a/gcc/config/arm/mve.md
> > +++ b/gcc/config/arm/mve.md
> > @@ -894,17 +894,27 @@ (define_insn "mve_vaddvq_p_<supf><mode>"
> >   ;;
> >   ;; [vandq_u, vandq_s])
> >   ;;
> > -(define_insn "mve_vandq_<supf><mode>"
> > +;; signed and unsigned versions are the same: define the unsigned
> > +;; insn, and use an expander for the signed one as we still reference
> > +;; both names from arm_mve.h.
> > +(define_insn "mve_vandq_u<mode>"
> >     [
> >      (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> > -     (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
> > -                    (match_operand:MVE_2 2 "s_register_operand" "w")]
> > -      VANDQ))
> > +     (and:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
> > +                    (match_operand:MVE_2 2 "s_register_operand" "w")))
> The predicate on the second operand is more restrictive than the one in
> expand 'neon_inv_logic_op2'. This should still work with immediates, or
> well I checked for integers, it generates a loop as such:
>
Right, thanks for catching it.

>          vldrw.32        q3, [r0]
>          vldr.64 d4, .L8
>          vldr.64 d5, .L8+8
>          vand    q3, q3, q2
>          vstrw.32        q3, [r2]
>
> MVE does support vand's with immediates, just like NEON, I suspect you
> could just copy the way Neon handles these, possibly worth the little
> extra effort. You can use dest[i] = a[i] & ~1 as a testcase.
> If you don't it might still be worth expanding the test to make sure
> other immediates-types combinations don't trigger an ICE?
>
> I'm not sure I understand why it loads it in two 64-bit chunks and not
> do a single load or not just do something like a vmov or vbic immediate.
> Anyhow that's a worry for another day I guess..

Do you mean something like the attached (on top of this patch)?
I dislike the code duplication in mve_vandq_u<mode> which would
become a copy of and<mode>3_neon.

The other concern is that it's not exercised by testcase: as you noted
the compiler uses a pair of loads to prepare the second operand.

But indeed that's probably a separate problem.

I guess your comments apply to patch 2 (vorr)?

Thanks,

Christophe


> >     ]
> >     "TARGET_HAVE_MVE"
> > -  "vand %q0, %q1, %q2"
> > +  "vand\t%q0, %q1, %q2"
> >     [(set_attr "type" "mve_move")
> >   ])
> > +(define_expand "mve_vandq_s<mode>"
> > +  [
> > +   (set (match_operand:MVE_2 0 "s_register_operand")
> > +     (and:MVE_2 (match_operand:MVE_2 1 "s_register_operand")
> > +                    (match_operand:MVE_2 2 "s_register_operand")))
> > +  ]
> > +  "TARGET_HAVE_MVE"
> > +)
> >
> >   ;;
> >   ;; [vbicq_s, vbicq_u])
> > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > index 2d76769..dc4707d 100644
> > --- a/gcc/config/arm/neon.md
> > +++ b/gcc/config/arm/neon.md
> > @@ -712,7 +712,7 @@ (define_insn "ior<mode>3"
> >   ;; corresponds to the canonical form the middle-end expects to use for
> >   ;; immediate bitwise-ANDs.
> >
> > -(define_insn "and<mode>3"
> > +(define_insn "and<mode>3_neon"
> >     [(set (match_operand:VDQ 0 "s_register_operand" "=w,w")
> >       (and:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0")
> >                (match_operand:VDQ 2 "neon_inv_logic_op2" "w,DL")))]
> > diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
> > index a3844e9..e8bf68e 100644
> > --- a/gcc/config/arm/unspecs.md
> > +++ b/gcc/config/arm/unspecs.md
> > @@ -601,7 +601,6 @@ (define_c_enum "unspec" [
> >     VADDQ_N_S
> >     VADDVAQ_S
> >     VADDVQ_P_S
> > -  VANDQ_S
> >     VBICQ_S
> >     VBRSRQ_N_S
> >     VCADDQ_ROT270_S
> > @@ -648,7 +647,6 @@ (define_c_enum "unspec" [
> >     VADDQ_N_U
> >     VADDVAQ_U
> >     VADDVQ_P_U
> > -  VANDQ_U
> >     VBICQ_U
> >     VBRSRQ_N_U
> >     VCADDQ_ROT270_U
> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > index 250e503..3dd694c 100644
> > --- a/gcc/config/arm/vec-common.md
> > +++ b/gcc/config/arm/vec-common.md
> > @@ -172,3 +172,18 @@ (define_expand "vec_set<mode>"
> >                                              GEN_INT (elem), operands[0]));
> >     DONE;
> >   })
> > +
> > +(define_expand "and<mode>3"
> > +  [(set (match_operand:VNIM1 0 "s_register_operand" "")
> > +     (and:VNIM1 (match_operand:VNIM1 1 "s_register_operand" "")
> > +                (match_operand:VNIM1 2 "neon_inv_logic_op2" "")))]
> > +  "TARGET_NEON
> > +   || (TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))"
> > +)
> > +
> > +(define_expand "and<mode>3"
> > +  [(set (match_operand:VNINOTM1 0 "s_register_operand" "")
> > +     (and:VNINOTM1 (match_operand:VNINOTM1 1 "s_register_operand" "")
> > +                   (match_operand:VNINOTM1 2 "neon_inv_logic_op2" "")))]
> > +  "TARGET_NEON"
> > +)
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vand.c b/gcc/testsuite/gcc.target/arm/simd/mve-vand.c
> > new file mode 100644
> > index 0000000..2e30cd0
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vand.c
> > @@ -0,0 +1,34 @@
> > +/* { dg-do assemble } */
> > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> > +/* { dg-add-options arm_v8_1m_mve } */
> > +/* { dg-additional-options "-O3" } */
> > +
> > +#include <stdint.h>
> > +
> > +#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)                         \
> > +  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
> > +    int i;                                                           \
> > +    for (i=0; i<NB; i++) {                                           \
> > +      dest[i] = a[i] OP b[i];                                                \
> > +    }                                                                        \
> > +}
> > +
> > +/* 64-bit vectors.  */
> > +FUNC(s, int, 32, 2, &, vand)
> > +FUNC(u, uint, 32, 2, &, vand)
> > +FUNC(s, int, 16, 4, &, vand)
> > +FUNC(u, uint, 16, 4, &, vand)
> > +FUNC(s, int, 8, 8, &, vand)
> > +FUNC(u, uint, 8, 8, &, vand)
> > +
> > +/* 128-bit vectors.  */
> > +FUNC(s, int, 32, 4, &, vand)
> > +FUNC(u, uint, 32, 4, &, vand)
> > +FUNC(s, int, 16, 8, &, vand)
> > +FUNC(u, uint, 16, 8, &, vand)
> > +FUNC(s, int, 8, 16, &, vand)
> > +FUNC(u, uint, 8, 16, &, vand)
> > +
> > +/* MVE has only 128-bit vectors, so we can vectorize only half of the
> > +   functions above.  */
> > +/* { dg-final { scan-assembler-times {vand\tq[0-9]+, q[0-9]+, q[0-9]+} 6 } } */
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 15b37f0..d6fe3de 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -939,19 +939,27 @@ (define_insn "mve_vaddvq_p_<supf><mode>"
 ;; both names from arm_mve.h.
 (define_insn "mve_vandq_u<mode>"
   [
-   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
-	(and:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
-		       (match_operand:MVE_2 2 "s_register_operand" "w")))
+   (set (match_operand:MVE_2 0 "s_register_operand" "=w,w")
+	(and:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w,0")
+		   (match_operand:MVE_2 2 "neon_inv_logic_op2" "w,DL")))
   ]
   "TARGET_HAVE_MVE"
-  "vand\t%q0, %q1, %q2"
+  {
+  switch (which_alternative)
+    {
+    case 0: return "vand\t%q0, %q1, %q2";
+    case 1: return neon_output_logic_immediate ("vand", &operands[2],
+		     <MODE>mode, 1, VALID_NEON_QREG_MODE (<MODE>mode));
+    default: gcc_unreachable ();
+  }
+  }
   [(set_attr "type" "mve_move")
 ])
 (define_expand "mve_vandq_s<mode>"
   [
    (set (match_operand:MVE_2 0 "s_register_operand")
 	(and:MVE_2 (match_operand:MVE_2 1 "s_register_operand")
-		       (match_operand:MVE_2 2 "s_register_operand")))
+		   (match_operand:MVE_2 2 "neon_inv_logic_op2")))
   ]
   "TARGET_HAVE_MVE"
 )
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index 2144520..5f58f7c 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -107,7 +107,7 @@ (define_predicate "vpr_register_operand"
 (define_predicate "imm_for_neon_inv_logic_operand"
   (match_code "const_vector")
 {
-  return (TARGET_NEON
+  return ((TARGET_NEON || TARGET_HAVE_MVE)
           && neon_immediate_valid_for_logic (op, mode, 1, NULL, NULL));
 })
 
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vand.c b/gcc/testsuite/gcc.target/arm/simd/mve-vand.c
index 2e30cd0..341e9aa0 100644
--- a/gcc/testsuite/gcc.target/arm/simd/mve-vand.c
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vand.c
@@ -13,6 +13,14 @@
     }									\
 }
 
+#define FUNC_IMM(SIGN, TYPE, BITS, NB, OP, NAME)				\
+  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = a[i] OP 1;						\
+    }									\
+}
+
 /* 64-bit vectors.  */
 FUNC(s, int, 32, 2, &, vand)
 FUNC(u, uint, 32, 2, &, vand)
@@ -29,6 +37,22 @@ FUNC(u, uint, 16, 8, &, vand)
 FUNC(s, int, 8, 16, &, vand)
 FUNC(u, uint, 8, 16, &, vand)
 
+/* 64-bit vectors.  */
+FUNC_IMM(s, int, 32, 2, &, vandimm)
+FUNC_IMM(u, uint, 32, 2, &, vandimm)
+FUNC_IMM(s, int, 16, 4, &, vandimm)
+FUNC_IMM(u, uint, 16, 4, &, vandimm)
+FUNC_IMM(s, int, 8, 8, &, vandimm)
+FUNC_IMM(u, uint, 8, 8, &, vandimm)
+
+/* 128-bit vectors.  */
+FUNC_IMM(s, int, 32, 4, &, vandimm)
+FUNC_IMM(u, uint, 32, 4, &, vandimm)
+FUNC_IMM(s, int, 16, 8, &, vandimm)
+FUNC_IMM(u, uint, 16, 8, &, vandimm)
+FUNC_IMM(s, int, 8, 16, &, vandimm)
+FUNC_IMM(u, uint, 8, 16, &, vandimm)
+
 /* MVE has only 128-bit vectors, so we can vectorize only half of the
    functions above.  */
-/* { dg-final { scan-assembler-times {vand\tq[0-9]+, q[0-9]+, q[0-9]+} 6 } } */
+/* { dg-final { scan-assembler-times {vand\tq[0-9]+, q[0-9]+, q[0-9]+} 12 } } */
Andre Vieira (lists) Nov. 27, 2020, 2:13 p.m. UTC | #3
Hi Christophe,

On 26/11/2020 15:31, Christophe Lyon wrote:
> Hi Andre,
>
> Thanks for the quick feedback.
>
> On Wed, 25 Nov 2020 at 18:17, Andre Simoes Dias Vieira
> <andre.simoesdiasvieira@arm.com> wrote:
>> Hi Christophe,
>>
>> Thanks for these! See some inline comments.
>>
>> On 25/11/2020 13:54, Christophe Lyon via Gcc-patches wrote:
>>> This patch enables MVE vandq instructions for auto-vectorization.  MVE
>>> vandq insns in mve.md are modified to use 'and' instead of unspec
>>> expression to support and<mode>3.  The and<mode>3 expander is added to
>>> vec-common.md
>>>
>>> 2020-11-12  Christophe Lyon  <christophe.lyon@linaro.org>
>>>
>>>        gcc/
>>>        * gcc/config/arm/iterators.md (supf): Remove VANDQ_S and VANDQ_U.
>>>        (VANQ): Remove.
>>>        * config/arm/mve.md (mve_vandq_u<mode>): New entry for vand
>>>        instruction using expression and.
>>>        (mve_vandq_s<mode>): New expander.
>>>        * config/arm/neon.md (and<mode>3): Renamed into and<mode>3_neon.
>>>        * config/arm/unspecs.md (VANDQ_S, VANDQ_U): Remove.
>>>        * config/arm/vec-common.md (and<mode>3): New expander.
>>>
>>>        gcc/testsuite/
>>>        * gcc.target/arm/simd/mve-vand.c: New test.
>>> ---
>>>    gcc/config/arm/iterators.md                  |  4 +---
>>>    gcc/config/arm/mve.md                        | 20 ++++++++++++----
>>>    gcc/config/arm/neon.md                       |  2 +-
>>>    gcc/config/arm/unspecs.md                    |  2 --
>>>    gcc/config/arm/vec-common.md                 | 15 ++++++++++++
>>>    gcc/testsuite/gcc.target/arm/simd/mve-vand.c | 34 ++++++++++++++++++++++++++++
>>>    6 files changed, 66 insertions(+), 11 deletions(-)
>>>    create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vand.c
>>>
>>> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
>>> index 592af35..72039e4 100644
>>> --- a/gcc/config/arm/iterators.md
>>> +++ b/gcc/config/arm/iterators.md
>>> @@ -1232,8 +1232,7 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
>>>                       (VADDLVQ_P_U "u") (VCMPNEQ_U "u") (VCMPNEQ_S "s")
>>>                       (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
>>>                       (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
>>> -                    (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VANDQ_S "s")
>>> -                    (VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u")
>>> +                    (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VBICQ_S "s") (VBICQ_U "u")
>>>                       (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s")
>>>                       (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s")
>>>                       (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u")
>>> @@ -1501,7 +1500,6 @@ (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
>>>    (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
>>>    (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
>>>    (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
>>> -(define_int_iterator VANDQ [VANDQ_U VANDQ_S])
>>>    (define_int_iterator VBICQ [VBICQ_S VBICQ_U])
>>>    (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
>>>    (define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U])
>>> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
>>> index ecbaaa9..975eb7d 100644
>>> --- a/gcc/config/arm/mve.md
>>> +++ b/gcc/config/arm/mve.md
>>> @@ -894,17 +894,27 @@ (define_insn "mve_vaddvq_p_<supf><mode>"
>>>    ;;
>>>    ;; [vandq_u, vandq_s])
>>>    ;;
>>> -(define_insn "mve_vandq_<supf><mode>"
>>> +;; signed and unsigned versions are the same: define the unsigned
>>> +;; insn, and use an expander for the signed one as we still reference
>>> +;; both names from arm_mve.h.
>>> +(define_insn "mve_vandq_u<mode>"
>>>      [
>>>       (set (match_operand:MVE_2 0 "s_register_operand" "=w")
>>> -     (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
>>> -                    (match_operand:MVE_2 2 "s_register_operand" "w")]
>>> -      VANDQ))
>>> +     (and:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
>>> +                    (match_operand:MVE_2 2 "s_register_operand" "w")))
>> The predicate on the second operand is more restrictive than the one in
>> expand 'neon_inv_logic_op2'. This should still work with immediates, or
>> well I checked for integers, it generates a loop as such:
>>
> Right, thanks for catching it.
>
>>           vldrw.32        q3, [r0]
>>           vldr.64 d4, .L8
>>           vldr.64 d5, .L8+8
>>           vand    q3, q3, q2
>>           vstrw.32        q3, [r2]
>>
>> MVE does support vand's with immediates, just like NEON, I suspect you
>> could just copy the way Neon handles these, possibly worth the little
>> extra effort. You can use dest[i] = a[i] & ~1 as a testcase.
>> If you don't it might still be worth expanding the test to make sure
>> other immediates-types combinations don't trigger an ICE?
>>
>> I'm not sure I understand why it loads it in two 64-bit chunks and not
>> do a single load or not just do something like a vmov or vbic immediate.
>> Anyhow that's a worry for another day I guess..
> Do you mean something like the attached (on top of this patch)?
> I dislike the code duplication in mve_vandq_u<mode> which would
> become a copy of and<mode>3_neon.
Hi Christophe,

Yeah that's what I meant. I agree with the code duplication. The reason 
we still use separate ones is because of the difference in supported 
modes. Maybe the right way around that would be to redefine VDQ as:

(define_mode_iterator VDQ [(V8QI "TARGET_HAVE_NEON") V16QI
                                                      (V4HI 
"TARGET_HAVE_NEON") V8HI
                                                      (V2SI 
"TARGET_HAVE_NEON") V4SI
                                                      (V4HF 
"TARGET_HAVE_NEON") V8HF
                                                      (V2SF 
"TARGET_HAVE_NEON") V4SF V2DI])

And have a single define_expand and insn for both vector extensions. 
Though we would also need to do something about the type attribut in 
case we want to have different scheduling types for both. Though right 
now we don't do any scheduling for MVE, I don't know whether these can 
be conditionalized on target features though, something to look at.

>
> The other concern is that it's not exercised by testcase: as you noted
> the compiler uses a pair of loads to prepare the second operand.
>
> But indeed that's probably a separate problem.
>
> I guess your comments apply to patch 2 (vorr)?

Yeah, forgot to reply to that one, but yes! I still need to have a look 
at 4-7.

Kind regards,
Andre
Christophe Lyon Nov. 27, 2020, 3:29 p.m. UTC | #4
On Fri, 27 Nov 2020 at 15:13, Andre Vieira (lists)
<andre.simoesdiasvieira@arm.com> wrote:
>
> Hi Christophe,
>
> On 26/11/2020 15:31, Christophe Lyon wrote:
> > Hi Andre,
> >
> > Thanks for the quick feedback.
> >
> > On Wed, 25 Nov 2020 at 18:17, Andre Simoes Dias Vieira
> > <andre.simoesdiasvieira@arm.com> wrote:
> >> Hi Christophe,
> >>
> >> Thanks for these! See some inline comments.
> >>
> >> On 25/11/2020 13:54, Christophe Lyon via Gcc-patches wrote:
> >>> This patch enables MVE vandq instructions for auto-vectorization.  MVE
> >>> vandq insns in mve.md are modified to use 'and' instead of unspec
> >>> expression to support and<mode>3.  The and<mode>3 expander is added to
> >>> vec-common.md
> >>>
> >>> 2020-11-12  Christophe Lyon  <christophe.lyon@linaro.org>
> >>>
> >>>        gcc/
> >>>        * gcc/config/arm/iterators.md (supf): Remove VANDQ_S and VANDQ_U.
> >>>        (VANQ): Remove.
> >>>        * config/arm/mve.md (mve_vandq_u<mode>): New entry for vand
> >>>        instruction using expression and.
> >>>        (mve_vandq_s<mode>): New expander.
> >>>        * config/arm/neon.md (and<mode>3): Renamed into and<mode>3_neon.
> >>>        * config/arm/unspecs.md (VANDQ_S, VANDQ_U): Remove.
> >>>        * config/arm/vec-common.md (and<mode>3): New expander.
> >>>
> >>>        gcc/testsuite/
> >>>        * gcc.target/arm/simd/mve-vand.c: New test.
> >>> ---
> >>>    gcc/config/arm/iterators.md                  |  4 +---
> >>>    gcc/config/arm/mve.md                        | 20 ++++++++++++----
> >>>    gcc/config/arm/neon.md                       |  2 +-
> >>>    gcc/config/arm/unspecs.md                    |  2 --
> >>>    gcc/config/arm/vec-common.md                 | 15 ++++++++++++
> >>>    gcc/testsuite/gcc.target/arm/simd/mve-vand.c | 34 ++++++++++++++++++++++++++++
> >>>    6 files changed, 66 insertions(+), 11 deletions(-)
> >>>    create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vand.c
> >>>
> >>> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> >>> index 592af35..72039e4 100644
> >>> --- a/gcc/config/arm/iterators.md
> >>> +++ b/gcc/config/arm/iterators.md
> >>> @@ -1232,8 +1232,7 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
> >>>                       (VADDLVQ_P_U "u") (VCMPNEQ_U "u") (VCMPNEQ_S "s")
> >>>                       (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
> >>>                       (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
> >>> -                    (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VANDQ_S "s")
> >>> -                    (VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u")
> >>> +                    (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VBICQ_S "s") (VBICQ_U "u")
> >>>                       (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s")
> >>>                       (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s")
> >>>                       (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u")
> >>> @@ -1501,7 +1500,6 @@ (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
> >>>    (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
> >>>    (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
> >>>    (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
> >>> -(define_int_iterator VANDQ [VANDQ_U VANDQ_S])
> >>>    (define_int_iterator VBICQ [VBICQ_S VBICQ_U])
> >>>    (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
> >>>    (define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U])
> >>> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> >>> index ecbaaa9..975eb7d 100644
> >>> --- a/gcc/config/arm/mve.md
> >>> +++ b/gcc/config/arm/mve.md
> >>> @@ -894,17 +894,27 @@ (define_insn "mve_vaddvq_p_<supf><mode>"
> >>>    ;;
> >>>    ;; [vandq_u, vandq_s])
> >>>    ;;
> >>> -(define_insn "mve_vandq_<supf><mode>"
> >>> +;; signed and unsigned versions are the same: define the unsigned
> >>> +;; insn, and use an expander for the signed one as we still reference
> >>> +;; both names from arm_mve.h.
> >>> +(define_insn "mve_vandq_u<mode>"
> >>>      [
> >>>       (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> >>> -     (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
> >>> -                    (match_operand:MVE_2 2 "s_register_operand" "w")]
> >>> -      VANDQ))
> >>> +     (and:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
> >>> +                    (match_operand:MVE_2 2 "s_register_operand" "w")))
> >> The predicate on the second operand is more restrictive than the one in
> >> expand 'neon_inv_logic_op2'. This should still work with immediates, or
> >> well I checked for integers, it generates a loop as such:
> >>
> > Right, thanks for catching it.
> >
> >>           vldrw.32        q3, [r0]
> >>           vldr.64 d4, .L8
> >>           vldr.64 d5, .L8+8
> >>           vand    q3, q3, q2
> >>           vstrw.32        q3, [r2]
> >>
> >> MVE does support vand's with immediates, just like NEON, I suspect you
> >> could just copy the way Neon handles these, possibly worth the little
> >> extra effort. You can use dest[i] = a[i] & ~1 as a testcase.
> >> If you don't it might still be worth expanding the test to make sure
> >> other immediates-types combinations don't trigger an ICE?
> >>
> >> I'm not sure I understand why it loads it in two 64-bit chunks and not
> >> do a single load or not just do something like a vmov or vbic immediate.
> >> Anyhow that's a worry for another day I guess..
> > Do you mean something like the attached (on top of this patch)?
> > I dislike the code duplication in mve_vandq_u<mode> which would
> > become a copy of and<mode>3_neon.
> Hi Christophe,
>
> Yeah that's what I meant. I agree with the code duplication. The reason
> we still use separate ones is because of the difference in supported
> modes. Maybe the right way around that would be to redefine VDQ as:
>
> (define_mode_iterator VDQ [(V8QI "TARGET_HAVE_NEON") V16QI
>                                                       (V4HI
> "TARGET_HAVE_NEON") V8HI
>                                                       (V2SI
> "TARGET_HAVE_NEON") V4SI
>                                                       (V4HF
> "TARGET_HAVE_NEON") V8HF
>                                                       (V2SF
> "TARGET_HAVE_NEON") V4SF V2DI])
>
> And have a single define_expand and insn for both vector extensions.

Indeed, I can try that.
I have also noticed the VNIM1 / VNINOTM1 pair.

> Though we would also need to do something about the type attribut in
> case we want to have different scheduling types for both. Though right
> now we don't do any scheduling for MVE, I don't know whether these can
> be conditionalized on target features though, something to look at.
>
> >
> > The other concern is that it's not exercised by testcase: as you noted
> > the compiler uses a pair of loads to prepare the second operand.
> >
> > But indeed that's probably a separate problem.
> >
> > I guess your comments apply to patch 2 (vorr)?
>
> Yeah, forgot to reply to that one, but yes! I still need to have a look
> at 4-7.

Ok thanks, I have a WIP fix for #7 (vmvn) anyway.

> Kind regards,
> Andre
>
Christophe Lyon Nov. 30, 2020, 10:39 a.m. UTC | #5
On Fri, 27 Nov 2020 at 16:29, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
>
> On Fri, 27 Nov 2020 at 15:13, Andre Vieira (lists)
> <andre.simoesdiasvieira@arm.com> wrote:
> >
> > Hi Christophe,
> >
> > On 26/11/2020 15:31, Christophe Lyon wrote:
> > > Hi Andre,
> > >
> > > Thanks for the quick feedback.
> > >
> > > On Wed, 25 Nov 2020 at 18:17, Andre Simoes Dias Vieira
> > > <andre.simoesdiasvieira@arm.com> wrote:
> > >> Hi Christophe,
> > >>
> > >> Thanks for these! See some inline comments.
> > >>
> > >> On 25/11/2020 13:54, Christophe Lyon via Gcc-patches wrote:
> > >>> This patch enables MVE vandq instructions for auto-vectorization.  MVE
> > >>> vandq insns in mve.md are modified to use 'and' instead of unspec
> > >>> expression to support and<mode>3.  The and<mode>3 expander is added to
> > >>> vec-common.md
> > >>>
> > >>> 2020-11-12  Christophe Lyon  <christophe.lyon@linaro.org>
> > >>>
> > >>>        gcc/
> > >>>        * gcc/config/arm/iterators.md (supf): Remove VANDQ_S and VANDQ_U.
> > >>>        (VANQ): Remove.
> > >>>        * config/arm/mve.md (mve_vandq_u<mode>): New entry for vand
> > >>>        instruction using expression and.
> > >>>        (mve_vandq_s<mode>): New expander.
> > >>>        * config/arm/neon.md (and<mode>3): Renamed into and<mode>3_neon.
> > >>>        * config/arm/unspecs.md (VANDQ_S, VANDQ_U): Remove.
> > >>>        * config/arm/vec-common.md (and<mode>3): New expander.
> > >>>
> > >>>        gcc/testsuite/
> > >>>        * gcc.target/arm/simd/mve-vand.c: New test.
> > >>> ---
> > >>>    gcc/config/arm/iterators.md                  |  4 +---
> > >>>    gcc/config/arm/mve.md                        | 20 ++++++++++++----
> > >>>    gcc/config/arm/neon.md                       |  2 +-
> > >>>    gcc/config/arm/unspecs.md                    |  2 --
> > >>>    gcc/config/arm/vec-common.md                 | 15 ++++++++++++
> > >>>    gcc/testsuite/gcc.target/arm/simd/mve-vand.c | 34 ++++++++++++++++++++++++++++
> > >>>    6 files changed, 66 insertions(+), 11 deletions(-)
> > >>>    create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vand.c
> > >>>
> > >>> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> > >>> index 592af35..72039e4 100644
> > >>> --- a/gcc/config/arm/iterators.md
> > >>> +++ b/gcc/config/arm/iterators.md
> > >>> @@ -1232,8 +1232,7 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
> > >>>                       (VADDLVQ_P_U "u") (VCMPNEQ_U "u") (VCMPNEQ_S "s")
> > >>>                       (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
> > >>>                       (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
> > >>> -                    (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VANDQ_S "s")
> > >>> -                    (VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u")
> > >>> +                    (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VBICQ_S "s") (VBICQ_U "u")
> > >>>                       (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s")
> > >>>                       (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s")
> > >>>                       (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u")
> > >>> @@ -1501,7 +1500,6 @@ (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
> > >>>    (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
> > >>>    (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
> > >>>    (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
> > >>> -(define_int_iterator VANDQ [VANDQ_U VANDQ_S])
> > >>>    (define_int_iterator VBICQ [VBICQ_S VBICQ_U])
> > >>>    (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
> > >>>    (define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U])
> > >>> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> > >>> index ecbaaa9..975eb7d 100644
> > >>> --- a/gcc/config/arm/mve.md
> > >>> +++ b/gcc/config/arm/mve.md
> > >>> @@ -894,17 +894,27 @@ (define_insn "mve_vaddvq_p_<supf><mode>"
> > >>>    ;;
> > >>>    ;; [vandq_u, vandq_s])
> > >>>    ;;
> > >>> -(define_insn "mve_vandq_<supf><mode>"
> > >>> +;; signed and unsigned versions are the same: define the unsigned
> > >>> +;; insn, and use an expander for the signed one as we still reference
> > >>> +;; both names from arm_mve.h.
> > >>> +(define_insn "mve_vandq_u<mode>"
> > >>>      [
> > >>>       (set (match_operand:MVE_2 0 "s_register_operand" "=w")
> > >>> -     (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
> > >>> -                    (match_operand:MVE_2 2 "s_register_operand" "w")]
> > >>> -      VANDQ))
> > >>> +     (and:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
> > >>> +                    (match_operand:MVE_2 2 "s_register_operand" "w")))
> > >> The predicate on the second operand is more restrictive than the one in
> > >> expand 'neon_inv_logic_op2'. This should still work with immediates, or
> > >> well I checked for integers, it generates a loop as such:
> > >>
> > > Right, thanks for catching it.
> > >
> > >>           vldrw.32        q3, [r0]
> > >>           vldr.64 d4, .L8
> > >>           vldr.64 d5, .L8+8
> > >>           vand    q3, q3, q2
> > >>           vstrw.32        q3, [r2]
> > >>
> > >> MVE does support vand's with immediates, just like NEON, I suspect you
> > >> could just copy the way Neon handles these, possibly worth the little
> > >> extra effort. You can use dest[i] = a[i] & ~1 as a testcase.
> > >> If you don't it might still be worth expanding the test to make sure
> > >> other immediates-types combinations don't trigger an ICE?
> > >>
> > >> I'm not sure I understand why it loads it in two 64-bit chunks and not
> > >> do a single load or not just do something like a vmov or vbic immediate.
> > >> Anyhow that's a worry for another day I guess..
> > > Do you mean something like the attached (on top of this patch)?
> > > I dislike the code duplication in mve_vandq_u<mode> which would
> > > become a copy of and<mode>3_neon.
> > Hi Christophe,
> >
> > Yeah that's what I meant. I agree with the code duplication. The reason
> > we still use separate ones is because of the difference in supported
> > modes. Maybe the right way around that would be to redefine VDQ as:
> >
> > (define_mode_iterator VDQ [(V8QI "TARGET_HAVE_NEON") V16QI
> >                                                       (V4HI
> > "TARGET_HAVE_NEON") V8HI
> >                                                       (V2SI
> > "TARGET_HAVE_NEON") V4SI
> >                                                       (V4HF
> > "TARGET_HAVE_NEON") V8HF
> >                                                       (V2SF
> > "TARGET_HAVE_NEON") V4SF V2DI])
> >
> > And have a single define_expand and insn for both vector extensions.
>
> Indeed, I can try that.
> I have also noticed the VNIM1 / VNINOTM1 pair.
>
> > Though we would also need to do something about the type attribut in
> > case we want to have different scheduling types for both. Though right
> > now we don't do any scheduling for MVE, I don't know whether these can
> > be conditionalized on target features though, something to look at.
> >
> > >
> > > The other concern is that it's not exercised by testcase: as you noted
> > > the compiler uses a pair of loads to prepare the second operand.
> > >
> > > But indeed that's probably a separate problem.
> > >
> > > I guess your comments apply to patch 2 (vorr)?
> >
> > Yeah, forgot to reply to that one, but yes! I still need to have a look
> > at 4-7.
>
> Ok thanks, I have a WIP fix for #7 (vmvn) anyway.

And I never sent #7 because I knew it wasn't ready yet :-)

>
> > Kind regards,
> > Andre
> >
diff mbox series

Patch

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 592af35..72039e4 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1232,8 +1232,7 @@  (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
 		       (VADDLVQ_P_U "u") (VCMPNEQ_U "u") (VCMPNEQ_S "s")
 		       (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
 		       (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
-		       (VADDVQ_P_S "s")	(VADDVQ_P_U "u") (VANDQ_S "s")
-		       (VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u")
+		       (VADDVQ_P_S "s")	(VADDVQ_P_U "u") (VBICQ_S "s") (VBICQ_U "u")
 		       (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s")
 		       (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s")
 		       (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u")
@@ -1501,7 +1500,6 @@  (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
 (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
 (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
 (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
-(define_int_iterator VANDQ [VANDQ_U VANDQ_S])
 (define_int_iterator VBICQ [VBICQ_S VBICQ_U])
 (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
 (define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index ecbaaa9..975eb7d 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -894,17 +894,27 @@  (define_insn "mve_vaddvq_p_<supf><mode>"
 ;;
 ;; [vandq_u, vandq_s])
 ;;
-(define_insn "mve_vandq_<supf><mode>"
+;; signed and unsigned versions are the same: define the unsigned
+;; insn, and use an expander for the signed one as we still reference
+;; both names from arm_mve.h.
+(define_insn "mve_vandq_u<mode>"
   [
    (set (match_operand:MVE_2 0 "s_register_operand" "=w")
-	(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
-		       (match_operand:MVE_2 2 "s_register_operand" "w")]
-	 VANDQ))
+	(and:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
+		       (match_operand:MVE_2 2 "s_register_operand" "w")))
   ]
   "TARGET_HAVE_MVE"
-  "vand %q0, %q1, %q2"
+  "vand\t%q0, %q1, %q2"
   [(set_attr "type" "mve_move")
 ])
+(define_expand "mve_vandq_s<mode>"
+  [
+   (set (match_operand:MVE_2 0 "s_register_operand")
+	(and:MVE_2 (match_operand:MVE_2 1 "s_register_operand")
+		       (match_operand:MVE_2 2 "s_register_operand")))
+  ]
+  "TARGET_HAVE_MVE"
+)
 
 ;;
 ;; [vbicq_s, vbicq_u])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 2d76769..dc4707d 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -712,7 +712,7 @@  (define_insn "ior<mode>3"
 ;; corresponds to the canonical form the middle-end expects to use for
 ;; immediate bitwise-ANDs.
 
-(define_insn "and<mode>3"
+(define_insn "and<mode>3_neon"
   [(set (match_operand:VDQ 0 "s_register_operand" "=w,w")
 	(and:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0")
 		 (match_operand:VDQ 2 "neon_inv_logic_op2" "w,DL")))]
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index a3844e9..e8bf68e 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -601,7 +601,6 @@  (define_c_enum "unspec" [
   VADDQ_N_S
   VADDVAQ_S
   VADDVQ_P_S
-  VANDQ_S
   VBICQ_S
   VBRSRQ_N_S
   VCADDQ_ROT270_S
@@ -648,7 +647,6 @@  (define_c_enum "unspec" [
   VADDQ_N_U
   VADDVAQ_U
   VADDVQ_P_U
-  VANDQ_U
   VBICQ_U
   VBRSRQ_N_U
   VCADDQ_ROT270_U
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 250e503..3dd694c 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -172,3 +172,18 @@  (define_expand "vec_set<mode>"
 					       GEN_INT (elem), operands[0]));
   DONE;
 })
+
+(define_expand "and<mode>3"
+  [(set (match_operand:VNIM1 0 "s_register_operand" "")
+	(and:VNIM1 (match_operand:VNIM1 1 "s_register_operand" "")
+		   (match_operand:VNIM1 2 "neon_inv_logic_op2" "")))]
+  "TARGET_NEON
+   || (TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode))"
+)
+
+(define_expand "and<mode>3"
+  [(set (match_operand:VNINOTM1 0 "s_register_operand" "")
+	(and:VNINOTM1 (match_operand:VNINOTM1 1 "s_register_operand" "")
+		      (match_operand:VNINOTM1 2 "neon_inv_logic_op2" "")))]
+  "TARGET_NEON"
+)
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vand.c b/gcc/testsuite/gcc.target/arm/simd/mve-vand.c
new file mode 100644
index 0000000..2e30cd0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vand.c
@@ -0,0 +1,34 @@ 
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+#define FUNC(SIGN, TYPE, BITS, NB, OP, NAME)				\
+  void test_ ## NAME ##_ ## SIGN ## BITS ## x ## NB (TYPE##BITS##_t * __restrict__ dest, TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \
+    int i;								\
+    for (i=0; i<NB; i++) {						\
+      dest[i] = a[i] OP b[i];						\
+    }									\
+}
+
+/* 64-bit vectors.  */
+FUNC(s, int, 32, 2, &, vand)
+FUNC(u, uint, 32, 2, &, vand)
+FUNC(s, int, 16, 4, &, vand)
+FUNC(u, uint, 16, 4, &, vand)
+FUNC(s, int, 8, 8, &, vand)
+FUNC(u, uint, 8, 8, &, vand)
+
+/* 128-bit vectors.  */
+FUNC(s, int, 32, 4, &, vand)
+FUNC(u, uint, 32, 4, &, vand)
+FUNC(s, int, 16, 8, &, vand)
+FUNC(u, uint, 16, 8, &, vand)
+FUNC(s, int, 8, 16, &, vand)
+FUNC(u, uint, 8, 16, &, vand)
+
+/* MVE has only 128-bit vectors, so we can vectorize only half of the
+   functions above.  */
+/* { dg-final { scan-assembler-times {vand\tq[0-9]+, q[0-9]+, q[0-9]+} 6 } } */