AArch64 RFC: Don't cost all scalar operations during vectorization if scalar will fuse

Message ID patch-14784-tamar@arm.com
State New
Series AArch64 RFC: Don't cost all scalar operations during vectorization if scalar will fuse

Commit Message

Tamar Christina Aug. 31, 2021, 1:27 p.m. UTC
Hi All,

As the vectorizer's capabilities have improved over time it has started
over-vectorizing.  This has caused regressions on the order of 1-7x in
libraries that Arm produces.

The vector costs actually do make a lot of sense and I don't think that they are
wrong.  I think that the costs for the scalar code are wrong.

In particular the costing doesn't take into account that scalar operations
can/will fuse, since that fusion only happens in RTL.  Because of this the
costs for the scalar code always end up higher.
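
For instance, a dependent multiply and add stays as two GIMPLE statements, but
on AArch64 combine is expected to turn the pair into a single MADD, so costing
them as two full scalar operations overestimates the scalar side.  A rough
illustration (function name made up, expected codegen noted in the comment):

/* The MULT_EXPR feeding the PLUS_EXPR is expected to fuse in RTL.  */
long
fused_madd (long a, long b, long c)
{
  return c + a * b;   /* Expected: a single "madd x0, x0, x1, x2".  */
}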

As an example, take the code from PR 97984:

void x (long * __restrict a, long * __restrict b)
{
  a[0] *= b[0];
  a[1] *= b[1];
  a[0] += b[0];
  a[1] += b[1];
}

generates:

x:
        ldp     x2, x3, [x0]
        ldr     x4, [x1]
        ldr     q1, [x1]
        mul     x2, x2, x4
        ldr     x4, [x1, 8]
        fmov    d0, x2
        ins     v0.d[1], x3
        mul     x1, x3, x4
        ins     v0.d[1], x1
        add     v0.2d, v0.2d, v1.2d
        str     q0, [x0]
        ret

In an actual loop the prologue costs would make the loop too expensive, so we
produce the scalar output; but with SLP there are no loop overhead costs, so we
end up trying to vectorize this.  Because SLP discovery starts from the stores,
we end up vectorizing and costing the ADD but not the MUL.

To counter this the patch adjusts the costing when it finds an operation that
can be fused and discounts the cost of the "other" operation being fused in.

The attached testcases show that even when we discount it we still get
vectorized code when it is profitable to do so, e.g. with SVE.

This happens with other scalar operations as well, for example adds where a
shift can be fused in, or e.g. BFXIL.  As such I'm sending this for feedback.
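
For example, something like the following (illustrative only, made-up function
name) uses the shifted-register form of ADD, so the shift is effectively free
in the scalar code:

long
fused_shift_add (long a, long b)
{
  return a + (b >> 2);   /* Expected: "add x0, x0, x1, asr 2".  */
}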

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master? If the approach is acceptable I can add support for more.

Thanks,
Tamar

gcc/ChangeLog:

	PR target/97984
	* config/aarch64/aarch64.c (aarch64_add_stmt_cost): Check for fusing
	madd.

gcc/testsuite/ChangeLog:

	PR target/97984
	* gcc.target/aarch64/pr97984-1.c: New test.
	* gcc.target/aarch64/pr97984-2.c: New test.
	* gcc.target/aarch64/pr97984-3.c: New test.
	* gcc.target/aarch64/pr97984-4.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4cd4b037f2606e515ad8f4669d2cd13a509dd0a4..329b556311310d86aaf546d7b395a3750a9d57d4 100644


--

Comments

Richard Sandiford Aug. 31, 2021, 2:49 p.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> As the vectorizer has improved over time in capabilities it has started
> over-vectorizing.  This has causes regressions in the order of 1-7x on libraries
> that Arm produces.
>
> The vector costs actually do make a lot of sense and I don't think that they are
> wrong.  I think that the costs for the scalar code are wrong.
>
> In particular the costing doesn't take into effect that scalar operation
> can/will fuse as this happens in RTL.  Because of this the costs for the scalars
> end up being always higher.
>
> As an example the loop in PR 97984:
>
> void x (long * __restrict a, long * __restrict b)
> {
>   a[0] *= b[0];
>   a[1] *= b[1];
>   a[0] += b[0];
>   a[1] += b[1];
> }
>
> generates:
>
> x:
>         ldp     x2, x3, [x0]
>         ldr     x4, [x1]
>         ldr     q1, [x1]
>         mul     x2, x2, x4
>         ldr     x4, [x1, 8]
>         fmov    d0, x2
>         ins     v0.d[1], x3
>         mul     x1, x3, x4
>         ins     v0.d[1], x1
>         add     v0.2d, v0.2d, v1.2d
>         str     q0, [x0]
>         ret
>
> On an actual loop the prologue costs would make the loop too expensive so we
> produce the scalar output, but with SLP there's no loop overhead costs so we end
> up trying to vectorize this. Because SLP discovery is started from the stores we
> will end up vectorizing and costing the add but not the MUL.
>
> To counter this the patch adjusts the costing when it finds an operation that
> can be fused and discounts the cost of the "other" operation being fused in.
>
> The attached testcase shows that even when we discount it we still get still get
> vectorized code when profitable to do so, e.g. SVE.
>
> This happens as well with other operations such as scalar operations where
> shifts can be fused in or for e.g. bfxil.  As such sending this for feedback.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master? If the approach is acceptable I can add support for more.
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	PR target/97984
> 	* config/aarch64/aarch64.c (aarch64_add_stmt_cost): Check for fusing
> 	madd.
>
> gcc/testsuite/ChangeLog:
>
> 	PR target/97984
> 	* gcc.target/aarch64/pr97984-1.c: New test.
> 	* gcc.target/aarch64/pr97984-2.c: New test.
> 	* gcc.target/aarch64/pr97984-3.c: New test.
> 	* gcc.target/aarch64/pr97984-4.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 4cd4b037f2606e515ad8f4669d2cd13a509dd0a4..329b556311310d86aaf546d7b395a3750a9d57d4 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -15536,6 +15536,39 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
>  	stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
>  						  vectype, stmt_cost);
>  
> +      /* Scale costs if operation is fusing.  */
> +      if (stmt_info && kind == scalar_stmt)
> +      {
> +	if (gassign *stmt = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info)))
> +	  {
> +	    switch (gimple_assign_rhs_code (stmt))
> +	    {
> +	    case PLUS_EXPR:
> +	    case MINUS_EXPR:
> +	      {
> +		/* Check if operation can fuse into MSUB or MADD.  */
> +		tree rhs1 = gimple_assign_rhs1 (stmt);
> +		if (gassign *stmt1 = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (rhs1)))
> +		  if (gimple_assign_rhs_code (stmt1) == MULT_EXPR)
> +		    {
> +		      stmt_cost = 0;
> +		      break;
> +		   }
> +		tree rhs2 = gimple_assign_rhs2 (stmt);
> +		if (gassign *stmt2 = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (rhs2)))
> +		  if (gimple_assign_rhs_code (stmt2) == MULT_EXPR)
> +		    {
> +		      stmt_cost = 0;
> +		      break;
> +		    }
> +	      }
> +	      break;
> +	    default:
> +	      break;
> +	    }
> +	  }
> +      }
> +

The difficulty with this is that we can also use MLA-type operations
for SVE, and for Advanced SIMD if the mode is not DI.  It's not just
a scalar thing.

We already take the combination into account (via aarch64_multiply_add_p)
when estimating issue rates.  But we don't take it into account for
latencies because of the reason above: if the multiplications are
vectorisable, then the combination applies to both the scalar and
the vector code, so the adjustments cancel out.  (Admittedly that
decision predates the special Advanced SIMD handling in
aarch64_multiply_add_p, so we might want to revisit it.)

I think the key feature for this testcase is that the multiplication is
not part of the vector code.  I think that's something we need to check
if we're going to cost the scalar code more cheaply.

But for this particular testcase, I think the main problem is that
we count the cost of the scalar loads even though those are needed
by other users.  E.g.:

int foo(long *restrict res, long *restrict foo, long a, long b)
{
  res[0] = ((foo[0] * a) >> 1) + foo[0];
  res[1] = ((foo[1] * b) >> 1) + foo[1];
}

gets vectorised as:

        mov     x4, x0
        mov     w0, w5
        ldr     x5, [x1, 8]
        ldr     x6, [x1]
        mul     x3, x3, x5
        ldr     q1, [x1]
        mul     x2, x2, x6
        fmov    d0, x2
        ins     v0.d[1], x3
        ins     v0.d[1], x3
        ssra    v1.2d, v0.2d, 1
        str     q1, [x4]
        ret

which is surely worse than the scalar:

        mov     x4, x0
        mov     w0, w5
        ldp     x5, x1, [x1]
        mul     x2, x5, x2
        mul     x3, x1, x3
        add     x2, x5, x2, asr 1
        add     x3, x1, x3, asr 1
        stp     x2, x3, [x4]
        ret

even though both versions can hide the shift.  There's also the point
that two-element vectors can be stored using a single scalar STP (and
loaded using a single LDP), so it's not always accurate to cost the
scalar stores as being twice as expensive as the vector stores.

The fact that there are multiple problems doesn't mean that we shouldn't
start with the costing of instruction combinations.  It's just that
tackling the other aspects might be more general.

If we do start by tackling the costing of instruction combinations,
I think we need to introduce the “multiplication is not vectorised”
condition described above.
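
E.g. something like this untested sketch (hypothetical helper name, using
PURE_SLP_STMT purely as a stand-in for "is part of the vector code"):

/* Return true if scalar PLUS/MINUS STMT has a MULT_EXPR operand that is
   not itself being vectorised, so that the MADD/MSUB fusion only
   benefits the scalar code.  */
static bool
aarch64_plus_fuses_with_scalar_mul_p (vec_info *vinfo, gassign *stmt)
{
  for (unsigned i = 1; i <= 2; i++)
    {
      tree rhs = (i == 1 ? gimple_assign_rhs1 (stmt)
		  : gimple_assign_rhs2 (stmt));
      if (TREE_CODE (rhs) != SSA_NAME)
	continue;
      gassign *def = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (rhs));
      if (!def || gimple_assign_rhs_code (def) != MULT_EXPR)
	continue;
      stmt_vec_info def_info = vinfo->lookup_stmt (def);
      /* Only discount the add/sub if the multiplication stays scalar.  */
      if (!def_info || !PURE_SLP_STMT (def_info))
	return true;
    }
  return false;
}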

Thanks,
Richard

>        if (stmt_info && aarch64_use_new_vector_costs_p ())
>  	{
>  	  /* Account for any extra "embedded" costs that apply additively
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-1.c b/gcc/testsuite/gcc.target/aarch64/pr97984-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..9d403eb76ec3a72747f47e718a88ed6b062643f9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr97984-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -fdump-tree-slp-all" } */
> +
> +void x (long * __restrict a, long * __restrict b)
> +{
> +  a[0] *= b[0];
> +  a[1] *= b[1];
> +  a[0] += b[0];
> +  a[1] += b[1];
> +}
> +
> +/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 1 "slp2" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-2.c b/gcc/testsuite/gcc.target/aarch64/pr97984-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..a4086380fd613035f7ce3e8e8c89e853efa1304e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr97984-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -std=c99 -fdump-tree-vect-all" } */
> +
> +void x (long * __restrict a, long * __restrict b, int n)
> +{
> +  for (int i = 0; i < n; i++) {
> +    a[i] *= b[i];
> +    a[i] += b[i];
> +  }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-3.c b/gcc/testsuite/gcc.target/aarch64/pr97984-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..c6c8d351834705998b3c87a40edf1a00a4bb80f9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr97984-3.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -fdump-tree-slp-all" } */
> +
> +void x (long * __restrict a, long * __restrict b, long * __restrict c)
> +{
> +  a[0] *= b[0];
> +  a[1] *= b[1];
> +  a[0] += c[0];
> +  a[1] += c[1];
> +}
> +
> +/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 1 "slp2" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-4.c b/gcc/testsuite/gcc.target/aarch64/pr97984-4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d03b0bb84dd822ab3b61e229c01be4cd1fc7f1d4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr97984-4.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -std=c99 -fdump-tree-vect-all -march=armv8.3-a+sve" } */
> +
> +void x (long * __restrict a, long * __restrict b, int n)
> +{
> +  for (int i = 0; i < n; i++) {
> +    a[i] *= b[i];
> +    a[i] += b[i];
> +  }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" } } */
Richard Biener Sept. 1, 2021, 1:47 p.m. UTC | #2
On Tue, Aug 31, 2021 at 4:50 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Tamar Christina <tamar.christina@arm.com> writes:
> > Hi All,
> >
> > As the vectorizer has improved over time in capabilities it has started
> > over-vectorizing.  This has causes regressions in the order of 1-7x on libraries
> > that Arm produces.
> >
> > The vector costs actually do make a lot of sense and I don't think that they are
> > wrong.  I think that the costs for the scalar code are wrong.
> >
> > In particular the costing doesn't take into effect that scalar operation
> > can/will fuse as this happens in RTL.  Because of this the costs for the scalars
> > end up being always higher.
> >
> > As an example the loop in PR 97984:
> >
> > void x (long * __restrict a, long * __restrict b)
> > {
> >   a[0] *= b[0];
> >   a[1] *= b[1];
> >   a[0] += b[0];
> >   a[1] += b[1];
> > }
> >
> > generates:
> >
> > x:
> >         ldp     x2, x3, [x0]
> >         ldr     x4, [x1]
> >         ldr     q1, [x1]
> >         mul     x2, x2, x4
> >         ldr     x4, [x1, 8]
> >         fmov    d0, x2
> >         ins     v0.d[1], x3
> >         mul     x1, x3, x4
> >         ins     v0.d[1], x1
> >         add     v0.2d, v0.2d, v1.2d
> >         str     q0, [x0]
> >         ret
> >
> > On an actual loop the prologue costs would make the loop too expensive so we
> > produce the scalar output, but with SLP there's no loop overhead costs so we end
> > up trying to vectorize this. Because SLP discovery is started from the stores we
> > will end up vectorizing and costing the add but not the MUL.
> >
> > To counter this the patch adjusts the costing when it finds an operation that
> > can be fused and discounts the cost of the "other" operation being fused in.
> >
> > The attached testcase shows that even when we discount it we still get still get
> > vectorized code when profitable to do so, e.g. SVE.
> >
> > This happens as well with other operations such as scalar operations where
> > shifts can be fused in or for e.g. bfxil.  As such sending this for feedback.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master? If the approach is acceptable I can add support for more.
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> >       PR target/97984
> >       * config/aarch64/aarch64.c (aarch64_add_stmt_cost): Check for fusing
> >       madd.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       PR target/97984
> >       * gcc.target/aarch64/pr97984-1.c: New test.
> >       * gcc.target/aarch64/pr97984-2.c: New test.
> >       * gcc.target/aarch64/pr97984-3.c: New test.
> >       * gcc.target/aarch64/pr97984-4.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index 4cd4b037f2606e515ad8f4669d2cd13a509dd0a4..329b556311310d86aaf546d7b395a3750a9d57d4 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -15536,6 +15536,39 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
> >       stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
> >                                                 vectype, stmt_cost);
> >
> > +      /* Scale costs if operation is fusing.  */
> > +      if (stmt_info && kind == scalar_stmt)
> > +      {
> > +     if (gassign *stmt = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info)))
> > +       {
> > +         switch (gimple_assign_rhs_code (stmt))
> > +         {
> > +         case PLUS_EXPR:
> > +         case MINUS_EXPR:
> > +           {
> > +             /* Check if operation can fuse into MSUB or MADD.  */
> > +             tree rhs1 = gimple_assign_rhs1 (stmt);
> > +             if (gassign *stmt1 = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (rhs1)))
> > +               if (gimple_assign_rhs_code (stmt1) == MULT_EXPR)
> > +                 {
> > +                   stmt_cost = 0;
> > +                   break;
> > +                }
> > +             tree rhs2 = gimple_assign_rhs2 (stmt);
> > +             if (gassign *stmt2 = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (rhs2)))
> > +               if (gimple_assign_rhs_code (stmt2) == MULT_EXPR)
> > +                 {
> > +                   stmt_cost = 0;
> > +                   break;
> > +                 }
> > +           }
> > +           break;
> > +         default:
> > +           break;
> > +         }
> > +       }
> > +      }
> > +
>
> The difficulty with this is that we can also use MLA-type operations
> for SVE, and for Advanced SIMD if the mode is not DI.  It's not just
> a scalar thing.
>
> We already take the combination into account (via aarch64_multiply_add_p)
> when estimating issue rates.  But we don't take it into account for
> latencies because of the reason above: if the multiplications are
> vectorisable, then the combination applies to both the scalar and
> the vector code, so the adjustments cancel out.  (Admittedly that
> decision predates the special Advanced SIMD handling in
> aarch64_multiply_add_p, so we might want to revisit it.)
>
> I think the key feature for this testcase is that the multiplication is
> not part of the vector code.  I think that's something we need to check
> if we're going to cost the scalar code more cheaply.
>
> But for this particular testcase, I think the main problem is that
> we count the cost of the scalar loads even though those are needed
> by other users.  E.g.:
>
> int foo(long *restrict res, long *restrict foo, long a, long b)
> {
>   res[0] = ((foo[0] * a) >> 1) + foo[0];
>   res[1] = ((foo[1] * b) >> 1) + foo[1];
> }
>
> gets vectorised as:
>
>         mov     x4, x0
>         mov     w0, w5
>         ldr     x5, [x1, 8]
>         ldr     x6, [x1]
>         mul     x3, x3, x5
>         ldr     q1, [x1]
>         mul     x2, x2, x6
>         fmov    d0, x2
>         ins     v0.d[1], x3
>         ins     v0.d[1], x3
>         ssra    v1.2d, v0.2d, 1
>         str     q1, [x4]
>         ret
>
> which is surely worse than the scalar:
>
>         mov     x4, x0
>         mov     w0, w5
>         ldp     x5, x1, [x1]
>         mul     x2, x5, x2
>         mul     x3, x1, x3
>         add     x2, x5, x2, asr 1
>         add     x3, x1, x3, asr 1
>         stp     x2, x3, [x4]
>         ret
>
> even though both versions can hide the shift.  There's also the point
> that two-element vectors can be stored using a single scalar STP (and
> loaded using a single LDP), so it's not always accurate to cost the
> scalar stores as being twice as expensive as the vector stores.
>
> The fact that there are multiple problems doesn't mean that we shouldn't
> start with the costing of instruction combinations.  It's just that
> tackling the other aspects might be more general.
>
> If we do start by tackling the costing of instruction combinations,
> I think we need to introduce the “multiplication is not vectorised”
> condition described above.

BB vectorization costing should already not cost the scalar load
since it's not going away but it should cost a vector load.  Does
this for some reason not work for you?  Indeed, looking with
a cross compiler I see:

t.c:3:10: note: Costing subgraph:
t.c:3:10: note: node 0x35f03b0 (max_nunits=2, refcnt=1)
t.c:3:10: note: op template: *res_12(D) = _4;
t.c:3:10: note:         stmt 0 *res_12(D) = _4;
t.c:3:10: note:         stmt 1 MEM[(long int *)res_12(D) + 8B] = _8;
t.c:3:10: note:         children 0x35f0440
t.c:3:10: note: node 0x35f0440 (max_nunits=2, refcnt=1)
t.c:3:10: note: op template: _4 = _1 + _3;
t.c:3:10: note:         stmt 0 _4 = _1 + _3;
t.c:3:10: note:         stmt 1 _8 = _5 + _7;
t.c:3:10: note:         children 0x35f04d0 0x35f0560
t.c:3:10: note: node 0x35f04d0 (max_nunits=2, refcnt=2)
t.c:3:10: note: op template: _1 = *foo_10(D);
t.c:3:10: note:         stmt 0 _1 = *foo_10(D);
t.c:3:10: note:         stmt 1 _5 = MEM[(long int *)foo_10(D) + 8B];
t.c:3:10: note: node 0x35f0560 (max_nunits=2, refcnt=1)
t.c:3:10: note: op template: _3 = _2 >> 1;
t.c:3:10: note:         stmt 0 _3 = _2 >> 1;
t.c:3:10: note:         stmt 1 _7 = _6 >> 1;
t.c:3:10: note:         children 0x35f05f0 0x35f0710
t.c:3:10: note: node (external) 0x35f05f0 (max_nunits=2, refcnt=1)
t.c:3:10: note:         stmt 0 _2 = _1 * a_11(D);
t.c:3:10: note:         stmt 1 _6 = _5 * b_14(D);
t.c:3:10: note:         children 0x35f04d0 0x35f0680
t.c:3:10: note: node (external) 0x35f0680 (max_nunits=1, refcnt=1)
t.c:3:10: note:         { a_11(D), b_14(D) }
t.c:3:10: note: node (constant) 0x35f0710 (max_nunits=1, refcnt=1)
t.c:3:10: note:         { 1, 1 }

so the promoted external node 0x35f05f0 should keep the load live.
vect_bb_slp_scalar_cost relies on PURE_SLP_STMT but
that's unreliable here since the per-stmt setting cannot capture the
different uses.  The code shares intent (and some bugs) with
vect_bb_slp_mark_live_stmts and the problem in general is a bit
difficult given the lack of back-mapping from stmt to SLP nodes
referencing it.

Mind to open a bugreport?

Richard.

>
> Thanks,
> Richard
>
> >        if (stmt_info && aarch64_use_new_vector_costs_p ())
> >       {
> >         /* Account for any extra "embedded" costs that apply additively
> > diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-1.c b/gcc/testsuite/gcc.target/aarch64/pr97984-1.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..9d403eb76ec3a72747f47e718a88ed6b062643f9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/pr97984-1.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O3 -fdump-tree-slp-all" } */
> > +
> > +void x (long * __restrict a, long * __restrict b)
> > +{
> > +  a[0] *= b[0];
> > +  a[1] *= b[1];
> > +  a[0] += b[0];
> > +  a[1] += b[1];
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 1 "slp2" } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-2.c b/gcc/testsuite/gcc.target/aarch64/pr97984-2.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..a4086380fd613035f7ce3e8e8c89e853efa1304e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/pr97984-2.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O3 -std=c99 -fdump-tree-vect-all" } */
> > +
> > +void x (long * __restrict a, long * __restrict b, int n)
> > +{
> > +  for (int i = 0; i < n; i++) {
> > +    a[i] *= b[i];
> > +    a[i] += b[i];
> > +  }
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 1 "vect" } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-3.c b/gcc/testsuite/gcc.target/aarch64/pr97984-3.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..c6c8d351834705998b3c87a40edf1a00a4bb80f9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/pr97984-3.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O3 -fdump-tree-slp-all" } */
> > +
> > +void x (long * __restrict a, long * __restrict b, long * __restrict c)
> > +{
> > +  a[0] *= b[0];
> > +  a[1] *= b[1];
> > +  a[0] += c[0];
> > +  a[1] += c[1];
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 1 "slp2" } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-4.c b/gcc/testsuite/gcc.target/aarch64/pr97984-4.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..d03b0bb84dd822ab3b61e229c01be4cd1fc7f1d4
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/pr97984-4.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O3 -std=c99 -fdump-tree-vect-all -march=armv8.3-a+sve" } */
> > +
> > +void x (long * __restrict a, long * __restrict b, int n)
> > +{
> > +  for (int i = 0; i < n; i++) {
> > +    a[i] *= b[i];
> > +    a[i] += b[i];
> > +  }
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" } } */
Richard Biener Sept. 2, 2021, 12:18 p.m. UTC | #3
On Wed, Sep 1, 2021 at 3:47 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Tue, Aug 31, 2021 at 4:50 PM Richard Sandiford via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > Tamar Christina <tamar.christina@arm.com> writes:
> > > Hi All,
> > >
> > > As the vectorizer has improved over time in capabilities it has started
> > > over-vectorizing.  This has causes regressions in the order of 1-7x on libraries
> > > that Arm produces.
> > >
> > > The vector costs actually do make a lot of sense and I don't think that they are
> > > wrong.  I think that the costs for the scalar code are wrong.
> > >
> > > In particular the costing doesn't take into effect that scalar operation
> > > can/will fuse as this happens in RTL.  Because of this the costs for the scalars
> > > end up being always higher.
> > >
> > > As an example the loop in PR 97984:
> > >
> > > void x (long * __restrict a, long * __restrict b)
> > > {
> > >   a[0] *= b[0];
> > >   a[1] *= b[1];
> > >   a[0] += b[0];
> > >   a[1] += b[1];
> > > }
> > >
> > > generates:
> > >
> > > x:
> > >         ldp     x2, x3, [x0]
> > >         ldr     x4, [x1]
> > >         ldr     q1, [x1]
> > >         mul     x2, x2, x4
> > >         ldr     x4, [x1, 8]
> > >         fmov    d0, x2
> > >         ins     v0.d[1], x3
> > >         mul     x1, x3, x4
> > >         ins     v0.d[1], x1
> > >         add     v0.2d, v0.2d, v1.2d
> > >         str     q0, [x0]
> > >         ret
> > >
> > > On an actual loop the prologue costs would make the loop too expensive so we
> > > produce the scalar output, but with SLP there's no loop overhead costs so we end
> > > up trying to vectorize this. Because SLP discovery is started from the stores we
> > > will end up vectorizing and costing the add but not the MUL.
> > >
> > > To counter this the patch adjusts the costing when it finds an operation that
> > > can be fused and discounts the cost of the "other" operation being fused in.
> > >
> > > The attached testcase shows that even when we discount it we still get still get
> > > vectorized code when profitable to do so, e.g. SVE.
> > >
> > > This happens as well with other operations such as scalar operations where
> > > shifts can be fused in or for e.g. bfxil.  As such sending this for feedback.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > >
> > > Ok for master? If the approach is acceptable I can add support for more.
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > >       PR target/97984
> > >       * config/aarch64/aarch64.c (aarch64_add_stmt_cost): Check for fusing
> > >       madd.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >       PR target/97984
> > >       * gcc.target/aarch64/pr97984-1.c: New test.
> > >       * gcc.target/aarch64/pr97984-2.c: New test.
> > >       * gcc.target/aarch64/pr97984-3.c: New test.
> > >       * gcc.target/aarch64/pr97984-4.c: New test.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > > index 4cd4b037f2606e515ad8f4669d2cd13a509dd0a4..329b556311310d86aaf546d7b395a3750a9d57d4 100644
> > > --- a/gcc/config/aarch64/aarch64.c
> > > +++ b/gcc/config/aarch64/aarch64.c
> > > @@ -15536,6 +15536,39 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
> > >       stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
> > >                                                 vectype, stmt_cost);
> > >
> > > +      /* Scale costs if operation is fusing.  */
> > > +      if (stmt_info && kind == scalar_stmt)
> > > +      {
> > > +     if (gassign *stmt = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info)))
> > > +       {
> > > +         switch (gimple_assign_rhs_code (stmt))
> > > +         {
> > > +         case PLUS_EXPR:
> > > +         case MINUS_EXPR:
> > > +           {
> > > +             /* Check if operation can fuse into MSUB or MADD.  */
> > > +             tree rhs1 = gimple_assign_rhs1 (stmt);
> > > +             if (gassign *stmt1 = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (rhs1)))
> > > +               if (gimple_assign_rhs_code (stmt1) == MULT_EXPR)
> > > +                 {
> > > +                   stmt_cost = 0;
> > > +                   break;
> > > +                }
> > > +             tree rhs2 = gimple_assign_rhs2 (stmt);
> > > +             if (gassign *stmt2 = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (rhs2)))
> > > +               if (gimple_assign_rhs_code (stmt2) == MULT_EXPR)
> > > +                 {
> > > +                   stmt_cost = 0;
> > > +                   break;
> > > +                 }
> > > +           }
> > > +           break;
> > > +         default:
> > > +           break;
> > > +         }
> > > +       }
> > > +      }
> > > +
> >
> > The difficulty with this is that we can also use MLA-type operations
> > for SVE, and for Advanced SIMD if the mode is not DI.  It's not just
> > a scalar thing.
> >
> > We already take the combination into account (via aarch64_multiply_add_p)
> > when estimating issue rates.  But we don't take it into account for
> > latencies because of the reason above: if the multiplications are
> > vectorisable, then the combination applies to both the scalar and
> > the vector code, so the adjustments cancel out.  (Admittedly that
> > decision predates the special Advanced SIMD handling in
> > aarch64_multiply_add_p, so we might want to revisit it.)
> >
> > I think the key feature for this testcase is that the multiplication is
> > not part of the vector code.  I think that's something we need to check
> > if we're going to cost the scalar code more cheaply.
> >
> > But for this particular testcase, I think the main problem is that
> > we count the cost of the scalar loads even though those are needed
> > by other users.  E.g.:
> >
> > int foo(long *restrict res, long *restrict foo, long a, long b)
> > {
> >   res[0] = ((foo[0] * a) >> 1) + foo[0];
> >   res[1] = ((foo[1] * b) >> 1) + foo[1];
> > }
> >
> > gets vectorised as:
> >
> >         mov     x4, x0
> >         mov     w0, w5
> >         ldr     x5, [x1, 8]
> >         ldr     x6, [x1]
> >         mul     x3, x3, x5
> >         ldr     q1, [x1]
> >         mul     x2, x2, x6
> >         fmov    d0, x2
> >         ins     v0.d[1], x3
> >         ins     v0.d[1], x3
> >         ssra    v1.2d, v0.2d, 1
> >         str     q1, [x4]
> >         ret
> >
> > which is surely worse than the scalar:
> >
> >         mov     x4, x0
> >         mov     w0, w5
> >         ldp     x5, x1, [x1]
> >         mul     x2, x5, x2
> >         mul     x3, x1, x3
> >         add     x2, x5, x2, asr 1
> >         add     x3, x1, x3, asr 1
> >         stp     x2, x3, [x4]
> >         ret
> >
> > even though both versions can hide the shift.  There's also the point
> > that two-element vectors can be stored using a single scalar STP (and
> > loaded using a single LDP), so it's not always accurate to cost the
> > scalar stores as being twice as expensive as the vector stores.
> >
> > The fact that there are multiple problems doesn't mean that we shouldn't
> > start with the costing of instruction combinations.  It's just that
> > tackling the other aspects might be more general.
> >
> > If we do start by tackling the costing of instruction combinations,
> > I think we need to introduce the “multiplication is not vectorised”
> > condition described above.
>
> BB vectorization costing should already not cost the scalar load
> since it's not going away but it should cost a vector load.  Does
> this for some reason not work for you?  Indeed looking with
> a cross I see
>
> t.c:3:10: note: Costing subgraph:
> t.c:3:10: note: node 0x35f03b0 (max_nunits=2, refcnt=1)
> t.c:3:10: note: op template: *res_12(D) = _4;
> t.c:3:10: note:         stmt 0 *res_12(D) = _4;
> t.c:3:10: note:         stmt 1 MEM[(long int *)res_12(D) + 8B] = _8;
> t.c:3:10: note:         children 0x35f0440
> t.c:3:10: note: node 0x35f0440 (max_nunits=2, refcnt=1)
> t.c:3:10: note: op template: _4 = _1 + _3;
> t.c:3:10: note:         stmt 0 _4 = _1 + _3;
> t.c:3:10: note:         stmt 1 _8 = _5 + _7;
> t.c:3:10: note:         children 0x35f04d0 0x35f0560
> t.c:3:10: note: node 0x35f04d0 (max_nunits=2, refcnt=2)
> t.c:3:10: note: op template: _1 = *foo_10(D);
> t.c:3:10: note:         stmt 0 _1 = *foo_10(D);
> t.c:3:10: note:         stmt 1 _5 = MEM[(long int *)foo_10(D) + 8B];
> t.c:3:10: note: node 0x35f0560 (max_nunits=2, refcnt=1)
> t.c:3:10: note: op template: _3 = _2 >> 1;
> t.c:3:10: note:         stmt 0 _3 = _2 >> 1;
> t.c:3:10: note:         stmt 1 _7 = _6 >> 1;
> t.c:3:10: note:         children 0x35f05f0 0x35f0710
> t.c:3:10: note: node (external) 0x35f05f0 (max_nunits=2, refcnt=1)
> t.c:3:10: note:         stmt 0 _2 = _1 * a_11(D);
> t.c:3:10: note:         stmt 1 _6 = _5 * b_14(D);
> t.c:3:10: note:         children 0x35f04d0 0x35f0680
> t.c:3:10: note: node (external) 0x35f0680 (max_nunits=1, refcnt=1)
> t.c:3:10: note:         { a_11(D), b_14(D) }
> t.c:3:10: note: node (constant) 0x35f0710 (max_nunits=1, refcnt=1)
> t.c:3:10: note:         { 1, 1 }
>
> so the promoted external node 0x35f05f0 should keep the load live.
> vect_bb_slp_scalar_cost relies on PURE_SLP_STMT but
> that's unreliable here since the per-stmt setting cannot capture the
> different uses.  The code shares intend (and some bugs) with
> vect_bb_slp_mark_live_stmts and the problem in general is a bit
> difficult given the lack of back-mapping from stmt to SLP nodes
> referencing it.

So I think the situation might only appear with externals that have
"internal" operands.  The partitioning should (hopefully) guarantee
that we're seeing all SLP instances that eventually use a scalar
stmt, so we could produce the set of stmts used as operands of
'extern' nodes and avoid costing those on the scalar side.

That is, if they are not code-generated as vector extracts - but that's
something we don't compute reliably yet, unfortunately.

Ideally, for such "in SLP instance" externals - that is, those that
also participate in vectorized stmts - we'd have the vector extract
and external compose represented more explicitly, which would
eventually allow those to be optimized by optimize_slp as well.
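
Collecting the set of scalar stmts used as operands of 'extern' nodes could
look something like the following (completely untested, names made up; shared
nodes would also need a visited set):

/* Record in KEPT_LIVE the defs of scalar operands of external SLP
   nodes reachable from NODE.  Those defs do not go away with
   vectorization and thus should not be costed on the scalar side.  */
static void
vect_collect_extern_operand_defs (vec_info *vinfo, slp_tree node,
				  hash_set<stmt_vec_info> &kept_live)
{
  if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
    {
      unsigned i;
      stmt_vec_info sinfo;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sinfo)
	{
	  ssa_op_iter iter;
	  tree op;
	  FOR_EACH_SSA_TREE_OPERAND (op, sinfo->stmt, iter, SSA_OP_USE)
	    if (stmt_vec_info def_info = vinfo->lookup_def (op))
	      kept_live.add (def_info);
	}
      return;
    }
  unsigned j;
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
    if (child)
      vect_collect_extern_operand_defs (vinfo, child, kept_live);
}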

> Mind to open a bugreport?

PR102176

Richard.

> Richard.
>
> >
> > Thanks,
> > Richard
> >
> > >        if (stmt_info && aarch64_use_new_vector_costs_p ())
> > >       {
> > >         /* Account for any extra "embedded" costs that apply additively
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-1.c b/gcc/testsuite/gcc.target/aarch64/pr97984-1.c
> > > new file mode 100644
> > > index 0000000000000000000000000000000000000000..9d403eb76ec3a72747f47e718a88ed6b062643f9
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/pr97984-1.c
> > > @@ -0,0 +1,12 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-additional-options "-O3 -fdump-tree-slp-all" } */
> > > +
> > > +void x (long * __restrict a, long * __restrict b)
> > > +{
> > > +  a[0] *= b[0];
> > > +  a[1] *= b[1];
> > > +  a[0] += b[0];
> > > +  a[1] += b[1];
> > > +}
> > > +
> > > +/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 1 "slp2" } } */
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-2.c b/gcc/testsuite/gcc.target/aarch64/pr97984-2.c
> > > new file mode 100644
> > > index 0000000000000000000000000000000000000000..a4086380fd613035f7ce3e8e8c89e853efa1304e
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/pr97984-2.c
> > > @@ -0,0 +1,12 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-additional-options "-O3 -std=c99 -fdump-tree-vect-all" } */
> > > +
> > > +void x (long * __restrict a, long * __restrict b, int n)
> > > +{
> > > +  for (int i = 0; i < n; i++) {
> > > +    a[i] *= b[i];
> > > +    a[i] += b[i];
> > > +  }
> > > +}
> > > +
> > > +/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 1 "vect" } } */
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-3.c b/gcc/testsuite/gcc.target/aarch64/pr97984-3.c
> > > new file mode 100644
> > > index 0000000000000000000000000000000000000000..c6c8d351834705998b3c87a40edf1a00a4bb80f9
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/pr97984-3.c
> > > @@ -0,0 +1,12 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-additional-options "-O3 -fdump-tree-slp-all" } */
> > > +
> > > +void x (long * __restrict a, long * __restrict b, long * __restrict c)
> > > +{
> > > +  a[0] *= b[0];
> > > +  a[1] *= b[1];
> > > +  a[0] += c[0];
> > > +  a[1] += c[1];
> > > +}
> > > +
> > > +/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 1 "slp2" } } */
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-4.c b/gcc/testsuite/gcc.target/aarch64/pr97984-4.c
> > > new file mode 100644
> > > index 0000000000000000000000000000000000000000..d03b0bb84dd822ab3b61e229c01be4cd1fc7f1d4
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/pr97984-4.c
> > > @@ -0,0 +1,12 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-additional-options "-O3 -std=c99 -fdump-tree-vect-all -march=armv8.3-a+sve" } */
> > > +
> > > +void x (long * __restrict a, long * __restrict b, int n)
> > > +{
> > > +  for (int i = 0; i < n; i++) {
> > > +    a[i] *= b[i];
> > > +    a[i] += b[i];
> > > +  }
> > > +}
> > > +
> > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" } } */

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4cd4b037f2606e515ad8f4669d2cd13a509dd0a4..329b556311310d86aaf546d7b395a3750a9d57d4 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15536,6 +15536,39 @@  aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
 	stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
 						  vectype, stmt_cost);
 
+      /* Scale costs if operation is fusing.  */
+      if (stmt_info && kind == scalar_stmt)
+      {
+	if (gassign *stmt = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info)))
+	  {
+	    switch (gimple_assign_rhs_code (stmt))
+	    {
+	    case PLUS_EXPR:
+	    case MINUS_EXPR:
+	      {
+		/* Check if operation can fuse into MSUB or MADD.  */
+		tree rhs1 = gimple_assign_rhs1 (stmt);
+		if (gassign *stmt1 = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (rhs1)))
+		  if (gimple_assign_rhs_code (stmt1) == MULT_EXPR)
+		    {
+		      stmt_cost = 0;
+		      break;
+		   }
+		tree rhs2 = gimple_assign_rhs2 (stmt);
+		if (gassign *stmt2 = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (rhs2)))
+		  if (gimple_assign_rhs_code (stmt2) == MULT_EXPR)
+		    {
+		      stmt_cost = 0;
+		      break;
+		    }
+	      }
+	      break;
+	    default:
+	      break;
+	    }
+	  }
+      }
+
       if (stmt_info && aarch64_use_new_vector_costs_p ())
 	{
 	  /* Account for any extra "embedded" costs that apply additively
diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-1.c b/gcc/testsuite/gcc.target/aarch64/pr97984-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..9d403eb76ec3a72747f47e718a88ed6b062643f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr97984-1.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-slp-all" } */
+
+void x (long * __restrict a, long * __restrict b)
+{
+  a[0] *= b[0];
+  a[1] *= b[1];
+  a[0] += b[0];
+  a[1] += b[1];
+}
+
+/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 1 "slp2" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-2.c b/gcc/testsuite/gcc.target/aarch64/pr97984-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..a4086380fd613035f7ce3e8e8c89e853efa1304e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr97984-2.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99 -fdump-tree-vect-all" } */
+
+void x (long * __restrict a, long * __restrict b, int n)
+{
+  for (int i = 0; i < n; i++) {
+    a[i] *= b[i];
+    a[i] += b[i];
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-3.c b/gcc/testsuite/gcc.target/aarch64/pr97984-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..c6c8d351834705998b3c87a40edf1a00a4bb80f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr97984-3.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-slp-all" } */
+
+void x (long * __restrict a, long * __restrict b, long * __restrict c)
+{
+  a[0] *= b[0];
+  a[1] *= b[1];
+  a[0] += c[0];
+  a[1] += c[1];
+}
+
+/* { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 1 "slp2" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr97984-4.c b/gcc/testsuite/gcc.target/aarch64/pr97984-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..d03b0bb84dd822ab3b61e229c01be4cd1fc7f1d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr97984-4.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99 -fdump-tree-vect-all -march=armv8.3-a+sve" } */
+
+void x (long * __restrict a, long * __restrict b, int n)
+{
+  for (int i = 0; i < n; i++) {
+    a[i] *= b[i];
+    a[i] += b[i];
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" } } */