diff mbox series

AArch64: only discount MLA for vector and scalar statements

Message ID patch-18011-tamar@arm.com
State New
Headers show
Series AArch64: only discount MLA for vector and scalar statements | expand

Commit Message

Tamar Christina Nov. 15, 2023, 5:02 p.m. UTC
Hi All,

In testcases gcc.dg/tree-ssa/slsr-19.c  and gcc.dg/tree-ssa/slsr-20.c we have a
fairly simple computation.  On the current generic costing we generate:

f:
        add     w0, w0, 2
        madd    w1, w0, w1, w1
        lsl     w0, w1, 1
        ret

but on any other cost model but generic (including the new upcoming generic)
we generate:

f:
        adrp    x2, .LC0
        dup     v31.2s, w0
        fmov    s30, w1
        ldr     d29, [x2, #:lo12:.LC0]
        add     v31.2s, v31.2s, v29.2s
        mul     v31.2s, v31.2s, v30.s[0]
        addp    v31.2s, v31.2s, v31.2s
        fmov    w0, s31
        ret
.LC0:
        .word   2
        .word   4

This seems to be because the vectorizer thinks the vector transfers are free:

x1_4 + x2_6 1 times vector_stmt costs 0 in body
x1_4 + x2_6 1 times vec_to_scalar costs 0 in body  

This happens because the stmt it's using to get the cost of register transfers
for the given type happens to be one feeding into a MUL.  We incorrectly
discount the + for the register transfer.

This is fixed by guarding the check for aarch64_multiply_add_p with a kind
check and only do it for scalar_stmt and vector_stmt.

I'm sending this separately from my patch series, but it's required for it.
It also seems to fix overvectorization cases in fotonik3d_r in SPECCPU 2017.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_adjust_stmt_cost): Guard mla.
	(aarch64_vector_costs::count_ops): Likewise.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 06ec22057e10fd591710aa4c795a78f34eeaa8e5..0f05877ead3dca6477ebc70f53c632e4eb48d439 100644




--
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 06ec22057e10fd591710aa4c795a78f34eeaa8e5..0f05877ead3dca6477ebc70f53c632e4eb48d439 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -14587,7 +14587,7 @@ aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
 	}
 
       gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
-      if (assign)
+      if ((kind == scalar_stmt || kind == vector_stmt) && assign)
 	{
 	  /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
 	  if (!vect_is_reduction (stmt_info)
@@ -14669,7 +14669,9 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
     }
 
   /* Assume that multiply-adds will become a single operation.  */
-  if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
+  if (stmt_info
+      && (kind == scalar_stmt || kind == vector_stmt)
+      && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
     return;
 
   /* Assume that bool AND with compare operands will become a single

Comments

Richard Sandiford Nov. 16, 2023, 10:34 a.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> In testcases gcc.dg/tree-ssa/slsr-19.c  and gcc.dg/tree-ssa/slsr-20.c we have a
> fairly simple computation.  On the current generic costing we generate:
>
> f:
>         add     w0, w0, 2
>         madd    w1, w0, w1, w1
>         lsl     w0, w1, 1
>         ret
>
> but on any other cost model but generic (including the new up coming generic)
> we generate:
>
> f:
>         adrp    x2, .LC0
>         dup     v31.2s, w0
>         fmov    s30, w1
>         ldr     d29, [x2, #:lo12:.LC0]
>         add     v31.2s, v31.2s, v29.2s
>         mul     v31.2s, v31.2s, v30.s[0]
>         addp    v31.2s, v31.2s, v31.2s
>         fmov    w0, s31
>         ret
> .LC0:
>         .word   2
>         .word   4
>
> This seems to be because the vectorizer thinks the vector transfers are free:
>
> x1_4 + x2_6 1 times vector_stmt costs 0 in body
> x1_4 + x2_6 1 times vec_to_scalar costs 0 in body  
>
> This happens because the stmt it's using to get the cost of register transfers
> for the given type happens to be one feeding into a MUL.  we incorrectly
> discount the + for the register transfer.
>
> This is fixed by guarding the check for aarch64_multiply_add_p with a kind
> check and only do it for scalar_stmt and vector_stmt.
>
> I'm sending this separate to my patch series but it's required for it.
> It also seems to fix overvectorization cases in fotonik3d_r in SPECCPU 2017.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.cc (aarch64_adjust_stmt_cost): Guard mla.
> 	(aarch64_vector_costs::count_ops): Likewise.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 06ec22057e10fd591710aa4c795a78f34eeaa8e5..0f05877ead3dca6477ebc70f53c632e4eb48d439 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -14587,7 +14587,7 @@ aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
>  	}
>  
>        gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
> -      if (assign)
> +      if ((kind == scalar_stmt || kind == vector_stmt) && assign)
>  	{
>  	  /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
>  	  if (!vect_is_reduction (stmt_info)

This properly protects both the MLA and aarch64_bool_compound_p tests
(good!), so...

> @@ -14669,7 +14669,9 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
>      }
>  
>    /* Assume that multiply-adds will become a single operation.  */
> -  if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
> +  if (stmt_info
> +      && (kind == scalar_stmt || kind == vector_stmt)
> +      && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
>      return;
>  
>    /* Assume that bool AND with compare operands will become a single

...I think we should do the same here, for the code that begins with
the comment line above.  It's probably worth sharing the:

  if (stmt_info && (kind == scalar_stmt || kind == vector_stmt))

condition to help avoid the same situation in future.

OK with that change, thanks.

Richard
diff mbox series

Patch

--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -14587,7 +14587,7 @@  aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
 	}
 
       gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
-      if (assign)
+      if ((kind == scalar_stmt || kind == vector_stmt) && assign)
 	{
 	  /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
 	  if (!vect_is_reduction (stmt_info)
@@ -14669,7 +14669,9 @@  aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
     }
 
   /* Assume that multiply-adds will become a single operation.  */
-  if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
+  if (stmt_info
+      && (kind == scalar_stmt || kind == vector_stmt)
+      && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
     return;
 
   /* Assume that bool AND with compare operands will become a single