diff mbox series

[5/8] AArch64: Update Generic Armv9-a cost model to release costs

Message ID ZqNqh0V4pBZHSX/i@arm.com
State New
Headers show
Series [1/8] AArch64: Update Neoverse V2 cost model to release costs | expand

Commit Message

Tamar Christina July 26, 2024, 9:21 a.m. UTC
Hi All,

This updates the costs for generic-armv9-a based on the updated costs for
Neoverse V2 and Neoverse N2.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/tuning_models/generic_armv9_a.h: Update costs.

---




--

Comments

Kyrylo Tkachov July 26, 2024, 12:35 p.m. UTC | #1
Hi Tamar,

> On 26 Jul 2024, at 11:21, Tamar Christina <tamar.christina@arm.com> wrote:
> 
> Hi All,
> 
> This updates the costs for generic-armv9-a based on the updated costs for
> Neoverse V2 and Neoverse N2.
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> 
> Ok for master?


Remind me again, are we using a blend of costs of N2 and V2 for the generic Armv9 costs?
If so, this is ok.
Thanks,
Kyrill

> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>        * config/aarch64/tuning_models/generic_armv9_a.h: Update costs.
> 
> ---
> diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> index 0a08c4b4347332d85e15bece30859129feb2d492..b39a0c73db910888168790888d24ddf4406bf1ee 100644
> --- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> +++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> @@ -58,7 +58,7 @@ static const advsimd_vec_cost generic_armv9_a_advsimd_vector_cost =
>   2, /* ld2_st2_permute_cost */
>   2, /* ld3_st3_permute_cost  */
>   3, /* ld4_st4_permute_cost  */
> -  3, /* permute_cost  */
> +  2, /* permute_cost  */
>   4, /* reduc_i8_cost  */
>   4, /* reduc_i16_cost  */
>   2, /* reduc_i32_cost  */
> @@ -87,28 +87,28 @@ static const sve_vec_cost generic_armv9_a_sve_vector_cost =
>   {
>     2, /* int_stmt_cost  */
>     2, /* fp_stmt_cost  */
> -    3, /* ld2_st2_permute_cost  */
> -    4, /* ld3_st3_permute_cost  */
> -    4, /* ld4_st4_permute_cost  */
> -    3, /* permute_cost  */
> +    2, /* ld2_st2_permute_cost  */
> +    3, /* ld3_st3_permute_cost  */
> +    3, /* ld4_st4_permute_cost  */
> +    2, /* permute_cost  */
>     /* Theoretically, a reduction involving 15 scalar ADDs could
>        complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
> -       completes in 11 cycles, so give it a cost of 15 + 6.  */
> -    21, /* reduc_i8_cost  */
> -    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
> -    13, /* reduc_i16_cost  */
> -    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
> -    9, /* reduc_i32_cost  */
> -    /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
> -    2, /* reduc_i64_cost  */
> +       completes in 9 cycles, so give it a cost of 15 + 4.  */
> +    19, /* reduc_i8_cost  */
> +    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 8: 7 + 5.  */
> +    12, /* reduc_i16_cost  */
> +    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 6: 3 + 4.  */
> +    7, /* reduc_i32_cost  */
> +    /* Likewise for 1 scalar ADDs (~1 cycles) vs. 4: 1 + 3.  */
> +    4, /* reduc_i64_cost  */
>     /* Theoretically, a reduction involving 7 scalar FADDs could
> -       complete in ~8 cycles and would have a cost of 14.  FADDV
> -       completes in 6 cycles, so give it a cost of 14 - 2.  */
> -    12, /* reduc_f16_cost  */
> -    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0.  */
> -    6, /* reduc_f32_cost  */
> -    /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0.  */
> -    2, /* reduc_f64_cost  */
> +       complete in ~8 cycles and would have a cost of  7.  FADDV
> +       completes in 8 cycles, so give it a cost of 7 + 0.  */
> +    7, /* reduc_f16_cost  */
> +    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 3 + 2.  */
> +    5, /* reduc_f32_cost  */
> +    /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 1 + 2.  */
> +    3, /* reduc_f64_cost  */
>     2, /* store_elt_extra_cost  */
>     /* This value is just inherited from the Cortex-A57 table.  */
>     8, /* vec_to_scalar_cost  */
> @@ -128,7 +128,7 @@ static const sve_vec_cost generic_armv9_a_sve_vector_cost =
>   /* A strided Advanced SIMD x64 load would take two parallel FP loads
>      (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
>      is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
> -     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
> +     (cost 8) and a vec_construct (cost 4).  Add a full vector operation
>      (cost 2) to that, to avoid the difference being lost in rounding.
> 
>      There is no easy comparison between a strided Advanced SIMD x32 load
> @@ -166,14 +166,14 @@ static const aarch64_sve_vec_issue_info generic_armv9_a_sve_issue_info =
> {
>   {
>     {
> -      3, /* loads_per_cycle  */
> +      3, /* loads_stores_per_cycle  */
>       2, /* stores_per_cycle  */
>       2, /* general_ops_per_cycle  */
>       0, /* fp_simd_load_general_ops  */
>       1 /* fp_simd_store_general_ops  */
>     },
>     2, /* ld2_st2_general_ops  */
> -    3, /* ld3_st3_general_ops  */
> +    2, /* ld3_st3_general_ops  */
>     3 /* ld4_st4_general_ops  */
>   },
>   2, /* pred_ops_per_cycle  */
> @@ -191,7 +191,7 @@ static const aarch64_vec_issue_info generic_armv9_a_vec_issue_info =
>   &generic_armv9_a_sve_issue_info
> };
> 
> -/* Neoverse N2 costs for vector insn classes.  */
> +/* Generic_armv9_a costs for vector insn classes.  */
> static const struct cpu_vector_cost generic_armv9_a_vector_cost =
> {
>   1, /* scalar_int_stmt_cost  */
> @@ -228,7 +228,7 @@ static const struct tune_params generic_armv9_a_tunings =
>   "32:16",     /* loop_align.  */
>   2,   /* int_reassoc_width.  */
>   4,   /* fp_reassoc_width.  */
> -  1,   /* fma_reassoc_width.  */
> +  2,   /* fma_reassoc_width.  */
>   2,   /* vec_reassoc_width.  */
>   2,   /* min_div_recip_mul_sf.  */
>   2,   /* min_div_recip_mul_df.  */
> 
> 
> 
> 
> --
> <rb18668.patch>
Tamar Christina July 26, 2024, 12:38 p.m. UTC | #2
> -----Original Message-----
> From: Kyrylo Tkachov <ktkachov@nvidia.com>
> Sent: Friday, July 26, 2024 1:35 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>; Marcus Shawcroft <Marcus.Shawcroft@arm.com>; ktkachov@gcc.gnu.org; Richard Sandiford <Richard.Sandiford@arm.com>
> Subject: Re: [PATCH 5/8]AArch64: Update Generic Armv9-a cost model to release costs
> 
> Hi Tamar,
> 
> > On 26 Jul 2024, at 11:21, Tamar Christina <tamar.christina@arm.com> wrote:
> >
> > Hi All,
> >
> > This updates the costs for generic-armv9-a based on the updated costs for
> > Neoverse V2 and Neoverse N2.
> >
> > Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> >
> > Ok for master?
> 
> 
> Remind me again, are we using a blend of costs of N2 and V2 for the generic
> Armv9 costs?

Yeah, the costs were heavily based on Neoverse N2 and adjusted to favor SVE more.

Thanks,
Tamar

> If so, this is ok.
> Thanks,
> Kyrill
> 
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> >        * config/aarch64/tuning_models/generic_armv9_a.h: Update costs.
> >
> > ---
> > diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> > index 0a08c4b4347332d85e15bece30859129feb2d492..b39a0c73db910888168790888d24ddf4406bf1ee 100644
> > --- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> > +++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> > @@ -58,7 +58,7 @@ static const advsimd_vec_cost generic_armv9_a_advsimd_vector_cost =
> >   2, /* ld2_st2_permute_cost */
> >   2, /* ld3_st3_permute_cost  */
> >   3, /* ld4_st4_permute_cost  */
> > -  3, /* permute_cost  */
> > +  2, /* permute_cost  */
> >   4, /* reduc_i8_cost  */
> >   4, /* reduc_i16_cost  */
> >   2, /* reduc_i32_cost  */
> > @@ -87,28 +87,28 @@ static const sve_vec_cost generic_armv9_a_sve_vector_cost =
> >   {
> >     2, /* int_stmt_cost  */
> >     2, /* fp_stmt_cost  */
> > -    3, /* ld2_st2_permute_cost  */
> > -    4, /* ld3_st3_permute_cost  */
> > -    4, /* ld4_st4_permute_cost  */
> > -    3, /* permute_cost  */
> > +    2, /* ld2_st2_permute_cost  */
> > +    3, /* ld3_st3_permute_cost  */
> > +    3, /* ld4_st4_permute_cost  */
> > +    2, /* permute_cost  */
> >     /* Theoretically, a reduction involving 15 scalar ADDs could
> >        complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
> > -       completes in 11 cycles, so give it a cost of 15 + 6.  */
> > -    21, /* reduc_i8_cost  */
> > -    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
> > -    13, /* reduc_i16_cost  */
> > -    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
> > -    9, /* reduc_i32_cost  */
> > -    /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
> > -    2, /* reduc_i64_cost  */
> > +       completes in 9 cycles, so give it a cost of 15 + 4.  */
> > +    19, /* reduc_i8_cost  */
> > +    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 8: 7 + 5.  */
> > +    12, /* reduc_i16_cost  */
> > +    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 6: 3 + 4.  */
> > +    7, /* reduc_i32_cost  */
> > +    /* Likewise for 1 scalar ADDs (~1 cycles) vs. 4: 1 + 3.  */
> > +    4, /* reduc_i64_cost  */
> >     /* Theoretically, a reduction involving 7 scalar FADDs could
> > -       complete in ~8 cycles and would have a cost of 14.  FADDV
> > -       completes in 6 cycles, so give it a cost of 14 - 2.  */
> > -    12, /* reduc_f16_cost  */
> > -    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0.  */
> > -    6, /* reduc_f32_cost  */
> > -    /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0.  */
> > -    2, /* reduc_f64_cost  */
> > +       complete in ~8 cycles and would have a cost of  7.  FADDV
> > +       completes in 8 cycles, so give it a cost of 7 + 0.  */
> > +    7, /* reduc_f16_cost  */
> > +    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 3 + 2.  */
> > +    5, /* reduc_f32_cost  */
> > +    /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 1 + 2.  */
> > +    3, /* reduc_f64_cost  */
> >     2, /* store_elt_extra_cost  */
> >     /* This value is just inherited from the Cortex-A57 table.  */
> >     8, /* vec_to_scalar_cost  */
> > @@ -128,7 +128,7 @@ static const sve_vec_cost generic_armv9_a_sve_vector_cost =
> >   /* A strided Advanced SIMD x64 load would take two parallel FP loads
> >      (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
> >      is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
> > -     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
> > +     (cost 8) and a vec_construct (cost 4).  Add a full vector operation
> >      (cost 2) to that, to avoid the difference being lost in rounding.
> >
> >      There is no easy comparison between a strided Advanced SIMD x32 load
> > @@ -166,14 +166,14 @@ static const aarch64_sve_vec_issue_info generic_armv9_a_sve_issue_info =
> > {
> >   {
> >     {
> > -      3, /* loads_per_cycle  */
> > +      3, /* loads_stores_per_cycle  */
> >       2, /* stores_per_cycle  */
> >       2, /* general_ops_per_cycle  */
> >       0, /* fp_simd_load_general_ops  */
> >       1 /* fp_simd_store_general_ops  */
> >     },
> >     2, /* ld2_st2_general_ops  */
> > -    3, /* ld3_st3_general_ops  */
> > +    2, /* ld3_st3_general_ops  */
> >     3 /* ld4_st4_general_ops  */
> >   },
> >   2, /* pred_ops_per_cycle  */
> > @@ -191,7 +191,7 @@ static const aarch64_vec_issue_info generic_armv9_a_vec_issue_info =
> >   &generic_armv9_a_sve_issue_info
> > };
> >
> > -/* Neoverse N2 costs for vector insn classes.  */
> > +/* Generic_armv9_a costs for vector insn classes.  */
> > static const struct cpu_vector_cost generic_armv9_a_vector_cost =
> > {
> >   1, /* scalar_int_stmt_cost  */
> > @@ -228,7 +228,7 @@ static const struct tune_params generic_armv9_a_tunings =
> >   "32:16",     /* loop_align.  */
> >   2,   /* int_reassoc_width.  */
> >   4,   /* fp_reassoc_width.  */
> > -  1,   /* fma_reassoc_width.  */
> > +  2,   /* fma_reassoc_width.  */
> >   2,   /* vec_reassoc_width.  */
> >   2,   /* min_div_recip_mul_sf.  */
> >   2,   /* min_div_recip_mul_df.  */
> >
> >
> >
> >
> > --
> > <rb18668.patch>
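
For readers cross-checking the numbers: every reduc_*_cost in the patch below follows one pattern, namely the cost of the equivalent scalar ADD/FADD chain plus the difference between the vector reduction's latency and that chain's latency. A minimal sketch of that arithmetic (the helper is hypothetical and illustrative only, not part of GCC; the constants are the ones stated in the patch comments):

static int
reduc_cost (int scalar_ops, int scalar_cycles, int reduc_cycles)
{
  /* Cost of the equivalent scalar chain, plus the extra latency
     of the vector reduction over that chain.  */
  return scalar_ops + (reduc_cycles - scalar_cycles);
}

/* With the numbers from the patch comments:
   reduc_i8_cost  = reduc_cost (15, 5, 9) = 15 + 4 = 19
   reduc_i16_cost = reduc_cost (7, 3, 8)  =  7 + 5 = 12
   reduc_i32_cost = reduc_cost (3, 2, 6)  =  3 + 4 =  7
   reduc_i64_cost = reduc_cost (1, 1, 4)  =  1 + 3 =  4
   reduc_f16_cost = reduc_cost (7, 8, 8)  =  7 + 0 =  7
   reduc_f32_cost = reduc_cost (3, 4, 6)  =  3 + 2 =  5
   reduc_f64_cost = reduc_cost (1, 2, 4)  =  1 + 2 =  3  */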
diff mbox series

Patch

diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
index 0a08c4b4347332d85e15bece30859129feb2d492..b39a0c73db910888168790888d24ddf4406bf1ee 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
@@ -58,7 +58,7 @@  static const advsimd_vec_cost generic_armv9_a_advsimd_vector_cost =
   2, /* ld2_st2_permute_cost */
   2, /* ld3_st3_permute_cost  */
   3, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
+  2, /* permute_cost  */
   4, /* reduc_i8_cost  */
   4, /* reduc_i16_cost  */
   2, /* reduc_i32_cost  */
@@ -87,28 +87,28 @@  static const sve_vec_cost generic_armv9_a_sve_vector_cost =
   {
     2, /* int_stmt_cost  */
     2, /* fp_stmt_cost  */
-    3, /* ld2_st2_permute_cost  */
-    4, /* ld3_st3_permute_cost  */
-    4, /* ld4_st4_permute_cost  */
-    3, /* permute_cost  */
+    2, /* ld2_st2_permute_cost  */
+    3, /* ld3_st3_permute_cost  */
+    3, /* ld4_st4_permute_cost  */
+    2, /* permute_cost  */
     /* Theoretically, a reduction involving 15 scalar ADDs could
        complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
-       completes in 11 cycles, so give it a cost of 15 + 6.  */
-    21, /* reduc_i8_cost  */
-    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
-    13, /* reduc_i16_cost  */
-    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
-    9, /* reduc_i32_cost  */
-    /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
-    2, /* reduc_i64_cost  */
+       completes in 9 cycles, so give it a cost of 15 + 4.  */
+    19, /* reduc_i8_cost  */
+    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 8: 7 + 5.  */
+    12, /* reduc_i16_cost  */
+    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 6: 3 + 4.  */
+    7, /* reduc_i32_cost  */
+    /* Likewise for 1 scalar ADDs (~1 cycles) vs. 4: 1 + 3.  */
+    4, /* reduc_i64_cost  */
     /* Theoretically, a reduction involving 7 scalar FADDs could
-       complete in ~8 cycles and would have a cost of 14.  FADDV
-       completes in 6 cycles, so give it a cost of 14 - 2.  */
-    12, /* reduc_f16_cost  */
-    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0.  */
-    6, /* reduc_f32_cost  */
-    /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0.  */
-    2, /* reduc_f64_cost  */
+       complete in ~8 cycles and would have a cost of  7.  FADDV
+       completes in 8 cycles, so give it a cost of 7 + 0.  */
+    7, /* reduc_f16_cost  */
+    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 3 + 2.  */
+    5, /* reduc_f32_cost  */
+    /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 1 + 2.  */
+    3, /* reduc_f64_cost  */
     2, /* store_elt_extra_cost  */
     /* This value is just inherited from the Cortex-A57 table.  */
     8, /* vec_to_scalar_cost  */
@@ -128,7 +128,7 @@  static const sve_vec_cost generic_armv9_a_sve_vector_cost =
   /* A strided Advanced SIMD x64 load would take two parallel FP loads
      (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
      is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
-     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
+     (cost 8) and a vec_construct (cost 4).  Add a full vector operation
      (cost 2) to that, to avoid the difference being lost in rounding.
 
      There is no easy comparison between a strided Advanced SIMD x32 load
@@ -166,14 +166,14 @@  static const aarch64_sve_vec_issue_info generic_armv9_a_sve_issue_info =
 {
   {
     {
-      3, /* loads_per_cycle  */
+      3, /* loads_stores_per_cycle  */
       2, /* stores_per_cycle  */
       2, /* general_ops_per_cycle  */
       0, /* fp_simd_load_general_ops  */
       1 /* fp_simd_store_general_ops  */
     },
     2, /* ld2_st2_general_ops  */
-    3, /* ld3_st3_general_ops  */
+    2, /* ld3_st3_general_ops  */
     3 /* ld4_st4_general_ops  */
   },
   2, /* pred_ops_per_cycle  */
@@ -191,7 +191,7 @@  static const aarch64_vec_issue_info generic_armv9_a_vec_issue_info =
   &generic_armv9_a_sve_issue_info
 };
 
-/* Neoverse N2 costs for vector insn classes.  */
+/* Generic_armv9_a costs for vector insn classes.  */
 static const struct cpu_vector_cost generic_armv9_a_vector_cost =
 {
   1, /* scalar_int_stmt_cost  */
@@ -228,7 +228,7 @@  static const struct tune_params generic_armv9_a_tunings =
   "32:16",	/* loop_align.  */
   2,	/* int_reassoc_width.  */
   4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
+  2,	/* fma_reassoc_width.  */
   2,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
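
A note on the one scalar-side tuning change: fma_reassoc_width moving from 1 to 2 lets the reassociation pass split a single dependent FMA chain into two independent accumulators, trading the serial chain for more instruction-level parallelism. A sketch of the kind of code affected (an assumed example, not from the patch; FP reassociation of this form requires -ffast-math-style flags, e.g. via -Ofast):

/* One long multiply-accumulate dependency chain.  With
   fma_reassoc_width = 2 and reassociation enabled, GCC may keep
   two independent accumulators and combine them after the loop.  */
double
dot (const double *a, const double *b, int n)
{
  double acc = 0.0;
  for (int i = 0; i < n; i++)
    acc += a[i] * b[i];
  return acc;
}

Compiled with, e.g., gcc -Ofast -mcpu=generic-armv9-a (assuming the GCC 14 spelling of the generic Armv9-A target).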