Message ID | 20231123042744.1347370-1-philipp.tomsich@vrull.eu |
---|---|
State | New |
Headers | show |
Series | [v2] aarch64: Add support for Ampere-1B (-mcpu=ampere1b) CPU | expand |
Philipp Tomsich <philipp.tomsich@vrull.eu> writes: > This patch adds initial support for Ampere-1B core. > > The Ampere-1B core implements ARMv8.7 with the following (compiler > visible) extensions: > - CSSC (Common Short Sequence Compression instructions), > - MTE (Memory Tagging Extension) > - SM3/SM4 > > gcc/ChangeLog: > > * config/aarch64/aarch64-cores.def (AARCH64_CORE): Add ampere-1b > * config/aarch64/aarch64-cost-tables.h: Add ampere1b_extra_costs > * config/aarch64/aarch64-tune.md: Regenerate > * config/aarch64/aarch64.cc: Include ampere1b tuning model > * doc/invoke.texi: Document -mcpu=ampere1b > * config/aarch64/tuning_models/ampere1b.h: New file. OK, thanks, but: > > Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu> > --- > > Changes in v2: > - moved ampere1b model to a separated file > - regenerated aarch64-tune.md after rebase > > gcc/config/aarch64/aarch64-cores.def | 1 + > gcc/config/aarch64/aarch64-cost-tables.h | 107 ++++++++++++++++++ > gcc/config/aarch64/aarch64-tune.md | 2 +- > gcc/config/aarch64/aarch64.cc | 1 + > gcc/config/aarch64/tuning_models/ampere1b.h | 114 ++++++++++++++++++++ > gcc/doc/invoke.texi | 2 +- > 6 files changed, 225 insertions(+), 2 deletions(-) > create mode 100644 gcc/config/aarch64/tuning_models/ampere1b.h > > diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def > index 16752b77f4b..ad896a80f1f 100644 > --- a/gcc/config/aarch64/aarch64-cores.def > +++ b/gcc/config/aarch64/aarch64-cores.def > @@ -74,6 +74,7 @@ AARCH64_CORE("thunderxt83", thunderxt83, thunderx, V8A, (CRC, CRYPTO), thu > /* Ampere Computing ('\xC0') cores. 
*/ > AARCH64_CORE("ampere1", ampere1, cortexa57, V8_6A, (F16, RNG, AES, SHA3), ampere1, 0xC0, 0xac3, -1) > AARCH64_CORE("ampere1a", ampere1a, cortexa57, V8_6A, (F16, RNG, AES, SHA3, SM4, MEMTAG), ampere1a, 0xC0, 0xac4, -1) > +AARCH64_CORE("ampere1b", ampere1b, cortexa57, V8_7A, (F16, RNG, AES, SHA3, SM4, MEMTAG, CSSC), ampere1b, 0xC0, 0xac5, -1) > /* Do not swap around "emag" and "xgene1", > this order is required to handle variant correctly. */ > AARCH64_CORE("emag", emag, xgene1, V8A, (CRC, CRYPTO), emag, 0x50, 0x000, 3) > diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h > index 0cb638f3a13..4c8da7f119b 100644 > --- a/gcc/config/aarch64/aarch64-cost-tables.h > +++ b/gcc/config/aarch64/aarch64-cost-tables.h > @@ -882,4 +882,111 @@ const struct cpu_cost_table ampere1a_extra_costs = > } > }; > > +const struct cpu_cost_table ampere1b_extra_costs = > +{ > + /* ALU */ > + { > + 0, /* arith. */ > + 0, /* logical. */ > + 0, /* shift. */ > + COSTS_N_INSNS (1), /* shift_reg. */ > + 0, /* arith_shift. */ > + COSTS_N_INSNS (1), /* arith_shift_reg. */ > + 0, /* log_shift. */ > + COSTS_N_INSNS (1), /* log_shift_reg. */ > + 0, /* extend. */ > + COSTS_N_INSNS (1), /* extend_arith. */ > + 0, /* bfi. */ > + 0, /* bfx. */ > + 0, /* clz. */ > + 0, /* rev. */ > + 0, /* non_exec. */ > + true /* non_exec_costs_exec. */ > + }, > + { > + /* MULT SImode */ > + { > + COSTS_N_INSNS (2), /* simple. */ > + COSTS_N_INSNS (2), /* flag_setting. */ > + COSTS_N_INSNS (2), /* extend. */ > + COSTS_N_INSNS (3), /* add. */ > + COSTS_N_INSNS (3), /* extend_add. */ > + COSTS_N_INSNS (12) /* idiv. */ > + }, > + /* MULT DImode */ > + { > + COSTS_N_INSNS (2), /* simple. */ > + 0, /* flag_setting (N/A). */ > + COSTS_N_INSNS (2), /* extend. */ > + COSTS_N_INSNS (3), /* add. */ > + COSTS_N_INSNS (3), /* extend_add. */ > + COSTS_N_INSNS (18) /* idiv. */ > + } > + }, > + /* LD/ST */ > + { > + COSTS_N_INSNS (2), /* load. 
*/ > + COSTS_N_INSNS (2), /* load_sign_extend. */ > + 0, /* ldrd (n/a). */ > + 0, /* ldm_1st. */ > + 0, /* ldm_regs_per_insn_1st. */ > + 0, /* ldm_regs_per_insn_subsequent. */ > + COSTS_N_INSNS (3), /* loadf. */ > + COSTS_N_INSNS (3), /* loadd. */ > + COSTS_N_INSNS (3), /* load_unaligned. */ > + 0, /* store. */ > + 0, /* strd. */ > + 0, /* stm_1st. */ > + 0, /* stm_regs_per_insn_1st. */ > + 0, /* stm_regs_per_insn_subsequent. */ > + COSTS_N_INSNS (1), /* storef. */ > + COSTS_N_INSNS (1), /* stored. */ > + COSTS_N_INSNS (1), /* store_unaligned. */ > + COSTS_N_INSNS (3), /* loadv. */ > + COSTS_N_INSNS (3) /* storev. */ > + }, > + { > + /* FP SFmode */ > + { > + COSTS_N_INSNS (18), /* div. */ > + COSTS_N_INSNS (3), /* mult. */ > + COSTS_N_INSNS (3), /* mult_addsub. */ > + COSTS_N_INSNS (3), /* fma. */ > + COSTS_N_INSNS (2), /* addsub. */ > + COSTS_N_INSNS (1), /* fpconst. */ > + COSTS_N_INSNS (2), /* neg. */ > + COSTS_N_INSNS (2), /* compare. */ > + COSTS_N_INSNS (2), /* widen. */ > + COSTS_N_INSNS (2), /* narrow. */ > + COSTS_N_INSNS (6), /* toint. */ > + COSTS_N_INSNS (4), /* fromint. */ > + COSTS_N_INSNS (2) /* roundint. */ > + }, > + /* FP DFmode */ > + { > + COSTS_N_INSNS (18), /* div. */ > + COSTS_N_INSNS (3), /* mult. */ > + COSTS_N_INSNS (3), /* mult_addsub. */ > + COSTS_N_INSNS (3), /* fma. */ > + COSTS_N_INSNS (2), /* addsub. */ > + COSTS_N_INSNS (1), /* fpconst. */ > + COSTS_N_INSNS (2), /* neg. */ > + COSTS_N_INSNS (2), /* compare. */ > + COSTS_N_INSNS (2), /* widen. */ > + COSTS_N_INSNS (2), /* narrow. */ > + COSTS_N_INSNS (6), /* toint. */ > + COSTS_N_INSNS (4), /* fromint. */ > + COSTS_N_INSNS (2) /* roundint. */ > + } > + }, > + /* Vector */ > + { > + COSTS_N_INSNS (1), /* alu. */ > + COSTS_N_INSNS (2), /* mult. */ > + COSTS_N_INSNS (1), /* movi. */ > + COSTS_N_INSNS (1), /* dup. */ > + COSTS_N_INSNS (1) /* extract. 
*/ > + } > +}; > + > #endif > diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md > index 61bb8521125..98e6882d432 100644 > --- a/gcc/config/aarch64/aarch64-tune.md > +++ b/gcc/config/aarch64/aarch64-tune.md > @@ -1,5 +1,5 @@ > ;; -*- buffer-read-only: t -*- > ;; Generated automatically by gentune.sh from aarch64-cores.def > (define_attr "tune" > - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,neoversev2,demeter,generic,generic_armv8_a,generic_armv9_a" > + 
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,ampere1b,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,neoversev2,demeter,generic,generic_armv8_a,generic_armv9_a" > (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index f6f6f94bf43..6a6cceb52c7 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -375,6 +375,7 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] = > #include "tuning_models/neoversen1.h" > #include "tuning_models/ampere1.h" > #include "tuning_models/ampere1a.h" > +#include "tuning_models/ampere1b.h" > #include "tuning_models/neoversev1.h" > #include "tuning_models/neoverse512tvb.h" > #include "tuning_models/neoversen2.h" > diff --git a/gcc/config/aarch64/tuning_models/ampere1b.h b/gcc/config/aarch64/tuning_models/ampere1b.h > new file mode 100644 > index 00000000000..c4928f50d29 > --- /dev/null > +++ b/gcc/config/aarch64/tuning_models/ampere1b.h > @@ -0,0 +1,114 @@ > +/* Tuning model description for the Ampere1B core. > + Copyright (C) 2023 Free Software Foundation, Inc. > + > + This file is part of GCC. 
> + > + GCC is free software; you can redistribute it and/or modify it > + under the terms of the GNU General Public License as published by > + the Free Software Foundation; either version 3, or (at your option) > + any later version. > + > + GCC is distributed in the hope that it will be useful, but > + WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + General Public License for more details. > + > + You should have received a copy of the GNU General Public License > + along with GCC; see the file COPYING3. If not see > + <http://www.gnu.org/licenses/>. */ > + > +#ifndef GCC_AARCH64_H_AMPERE1B > +#define GCC_AARCH64_H_AMPERE1B > + > +#include "generic.h" > + > +static const cpu_prefetch_tune ampere1b_prefetch_tune = > +{ > + 48, /* num_slots */ > + 64, /* l1_cache_size */ > + 64, /* l1_cache_line_size */ > + 2048, /* l2_cache_size */ > + true, /* prefetch_dynamic_strides */ > + -1, /* minimum_stride */ > + -1 /* default_opt_level */ > +}; > + > +static const advsimd_vec_cost ampere1b_advsimd_vector_cost = > +{ > + 1, /* int_stmt_cost */ > + 3, /* fp_stmt_cost */ > + 0, /* ld2_st2_permute_cost */ > + 0, /* ld3_st3_permute_cost */ > + 0, /* ld4_st4_permute_cost */ > + 2, /* permute_cost */ > + 8, /* reduc_i8_cost */ > + 6, /* reduc_i16_cost */ > + 4, /* reduc_i32_cost */ > + 2, /* reduc_i64_cost */ > + 9, /* reduc_f16_cost */ > + 6, /* reduc_f32_cost */ > + 3, /* reduc_f64_cost */ > + 5, /* store_elt_extra_cost */ > + 5, /* vec_to_scalar_cost */ > + 5, /* scalar_to_vec_cost */ > + 4, /* align_load_cost */ > + 4, /* unalign_load_cost */ > + 1, /* unalign_store_cost */ > + 1 /* store_cost */ > +}; > + > +/* Ampere-1B costs for vector insn classes. 
*/ > +static const struct cpu_vector_cost ampere1b_vector_cost = > +{ > + 1, /* scalar_int_stmt_cost */ > + 3, /* scalar_fp_stmt_cost */ > + 4, /* scalar_load_cost */ > + 1, /* scalar_store_cost */ > + 1, /* cond_taken_branch_cost */ > + 1, /* cond_not_taken_branch_cost */ > + &ampere1b_advsimd_vector_cost, /* advsimd */ > + nullptr, /* sve */ > + nullptr /* issue_info */ > +}; > + > +static const struct tune_params ampere1b_tunings = > +{ > + &ampere1b_extra_costs, > + &generic_addrcost_table, > + &generic_regmove_cost, > + &ampere1b_vector_cost, > + &generic_branch_cost, > + &generic_approx_modes, > + SVE_NOT_IMPLEMENTED, /* sve_width */ > + { 3, /* load_int. */ > + 1, /* store_int. */ > + 4, /* load_fp. */ > + 4, /* store_fp. */ > + 4, /* load_pred. */ > + 4 /* store_pred. */ > + }, /* memmov_cost. */ > + 4, /* issue_rate */ > + (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC | > + AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK | > + AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ | > + AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ | > + AARCH64_FUSE_ADDSUB_2REG_CONST1), > + /* fusible_ops */ > + "32", /* function_align. */ > + "4", /* jump_align. */ > + "32:16", /* loop_align. */ > + 2, /* int_reassoc_width. */ > + 4, /* fp_reassoc_width. */ > + 1, /* fma_reassoc_width. */ > + 2, /* vec_reassoc_width. */ > + 2, /* min_div_recip_mul_sf. */ > + 2, /* min_div_recip_mul_df. */ > + 0, /* max_case_values. */ > + tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */ > + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ would you mind testing the performance of AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS? The plan was to make that the default at some stage. Richard > + &ampere1b_prefetch_tune, > + AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */ > + AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. 
*/ > +}; > + > +#endif /* GCC_AARCH64_H_AMPERE1B */ > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi > index 7666de1e9b2..54e4101e7a2 100644 > --- a/gcc/doc/invoke.texi > +++ b/gcc/doc/invoke.texi > @@ -20851,7 +20851,7 @@ performance of the code. Permissible values for this option are: > @samp{cortex-r82}, @samp{cortex-x1}, @samp{cortex-x1c}, @samp{cortex-x2}, > @samp{cortex-x3}, @samp{cortex-x4}, @samp{cortex-a510}, @samp{cortex-a520}, > @samp{cortex-a710}, @samp{cortex-a715}, @samp{cortex-a720}, @samp{ampere1}, > -@samp{ampere1a}, and @samp{native}. > +@samp{ampere1a}, @samp{ampere1b}, and @samp{native}. > > The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53}, > @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53},
On Tue, 28 Nov 2023 at 12:21, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Philipp Tomsich <philipp.tomsich@vrull.eu> writes: > > This patch adds initial support for Ampere-1B core. > > > > The Ampere-1B core implements ARMv8.7 with the following (compiler > > visible) extensions: > > - CSSC (Common Short Sequence Compression instructions), > > - MTE (Memory Tagging Extension) > > - SM3/SM4 > > > > gcc/ChangeLog: > > > > * config/aarch64/aarch64-cores.def (AARCH64_CORE): Add ampere-1b > > * config/aarch64/aarch64-cost-tables.h: Add ampere1b_extra_costs > > * config/aarch64/aarch64-tune.md: Regenerate > > * config/aarch64/aarch64.cc: Include ampere1b tuning model > > * doc/invoke.texi: Document -mcpu=ampere1b > > * config/aarch64/tuning_models/ampere1b.h: New file. > > OK, thanks, but: > > > > > Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu> > > --- > > > > Changes in v2: > > - moved ampere1b model to a separated file > > - regenerated aarch64-tune.md after rebase > > > > gcc/config/aarch64/aarch64-cores.def | 1 + > > gcc/config/aarch64/aarch64-cost-tables.h | 107 ++++++++++++++++++ > > gcc/config/aarch64/aarch64-tune.md | 2 +- > > gcc/config/aarch64/aarch64.cc | 1 + > > gcc/config/aarch64/tuning_models/ampere1b.h | 114 ++++++++++++++++++++ > > gcc/doc/invoke.texi | 2 +- > > 6 files changed, 225 insertions(+), 2 deletions(-) > > create mode 100644 gcc/config/aarch64/tuning_models/ampere1b.h > > > > diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def > > index 16752b77f4b..ad896a80f1f 100644 > > --- a/gcc/config/aarch64/aarch64-cores.def > > +++ b/gcc/config/aarch64/aarch64-cores.def > > @@ -74,6 +74,7 @@ AARCH64_CORE("thunderxt83", thunderxt83, thunderx, V8A, (CRC, CRYPTO), thu > > /* Ampere Computing ('\xC0') cores. 
*/ > > AARCH64_CORE("ampere1", ampere1, cortexa57, V8_6A, (F16, RNG, AES, SHA3), ampere1, 0xC0, 0xac3, -1) > > AARCH64_CORE("ampere1a", ampere1a, cortexa57, V8_6A, (F16, RNG, AES, SHA3, SM4, MEMTAG), ampere1a, 0xC0, 0xac4, -1) > > +AARCH64_CORE("ampere1b", ampere1b, cortexa57, V8_7A, (F16, RNG, AES, SHA3, SM4, MEMTAG, CSSC), ampere1b, 0xC0, 0xac5, -1) > > /* Do not swap around "emag" and "xgene1", > > this order is required to handle variant correctly. */ > > AARCH64_CORE("emag", emag, xgene1, V8A, (CRC, CRYPTO), emag, 0x50, 0x000, 3) > > diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h > > index 0cb638f3a13..4c8da7f119b 100644 > > --- a/gcc/config/aarch64/aarch64-cost-tables.h > > +++ b/gcc/config/aarch64/aarch64-cost-tables.h > > @@ -882,4 +882,111 @@ const struct cpu_cost_table ampere1a_extra_costs = > > } > > }; > > > > +const struct cpu_cost_table ampere1b_extra_costs = > > +{ > > + /* ALU */ > > + { > > + 0, /* arith. */ > > + 0, /* logical. */ > > + 0, /* shift. */ > > + COSTS_N_INSNS (1), /* shift_reg. */ > > + 0, /* arith_shift. */ > > + COSTS_N_INSNS (1), /* arith_shift_reg. */ > > + 0, /* log_shift. */ > > + COSTS_N_INSNS (1), /* log_shift_reg. */ > > + 0, /* extend. */ > > + COSTS_N_INSNS (1), /* extend_arith. */ > > + 0, /* bfi. */ > > + 0, /* bfx. */ > > + 0, /* clz. */ > > + 0, /* rev. */ > > + 0, /* non_exec. */ > > + true /* non_exec_costs_exec. */ > > + }, > > + { > > + /* MULT SImode */ > > + { > > + COSTS_N_INSNS (2), /* simple. */ > > + COSTS_N_INSNS (2), /* flag_setting. */ > > + COSTS_N_INSNS (2), /* extend. */ > > + COSTS_N_INSNS (3), /* add. */ > > + COSTS_N_INSNS (3), /* extend_add. */ > > + COSTS_N_INSNS (12) /* idiv. */ > > + }, > > + /* MULT DImode */ > > + { > > + COSTS_N_INSNS (2), /* simple. */ > > + 0, /* flag_setting (N/A). */ > > + COSTS_N_INSNS (2), /* extend. */ > > + COSTS_N_INSNS (3), /* add. */ > > + COSTS_N_INSNS (3), /* extend_add. */ > > + COSTS_N_INSNS (18) /* idiv. 
*/ > > + } > > + }, > > + /* LD/ST */ > > + { > > + COSTS_N_INSNS (2), /* load. */ > > + COSTS_N_INSNS (2), /* load_sign_extend. */ > > + 0, /* ldrd (n/a). */ > > + 0, /* ldm_1st. */ > > + 0, /* ldm_regs_per_insn_1st. */ > > + 0, /* ldm_regs_per_insn_subsequent. */ > > + COSTS_N_INSNS (3), /* loadf. */ > > + COSTS_N_INSNS (3), /* loadd. */ > > + COSTS_N_INSNS (3), /* load_unaligned. */ > > + 0, /* store. */ > > + 0, /* strd. */ > > + 0, /* stm_1st. */ > > + 0, /* stm_regs_per_insn_1st. */ > > + 0, /* stm_regs_per_insn_subsequent. */ > > + COSTS_N_INSNS (1), /* storef. */ > > + COSTS_N_INSNS (1), /* stored. */ > > + COSTS_N_INSNS (1), /* store_unaligned. */ > > + COSTS_N_INSNS (3), /* loadv. */ > > + COSTS_N_INSNS (3) /* storev. */ > > + }, > > + { > > + /* FP SFmode */ > > + { > > + COSTS_N_INSNS (18), /* div. */ > > + COSTS_N_INSNS (3), /* mult. */ > > + COSTS_N_INSNS (3), /* mult_addsub. */ > > + COSTS_N_INSNS (3), /* fma. */ > > + COSTS_N_INSNS (2), /* addsub. */ > > + COSTS_N_INSNS (1), /* fpconst. */ > > + COSTS_N_INSNS (2), /* neg. */ > > + COSTS_N_INSNS (2), /* compare. */ > > + COSTS_N_INSNS (2), /* widen. */ > > + COSTS_N_INSNS (2), /* narrow. */ > > + COSTS_N_INSNS (6), /* toint. */ > > + COSTS_N_INSNS (4), /* fromint. */ > > + COSTS_N_INSNS (2) /* roundint. */ > > + }, > > + /* FP DFmode */ > > + { > > + COSTS_N_INSNS (18), /* div. */ > > + COSTS_N_INSNS (3), /* mult. */ > > + COSTS_N_INSNS (3), /* mult_addsub. */ > > + COSTS_N_INSNS (3), /* fma. */ > > + COSTS_N_INSNS (2), /* addsub. */ > > + COSTS_N_INSNS (1), /* fpconst. */ > > + COSTS_N_INSNS (2), /* neg. */ > > + COSTS_N_INSNS (2), /* compare. */ > > + COSTS_N_INSNS (2), /* widen. */ > > + COSTS_N_INSNS (2), /* narrow. */ > > + COSTS_N_INSNS (6), /* toint. */ > > + COSTS_N_INSNS (4), /* fromint. */ > > + COSTS_N_INSNS (2) /* roundint. */ > > + } > > + }, > > + /* Vector */ > > + { > > + COSTS_N_INSNS (1), /* alu. */ > > + COSTS_N_INSNS (2), /* mult. */ > > + COSTS_N_INSNS (1), /* movi. 
*/ > > + COSTS_N_INSNS (1), /* dup. */ > > + COSTS_N_INSNS (1) /* extract. */ > > + } > > +}; > > + > > #endif > > diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md > > index 61bb8521125..98e6882d432 100644 > > --- a/gcc/config/aarch64/aarch64-tune.md > > +++ b/gcc/config/aarch64/aarch64-tune.md > > @@ -1,5 +1,5 @@ > > ;; -*- buffer-read-only: t -*- > > ;; Generated automatically by gentune.sh from aarch64-cores.def > > (define_attr "tune" > > - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,neoversev2,demeter,generic,generic_armv8_a,generic_armv9_a" > > + 
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,ampere1b,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,neoversev2,demeter,generic,generic_armv8_a,generic_armv9_a" > > (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > > index f6f6f94bf43..6a6cceb52c7 100644 > > --- a/gcc/config/aarch64/aarch64.cc > > +++ b/gcc/config/aarch64/aarch64.cc > > @@ -375,6 +375,7 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] = > > #include "tuning_models/neoversen1.h" > > #include "tuning_models/ampere1.h" > > #include "tuning_models/ampere1a.h" > > +#include "tuning_models/ampere1b.h" > > #include "tuning_models/neoversev1.h" > > #include "tuning_models/neoverse512tvb.h" > > #include "tuning_models/neoversen2.h" > > diff --git a/gcc/config/aarch64/tuning_models/ampere1b.h b/gcc/config/aarch64/tuning_models/ampere1b.h > > new file mode 100644 > > index 00000000000..c4928f50d29 > > --- /dev/null > > +++ b/gcc/config/aarch64/tuning_models/ampere1b.h > > @@ -0,0 +1,114 @@ > > +/* Tuning model description for the Ampere1B core. > > + Copyright (C) 2023 Free Software Foundation, Inc. > > + > > + This file is part of GCC. 
> > + > > + GCC is free software; you can redistribute it and/or modify it > > + under the terms of the GNU General Public License as published by > > + the Free Software Foundation; either version 3, or (at your option) > > + any later version. > > + > > + GCC is distributed in the hope that it will be useful, but > > + WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + General Public License for more details. > > + > > + You should have received a copy of the GNU General Public License > > + along with GCC; see the file COPYING3. If not see > > + <http://www.gnu.org/licenses/>. */ > > + > > +#ifndef GCC_AARCH64_H_AMPERE1B > > +#define GCC_AARCH64_H_AMPERE1B > > + > > +#include "generic.h" > > + > > +static const cpu_prefetch_tune ampere1b_prefetch_tune = > > +{ > > + 48, /* num_slots */ > > + 64, /* l1_cache_size */ > > + 64, /* l1_cache_line_size */ > > + 2048, /* l2_cache_size */ > > + true, /* prefetch_dynamic_strides */ > > + -1, /* minimum_stride */ > > + -1 /* default_opt_level */ > > +}; > > + > > +static const advsimd_vec_cost ampere1b_advsimd_vector_cost = > > +{ > > + 1, /* int_stmt_cost */ > > + 3, /* fp_stmt_cost */ > > + 0, /* ld2_st2_permute_cost */ > > + 0, /* ld3_st3_permute_cost */ > > + 0, /* ld4_st4_permute_cost */ > > + 2, /* permute_cost */ > > + 8, /* reduc_i8_cost */ > > + 6, /* reduc_i16_cost */ > > + 4, /* reduc_i32_cost */ > > + 2, /* reduc_i64_cost */ > > + 9, /* reduc_f16_cost */ > > + 6, /* reduc_f32_cost */ > > + 3, /* reduc_f64_cost */ > > + 5, /* store_elt_extra_cost */ > > + 5, /* vec_to_scalar_cost */ > > + 5, /* scalar_to_vec_cost */ > > + 4, /* align_load_cost */ > > + 4, /* unalign_load_cost */ > > + 1, /* unalign_store_cost */ > > + 1 /* store_cost */ > > +}; > > + > > +/* Ampere-1B costs for vector insn classes. 
*/ > > +static const struct cpu_vector_cost ampere1b_vector_cost = > > +{ > > + 1, /* scalar_int_stmt_cost */ > > + 3, /* scalar_fp_stmt_cost */ > > + 4, /* scalar_load_cost */ > > + 1, /* scalar_store_cost */ > > + 1, /* cond_taken_branch_cost */ > > + 1, /* cond_not_taken_branch_cost */ > > + &ampere1b_advsimd_vector_cost, /* advsimd */ > > + nullptr, /* sve */ > > + nullptr /* issue_info */ > > +}; > > + > > +static const struct tune_params ampere1b_tunings = > > +{ > > + &ampere1b_extra_costs, > > + &generic_addrcost_table, > > + &generic_regmove_cost, > > + &ampere1b_vector_cost, > > + &generic_branch_cost, > > + &generic_approx_modes, > > + SVE_NOT_IMPLEMENTED, /* sve_width */ > > + { 3, /* load_int. */ > > + 1, /* store_int. */ > > + 4, /* load_fp. */ > > + 4, /* store_fp. */ > > + 4, /* load_pred. */ > > + 4 /* store_pred. */ > > + }, /* memmov_cost. */ > > + 4, /* issue_rate */ > > + (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC | > > + AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK | > > + AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ | > > + AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ | > > + AARCH64_FUSE_ADDSUB_2REG_CONST1), > > + /* fusible_ops */ > > + "32", /* function_align. */ > > + "4", /* jump_align. */ > > + "32:16", /* loop_align. */ > > + 2, /* int_reassoc_width. */ > > + 4, /* fp_reassoc_width. */ > > + 1, /* fma_reassoc_width. */ > > + 2, /* vec_reassoc_width. */ > > + 2, /* min_div_recip_mul_sf. */ > > + 2, /* min_div_recip_mul_df. */ > > + 0, /* max_case_values. */ > > + tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */ > > + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ would you mind testing the performance of AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS? The plan was to make that the default at some stage. 
We are tuning the entire vector costs for the entire Ampere1-family (some changes will be needed due to ongoing work to improve the vectorization of x264 and optimized permute/shuffle sequences) and plan to include AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS throughout. However, this is unlikely to be completed before the end of this year. OK to apply now and revisit later? > > Richard > > > + &ampere1b_prefetch_tune, > > + AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */ > > + AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */ > > +}; > > + > > +#endif /* GCC_AARCH64_H_AMPERE1B */ > > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi > > index 7666de1e9b2..54e4101e7a2 100644 > > --- a/gcc/doc/invoke.texi > > +++ b/gcc/doc/invoke.texi > > @@ -20851,7 +20851,7 @@ performance of the code. Permissible values for this option are: > > @samp{cortex-r82}, @samp{cortex-x1}, @samp{cortex-x1c}, @samp{cortex-x2}, > > @samp{cortex-x3}, @samp{cortex-x4}, @samp{cortex-a510}, @samp{cortex-a520}, > > @samp{cortex-a710}, @samp{cortex-a715}, @samp{cortex-a720}, @samp{ampere1}, > > -@samp{ampere1a}, and @samp{native}. > > +@samp{ampere1a}, @samp{ampere1b}, and @samp{native}. > > > > The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53}, > > @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53},
Philipp Tomsich <philipp.tomsich@vrull.eu> writes: > On Tue, 28 Nov 2023 at 12:21, Richard Sandiford > <richard.sandiford@arm.com> wrote: >> >> Philipp Tomsich <philipp.tomsich@vrull.eu> writes: >> > This patch adds initial support for Ampere-1B core. >> > >> > The Ampere-1B core implements ARMv8.7 with the following (compiler >> > visible) extensions: >> > - CSSC (Common Short Sequence Compression instructions), >> > - MTE (Memory Tagging Extension) >> > - SM3/SM4 >> > >> > gcc/ChangeLog: >> > >> > * config/aarch64/aarch64-cores.def (AARCH64_CORE): Add ampere-1b >> > * config/aarch64/aarch64-cost-tables.h: Add ampere1b_extra_costs >> > * config/aarch64/aarch64-tune.md: Regenerate >> > * config/aarch64/aarch64.cc: Include ampere1b tuning model >> > * doc/invoke.texi: Document -mcpu=ampere1b >> > * config/aarch64/tuning_models/ampere1b.h: New file. >> >> OK, thanks, but: >> >> > >> > Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu> >> > --- >> > >> > Changes in v2: >> > - moved ampere1b model to a separated file >> > - regenerated aarch64-tune.md after rebase >> > >> > gcc/config/aarch64/aarch64-cores.def | 1 + >> > gcc/config/aarch64/aarch64-cost-tables.h | 107 ++++++++++++++++++ >> > gcc/config/aarch64/aarch64-tune.md | 2 +- >> > gcc/config/aarch64/aarch64.cc | 1 + >> > gcc/config/aarch64/tuning_models/ampere1b.h | 114 ++++++++++++++++++++ >> > gcc/doc/invoke.texi | 2 +- >> > 6 files changed, 225 insertions(+), 2 deletions(-) >> > create mode 100644 gcc/config/aarch64/tuning_models/ampere1b.h >> > >> > diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def >> > index 16752b77f4b..ad896a80f1f 100644 >> > --- a/gcc/config/aarch64/aarch64-cores.def >> > +++ b/gcc/config/aarch64/aarch64-cores.def >> > @@ -74,6 +74,7 @@ AARCH64_CORE("thunderxt83", thunderxt83, thunderx, V8A, (CRC, CRYPTO), thu >> > /* Ampere Computing ('\xC0') cores. 
*/ >> > AARCH64_CORE("ampere1", ampere1, cortexa57, V8_6A, (F16, RNG, AES, SHA3), ampere1, 0xC0, 0xac3, -1) >> > AARCH64_CORE("ampere1a", ampere1a, cortexa57, V8_6A, (F16, RNG, AES, SHA3, SM4, MEMTAG), ampere1a, 0xC0, 0xac4, -1) >> > +AARCH64_CORE("ampere1b", ampere1b, cortexa57, V8_7A, (F16, RNG, AES, SHA3, SM4, MEMTAG, CSSC), ampere1b, 0xC0, 0xac5, -1) >> > /* Do not swap around "emag" and "xgene1", >> > this order is required to handle variant correctly. */ >> > AARCH64_CORE("emag", emag, xgene1, V8A, (CRC, CRYPTO), emag, 0x50, 0x000, 3) >> > diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h >> > index 0cb638f3a13..4c8da7f119b 100644 >> > --- a/gcc/config/aarch64/aarch64-cost-tables.h >> > +++ b/gcc/config/aarch64/aarch64-cost-tables.h >> > @@ -882,4 +882,111 @@ const struct cpu_cost_table ampere1a_extra_costs = >> > } >> > }; >> > >> > +const struct cpu_cost_table ampere1b_extra_costs = >> > +{ >> > + /* ALU */ >> > + { >> > + 0, /* arith. */ >> > + 0, /* logical. */ >> > + 0, /* shift. */ >> > + COSTS_N_INSNS (1), /* shift_reg. */ >> > + 0, /* arith_shift. */ >> > + COSTS_N_INSNS (1), /* arith_shift_reg. */ >> > + 0, /* log_shift. */ >> > + COSTS_N_INSNS (1), /* log_shift_reg. */ >> > + 0, /* extend. */ >> > + COSTS_N_INSNS (1), /* extend_arith. */ >> > + 0, /* bfi. */ >> > + 0, /* bfx. */ >> > + 0, /* clz. */ >> > + 0, /* rev. */ >> > + 0, /* non_exec. */ >> > + true /* non_exec_costs_exec. */ >> > + }, >> > + { >> > + /* MULT SImode */ >> > + { >> > + COSTS_N_INSNS (2), /* simple. */ >> > + COSTS_N_INSNS (2), /* flag_setting. */ >> > + COSTS_N_INSNS (2), /* extend. */ >> > + COSTS_N_INSNS (3), /* add. */ >> > + COSTS_N_INSNS (3), /* extend_add. */ >> > + COSTS_N_INSNS (12) /* idiv. */ >> > + }, >> > + /* MULT DImode */ >> > + { >> > + COSTS_N_INSNS (2), /* simple. */ >> > + 0, /* flag_setting (N/A). */ >> > + COSTS_N_INSNS (2), /* extend. */ >> > + COSTS_N_INSNS (3), /* add. 
*/ >> > + COSTS_N_INSNS (3), /* extend_add. */ >> > + COSTS_N_INSNS (18) /* idiv. */ >> > + } >> > + }, >> > + /* LD/ST */ >> > + { >> > + COSTS_N_INSNS (2), /* load. */ >> > + COSTS_N_INSNS (2), /* load_sign_extend. */ >> > + 0, /* ldrd (n/a). */ >> > + 0, /* ldm_1st. */ >> > + 0, /* ldm_regs_per_insn_1st. */ >> > + 0, /* ldm_regs_per_insn_subsequent. */ >> > + COSTS_N_INSNS (3), /* loadf. */ >> > + COSTS_N_INSNS (3), /* loadd. */ >> > + COSTS_N_INSNS (3), /* load_unaligned. */ >> > + 0, /* store. */ >> > + 0, /* strd. */ >> > + 0, /* stm_1st. */ >> > + 0, /* stm_regs_per_insn_1st. */ >> > + 0, /* stm_regs_per_insn_subsequent. */ >> > + COSTS_N_INSNS (1), /* storef. */ >> > + COSTS_N_INSNS (1), /* stored. */ >> > + COSTS_N_INSNS (1), /* store_unaligned. */ >> > + COSTS_N_INSNS (3), /* loadv. */ >> > + COSTS_N_INSNS (3) /* storev. */ >> > + }, >> > + { >> > + /* FP SFmode */ >> > + { >> > + COSTS_N_INSNS (18), /* div. */ >> > + COSTS_N_INSNS (3), /* mult. */ >> > + COSTS_N_INSNS (3), /* mult_addsub. */ >> > + COSTS_N_INSNS (3), /* fma. */ >> > + COSTS_N_INSNS (2), /* addsub. */ >> > + COSTS_N_INSNS (1), /* fpconst. */ >> > + COSTS_N_INSNS (2), /* neg. */ >> > + COSTS_N_INSNS (2), /* compare. */ >> > + COSTS_N_INSNS (2), /* widen. */ >> > + COSTS_N_INSNS (2), /* narrow. */ >> > + COSTS_N_INSNS (6), /* toint. */ >> > + COSTS_N_INSNS (4), /* fromint. */ >> > + COSTS_N_INSNS (2) /* roundint. */ >> > + }, >> > + /* FP DFmode */ >> > + { >> > + COSTS_N_INSNS (18), /* div. */ >> > + COSTS_N_INSNS (3), /* mult. */ >> > + COSTS_N_INSNS (3), /* mult_addsub. */ >> > + COSTS_N_INSNS (3), /* fma. */ >> > + COSTS_N_INSNS (2), /* addsub. */ >> > + COSTS_N_INSNS (1), /* fpconst. */ >> > + COSTS_N_INSNS (2), /* neg. */ >> > + COSTS_N_INSNS (2), /* compare. */ >> > + COSTS_N_INSNS (2), /* widen. */ >> > + COSTS_N_INSNS (2), /* narrow. */ >> > + COSTS_N_INSNS (6), /* toint. */ >> > + COSTS_N_INSNS (4), /* fromint. */ >> > + COSTS_N_INSNS (2) /* roundint. 
*/ >> > + } >> > + }, >> > + /* Vector */ >> > + { >> > + COSTS_N_INSNS (1), /* alu. */ >> > + COSTS_N_INSNS (2), /* mult. */ >> > + COSTS_N_INSNS (1), /* movi. */ >> > + COSTS_N_INSNS (1), /* dup. */ >> > + COSTS_N_INSNS (1) /* extract. */ >> > + } >> > +}; >> > + >> > #endif >> > diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md >> > index 61bb8521125..98e6882d432 100644 >> > --- a/gcc/config/aarch64/aarch64-tune.md >> > +++ b/gcc/config/aarch64/aarch64-tune.md >> > @@ -1,5 +1,5 @@ >> > ;; -*- buffer-read-only: t -*- >> > ;; Generated automatically by gentune.sh from aarch64-cores.def >> > (define_attr "tune" >> > - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,neoversev2,demeter,generic,generic_armv8_a,generic_armv9_a" >> > + 
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,ampere1b,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,neoversev2,demeter,generic,generic_armv8_a,generic_armv9_a" >> > (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) >> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc >> > index f6f6f94bf43..6a6cceb52c7 100644 >> > --- a/gcc/config/aarch64/aarch64.cc >> > +++ b/gcc/config/aarch64/aarch64.cc >> > @@ -375,6 +375,7 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] = >> > #include "tuning_models/neoversen1.h" >> > #include "tuning_models/ampere1.h" >> > #include "tuning_models/ampere1a.h" >> > +#include "tuning_models/ampere1b.h" >> > #include "tuning_models/neoversev1.h" >> > #include "tuning_models/neoverse512tvb.h" >> > #include "tuning_models/neoversen2.h" >> > diff --git a/gcc/config/aarch64/tuning_models/ampere1b.h b/gcc/config/aarch64/tuning_models/ampere1b.h >> > new file mode 100644 >> > index 00000000000..c4928f50d29 >> > --- /dev/null >> > +++ b/gcc/config/aarch64/tuning_models/ampere1b.h >> > @@ -0,0 +1,114 @@ >> > +/* Tuning model description for the Ampere1B core. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + >> > + This file is part of GCC. 
>> > + >> > + GCC is free software; you can redistribute it and/or modify it >> > + under the terms of the GNU General Public License as published by >> > + the Free Software Foundation; either version 3, or (at your option) >> > + any later version. >> > + >> > + GCC is distributed in the hope that it will be useful, but >> > + WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + General Public License for more details. >> > + >> > + You should have received a copy of the GNU General Public License >> > + along with GCC; see the file COPYING3. If not see >> > + <http://www.gnu.org/licenses/>. */ >> > + >> > +#ifndef GCC_AARCH64_H_AMPERE1B >> > +#define GCC_AARCH64_H_AMPERE1B >> > + >> > +#include "generic.h" >> > + >> > +static const cpu_prefetch_tune ampere1b_prefetch_tune = >> > +{ >> > + 48, /* num_slots */ >> > + 64, /* l1_cache_size */ >> > + 64, /* l1_cache_line_size */ >> > + 2048, /* l2_cache_size */ >> > + true, /* prefetch_dynamic_strides */ >> > + -1, /* minimum_stride */ >> > + -1 /* default_opt_level */ >> > +}; >> > + >> > +static const advsimd_vec_cost ampere1b_advsimd_vector_cost = >> > +{ >> > + 1, /* int_stmt_cost */ >> > + 3, /* fp_stmt_cost */ >> > + 0, /* ld2_st2_permute_cost */ >> > + 0, /* ld3_st3_permute_cost */ >> > + 0, /* ld4_st4_permute_cost */ >> > + 2, /* permute_cost */ >> > + 8, /* reduc_i8_cost */ >> > + 6, /* reduc_i16_cost */ >> > + 4, /* reduc_i32_cost */ >> > + 2, /* reduc_i64_cost */ >> > + 9, /* reduc_f16_cost */ >> > + 6, /* reduc_f32_cost */ >> > + 3, /* reduc_f64_cost */ >> > + 5, /* store_elt_extra_cost */ >> > + 5, /* vec_to_scalar_cost */ >> > + 5, /* scalar_to_vec_cost */ >> > + 4, /* align_load_cost */ >> > + 4, /* unalign_load_cost */ >> > + 1, /* unalign_store_cost */ >> > + 1 /* store_cost */ >> > +}; >> > + >> > +/* Ampere-1B costs for vector insn classes. 
*/ >> > +static const struct cpu_vector_cost ampere1b_vector_cost = >> > +{ >> > + 1, /* scalar_int_stmt_cost */ >> > + 3, /* scalar_fp_stmt_cost */ >> > + 4, /* scalar_load_cost */ >> > + 1, /* scalar_store_cost */ >> > + 1, /* cond_taken_branch_cost */ >> > + 1, /* cond_not_taken_branch_cost */ >> > + &ampere1b_advsimd_vector_cost, /* advsimd */ >> > + nullptr, /* sve */ >> > + nullptr /* issue_info */ >> > +}; >> > + >> > +static const struct tune_params ampere1b_tunings = >> > +{ >> > + &ampere1b_extra_costs, >> > + &generic_addrcost_table, >> > + &generic_regmove_cost, >> > + &ampere1b_vector_cost, >> > + &generic_branch_cost, >> > + &generic_approx_modes, >> > + SVE_NOT_IMPLEMENTED, /* sve_width */ >> > + { 3, /* load_int. */ >> > + 1, /* store_int. */ >> > + 4, /* load_fp. */ >> > + 4, /* store_fp. */ >> > + 4, /* load_pred. */ >> > + 4 /* store_pred. */ >> > + }, /* memmov_cost. */ >> > + 4, /* issue_rate */ >> > + (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC | >> > + AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK | >> > + AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ | >> > + AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ | >> > + AARCH64_FUSE_ADDSUB_2REG_CONST1), >> > + /* fusible_ops */ >> > + "32", /* function_align. */ >> > + "4", /* jump_align. */ >> > + "32:16", /* loop_align. */ >> > + 2, /* int_reassoc_width. */ >> > + 4, /* fp_reassoc_width. */ >> > + 1, /* fma_reassoc_width. */ >> > + 2, /* vec_reassoc_width. */ >> > + 2, /* min_div_recip_mul_sf. */ >> > + 2, /* min_div_recip_mul_df. */ >> > + 0, /* max_case_values. */ >> > + tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */ >> > + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ >> >> would you mind testing the performance of >> AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS? The plan was to make that >> the default at some stage. 
> > We are tuning the entire vector costs for the entire Ampere1-family > (some changes will be needed due to ongoing work to improve the > vectorization of x264 and optimized permute/shuffle sequences) and > plan to include AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS throughout. > However, this is unlikely to be completed before the end of this year. > > OK to apply now and revisit later? Yeah. I meant the testing above to be a separate thing, just for information. I imagined that at some point we'd simply remove AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS and use the new costs unconditionally for all cores. I was just curious what effect that would have on this core (without any further vector tuning beyond what's in the current patch). Thanks, Richard
Applied to master, thanks! Philipp. On Tue, 28 Nov 2023 at 12:57, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Philipp Tomsich <philipp.tomsich@vrull.eu> writes: > > On Tue, 28 Nov 2023 at 12:21, Richard Sandiford > > <richard.sandiford@arm.com> wrote: > >> > >> Philipp Tomsich <philipp.tomsich@vrull.eu> writes: > >> > This patch adds initial support for Ampere-1B core. > >> > > >> > The Ampere-1B core implements ARMv8.7 with the following (compiler > >> > visible) extensions: > >> > - CSSC (Common Short Sequence Compression instructions), > >> > - MTE (Memory Tagging Extension) > >> > - SM3/SM4 > >> > > >> > gcc/ChangeLog: > >> > > >> > * config/aarch64/aarch64-cores.def (AARCH64_CORE): Add ampere-1b > >> > * config/aarch64/aarch64-cost-tables.h: Add ampere1b_extra_costs > >> > * config/aarch64/aarch64-tune.md: Regenerate > >> > * config/aarch64/aarch64.cc: Include ampere1b tuning model > >> > * doc/invoke.texi: Document -mcpu=ampere1b > >> > * config/aarch64/tuning_models/ampere1b.h: New file. 
> >> > >> OK, thanks, but: > >> > >> > > >> > Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu> > >> > --- > >> > > >> > Changes in v2: > >> > - moved ampere1b model to a separated file > >> > - regenerated aarch64-tune.md after rebase > >> > > >> > gcc/config/aarch64/aarch64-cores.def | 1 + > >> > gcc/config/aarch64/aarch64-cost-tables.h | 107 ++++++++++++++++++ > >> > gcc/config/aarch64/aarch64-tune.md | 2 +- > >> > gcc/config/aarch64/aarch64.cc | 1 + > >> > gcc/config/aarch64/tuning_models/ampere1b.h | 114 ++++++++++++++++++++ > >> > gcc/doc/invoke.texi | 2 +- > >> > 6 files changed, 225 insertions(+), 2 deletions(-) > >> > create mode 100644 gcc/config/aarch64/tuning_models/ampere1b.h > >> > > >> > diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def > >> > index 16752b77f4b..ad896a80f1f 100644 > >> > --- a/gcc/config/aarch64/aarch64-cores.def > >> > +++ b/gcc/config/aarch64/aarch64-cores.def > >> > @@ -74,6 +74,7 @@ AARCH64_CORE("thunderxt83", thunderxt83, thunderx, V8A, (CRC, CRYPTO), thu > >> > /* Ampere Computing ('\xC0') cores. */ > >> > AARCH64_CORE("ampere1", ampere1, cortexa57, V8_6A, (F16, RNG, AES, SHA3), ampere1, 0xC0, 0xac3, -1) > >> > AARCH64_CORE("ampere1a", ampere1a, cortexa57, V8_6A, (F16, RNG, AES, SHA3, SM4, MEMTAG), ampere1a, 0xC0, 0xac4, -1) > >> > +AARCH64_CORE("ampere1b", ampere1b, cortexa57, V8_7A, (F16, RNG, AES, SHA3, SM4, MEMTAG, CSSC), ampere1b, 0xC0, 0xac5, -1) > >> > /* Do not swap around "emag" and "xgene1", > >> > this order is required to handle variant correctly. 
*/ > >> > AARCH64_CORE("emag", emag, xgene1, V8A, (CRC, CRYPTO), emag, 0x50, 0x000, 3) > >> > diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h > >> > index 0cb638f3a13..4c8da7f119b 100644 > >> > --- a/gcc/config/aarch64/aarch64-cost-tables.h > >> > +++ b/gcc/config/aarch64/aarch64-cost-tables.h > >> > @@ -882,4 +882,111 @@ const struct cpu_cost_table ampere1a_extra_costs = > >> > } > >> > }; > >> > > >> > +const struct cpu_cost_table ampere1b_extra_costs = > >> > +{ > >> > + /* ALU */ > >> > + { > >> > + 0, /* arith. */ > >> > + 0, /* logical. */ > >> > + 0, /* shift. */ > >> > + COSTS_N_INSNS (1), /* shift_reg. */ > >> > + 0, /* arith_shift. */ > >> > + COSTS_N_INSNS (1), /* arith_shift_reg. */ > >> > + 0, /* log_shift. */ > >> > + COSTS_N_INSNS (1), /* log_shift_reg. */ > >> > + 0, /* extend. */ > >> > + COSTS_N_INSNS (1), /* extend_arith. */ > >> > + 0, /* bfi. */ > >> > + 0, /* bfx. */ > >> > + 0, /* clz. */ > >> > + 0, /* rev. */ > >> > + 0, /* non_exec. */ > >> > + true /* non_exec_costs_exec. */ > >> > + }, > >> > + { > >> > + /* MULT SImode */ > >> > + { > >> > + COSTS_N_INSNS (2), /* simple. */ > >> > + COSTS_N_INSNS (2), /* flag_setting. */ > >> > + COSTS_N_INSNS (2), /* extend. */ > >> > + COSTS_N_INSNS (3), /* add. */ > >> > + COSTS_N_INSNS (3), /* extend_add. */ > >> > + COSTS_N_INSNS (12) /* idiv. */ > >> > + }, > >> > + /* MULT DImode */ > >> > + { > >> > + COSTS_N_INSNS (2), /* simple. */ > >> > + 0, /* flag_setting (N/A). */ > >> > + COSTS_N_INSNS (2), /* extend. */ > >> > + COSTS_N_INSNS (3), /* add. */ > >> > + COSTS_N_INSNS (3), /* extend_add. */ > >> > + COSTS_N_INSNS (18) /* idiv. */ > >> > + } > >> > + }, > >> > + /* LD/ST */ > >> > + { > >> > + COSTS_N_INSNS (2), /* load. */ > >> > + COSTS_N_INSNS (2), /* load_sign_extend. */ > >> > + 0, /* ldrd (n/a). */ > >> > + 0, /* ldm_1st. */ > >> > + 0, /* ldm_regs_per_insn_1st. */ > >> > + 0, /* ldm_regs_per_insn_subsequent. 
*/ > >> > + COSTS_N_INSNS (3), /* loadf. */ > >> > + COSTS_N_INSNS (3), /* loadd. */ > >> > + COSTS_N_INSNS (3), /* load_unaligned. */ > >> > + 0, /* store. */ > >> > + 0, /* strd. */ > >> > + 0, /* stm_1st. */ > >> > + 0, /* stm_regs_per_insn_1st. */ > >> > + 0, /* stm_regs_per_insn_subsequent. */ > >> > + COSTS_N_INSNS (1), /* storef. */ > >> > + COSTS_N_INSNS (1), /* stored. */ > >> > + COSTS_N_INSNS (1), /* store_unaligned. */ > >> > + COSTS_N_INSNS (3), /* loadv. */ > >> > + COSTS_N_INSNS (3) /* storev. */ > >> > + }, > >> > + { > >> > + /* FP SFmode */ > >> > + { > >> > + COSTS_N_INSNS (18), /* div. */ > >> > + COSTS_N_INSNS (3), /* mult. */ > >> > + COSTS_N_INSNS (3), /* mult_addsub. */ > >> > + COSTS_N_INSNS (3), /* fma. */ > >> > + COSTS_N_INSNS (2), /* addsub. */ > >> > + COSTS_N_INSNS (1), /* fpconst. */ > >> > + COSTS_N_INSNS (2), /* neg. */ > >> > + COSTS_N_INSNS (2), /* compare. */ > >> > + COSTS_N_INSNS (2), /* widen. */ > >> > + COSTS_N_INSNS (2), /* narrow. */ > >> > + COSTS_N_INSNS (6), /* toint. */ > >> > + COSTS_N_INSNS (4), /* fromint. */ > >> > + COSTS_N_INSNS (2) /* roundint. */ > >> > + }, > >> > + /* FP DFmode */ > >> > + { > >> > + COSTS_N_INSNS (18), /* div. */ > >> > + COSTS_N_INSNS (3), /* mult. */ > >> > + COSTS_N_INSNS (3), /* mult_addsub. */ > >> > + COSTS_N_INSNS (3), /* fma. */ > >> > + COSTS_N_INSNS (2), /* addsub. */ > >> > + COSTS_N_INSNS (1), /* fpconst. */ > >> > + COSTS_N_INSNS (2), /* neg. */ > >> > + COSTS_N_INSNS (2), /* compare. */ > >> > + COSTS_N_INSNS (2), /* widen. */ > >> > + COSTS_N_INSNS (2), /* narrow. */ > >> > + COSTS_N_INSNS (6), /* toint. */ > >> > + COSTS_N_INSNS (4), /* fromint. */ > >> > + COSTS_N_INSNS (2) /* roundint. */ > >> > + } > >> > + }, > >> > + /* Vector */ > >> > + { > >> > + COSTS_N_INSNS (1), /* alu. */ > >> > + COSTS_N_INSNS (2), /* mult. */ > >> > + COSTS_N_INSNS (1), /* movi. */ > >> > + COSTS_N_INSNS (1), /* dup. */ > >> > + COSTS_N_INSNS (1) /* extract. 
*/ > >> > + } > >> > +}; > >> > + > >> > #endif > >> > diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md > >> > index 61bb8521125..98e6882d432 100644 > >> > --- a/gcc/config/aarch64/aarch64-tune.md > >> > +++ b/gcc/config/aarch64/aarch64-tune.md > >> > @@ -1,5 +1,5 @@ > >> > ;; -*- buffer-read-only: t -*- > >> > ;; Generated automatically by gentune.sh from aarch64-cores.def > >> > (define_attr "tune" > >> > - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,neoversev2,demeter,generic,generic_armv8_a,generic_armv9_a" > >> > + 
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,ampere1b,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,neoversev2,demeter,generic,generic_armv8_a,generic_armv9_a" > >> > (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) > >> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > >> > index f6f6f94bf43..6a6cceb52c7 100644 > >> > --- a/gcc/config/aarch64/aarch64.cc > >> > +++ b/gcc/config/aarch64/aarch64.cc > >> > @@ -375,6 +375,7 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] = > >> > #include "tuning_models/neoversen1.h" > >> > #include "tuning_models/ampere1.h" > >> > #include "tuning_models/ampere1a.h" > >> > +#include "tuning_models/ampere1b.h" > >> > #include "tuning_models/neoversev1.h" > >> > #include "tuning_models/neoverse512tvb.h" > >> > #include "tuning_models/neoversen2.h" > >> > diff --git a/gcc/config/aarch64/tuning_models/ampere1b.h b/gcc/config/aarch64/tuning_models/ampere1b.h > >> > new file mode 100644 > >> > index 00000000000..c4928f50d29 > >> > --- /dev/null > >> > +++ b/gcc/config/aarch64/tuning_models/ampere1b.h > >> > @@ -0,0 +1,114 @@ > >> > +/* Tuning model description for the Ampere1B core. > >> > + Copyright (C) 2023 Free Software Foundation, Inc. > >> > + > >> > + This file is part of GCC. 
> >> > + > >> > + GCC is free software; you can redistribute it and/or modify it > >> > + under the terms of the GNU General Public License as published by > >> > + the Free Software Foundation; either version 3, or (at your option) > >> > + any later version. > >> > + > >> > + GCC is distributed in the hope that it will be useful, but > >> > + WITHOUT ANY WARRANTY; without even the implied warranty of > >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >> > + General Public License for more details. > >> > + > >> > + You should have received a copy of the GNU General Public License > >> > + along with GCC; see the file COPYING3. If not see > >> > + <http://www.gnu.org/licenses/>. */ > >> > + > >> > +#ifndef GCC_AARCH64_H_AMPERE1B > >> > +#define GCC_AARCH64_H_AMPERE1B > >> > + > >> > +#include "generic.h" > >> > + > >> > +static const cpu_prefetch_tune ampere1b_prefetch_tune = > >> > +{ > >> > + 48, /* num_slots */ > >> > + 64, /* l1_cache_size */ > >> > + 64, /* l1_cache_line_size */ > >> > + 2048, /* l2_cache_size */ > >> > + true, /* prefetch_dynamic_strides */ > >> > + -1, /* minimum_stride */ > >> > + -1 /* default_opt_level */ > >> > +}; > >> > + > >> > +static const advsimd_vec_cost ampere1b_advsimd_vector_cost = > >> > +{ > >> > + 1, /* int_stmt_cost */ > >> > + 3, /* fp_stmt_cost */ > >> > + 0, /* ld2_st2_permute_cost */ > >> > + 0, /* ld3_st3_permute_cost */ > >> > + 0, /* ld4_st4_permute_cost */ > >> > + 2, /* permute_cost */ > >> > + 8, /* reduc_i8_cost */ > >> > + 6, /* reduc_i16_cost */ > >> > + 4, /* reduc_i32_cost */ > >> > + 2, /* reduc_i64_cost */ > >> > + 9, /* reduc_f16_cost */ > >> > + 6, /* reduc_f32_cost */ > >> > + 3, /* reduc_f64_cost */ > >> > + 5, /* store_elt_extra_cost */ > >> > + 5, /* vec_to_scalar_cost */ > >> > + 5, /* scalar_to_vec_cost */ > >> > + 4, /* align_load_cost */ > >> > + 4, /* unalign_load_cost */ > >> > + 1, /* unalign_store_cost */ > >> > + 1 /* store_cost */ > >> > +}; > >> > + > >> > +/* 
Ampere-1B costs for vector insn classes. */ > >> > +static const struct cpu_vector_cost ampere1b_vector_cost = > >> > +{ > >> > + 1, /* scalar_int_stmt_cost */ > >> > + 3, /* scalar_fp_stmt_cost */ > >> > + 4, /* scalar_load_cost */ > >> > + 1, /* scalar_store_cost */ > >> > + 1, /* cond_taken_branch_cost */ > >> > + 1, /* cond_not_taken_branch_cost */ > >> > + &ampere1b_advsimd_vector_cost, /* advsimd */ > >> > + nullptr, /* sve */ > >> > + nullptr /* issue_info */ > >> > +}; > >> > + > >> > +static const struct tune_params ampere1b_tunings = > >> > +{ > >> > + &ampere1b_extra_costs, > >> > + &generic_addrcost_table, > >> > + &generic_regmove_cost, > >> > + &ampere1b_vector_cost, > >> > + &generic_branch_cost, > >> > + &generic_approx_modes, > >> > + SVE_NOT_IMPLEMENTED, /* sve_width */ > >> > + { 3, /* load_int. */ > >> > + 1, /* store_int. */ > >> > + 4, /* load_fp. */ > >> > + 4, /* store_fp. */ > >> > + 4, /* load_pred. */ > >> > + 4 /* store_pred. */ > >> > + }, /* memmov_cost. */ > >> > + 4, /* issue_rate */ > >> > + (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC | > >> > + AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK | > >> > + AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ | > >> > + AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ | > >> > + AARCH64_FUSE_ADDSUB_2REG_CONST1), > >> > + /* fusible_ops */ > >> > + "32", /* function_align. */ > >> > + "4", /* jump_align. */ > >> > + "32:16", /* loop_align. */ > >> > + 2, /* int_reassoc_width. */ > >> > + 4, /* fp_reassoc_width. */ > >> > + 1, /* fma_reassoc_width. */ > >> > + 2, /* vec_reassoc_width. */ > >> > + 2, /* min_div_recip_mul_sf. */ > >> > + 2, /* min_div_recip_mul_df. */ > >> > + 0, /* max_case_values. */ > >> > + tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */ > >> > + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ > >> > >> would you mind testing the performance of > >> AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS? 
The plan was to make that > >> the default at some stage. > > > > We are tuning the entire vector costs for the entire Ampere1-family > > (some changes will be needed due to ongoing work to improve the > > vectorization of x264 and optimized permute/shuffle sequences) and > > plan to include AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS throughout. > > However, this is unlikely to be completed before the end of this year. > > > > OK to apply now and revisit later? > > Yeah. I meant the testing above to be a separate thing, just for > information. I imagined that at some point we'd simply remove > AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS and use the new costs > unconditionally for all cores. I was just curious what effect that > would have on this core (without any further vector tuning beyond > what's in the current patch). > > Thanks, > Richard
diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index 16752b77f4b..ad896a80f1f 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -74,6 +74,7 @@ AARCH64_CORE("thunderxt83", thunderxt83, thunderx, V8A, (CRC, CRYPTO), thu /* Ampere Computing ('\xC0') cores. */ AARCH64_CORE("ampere1", ampere1, cortexa57, V8_6A, (F16, RNG, AES, SHA3), ampere1, 0xC0, 0xac3, -1) AARCH64_CORE("ampere1a", ampere1a, cortexa57, V8_6A, (F16, RNG, AES, SHA3, SM4, MEMTAG), ampere1a, 0xC0, 0xac4, -1) +AARCH64_CORE("ampere1b", ampere1b, cortexa57, V8_7A, (F16, RNG, AES, SHA3, SM4, MEMTAG, CSSC), ampere1b, 0xC0, 0xac5, -1) /* Do not swap around "emag" and "xgene1", this order is required to handle variant correctly. */ AARCH64_CORE("emag", emag, xgene1, V8A, (CRC, CRYPTO), emag, 0x50, 0x000, 3) diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h index 0cb638f3a13..4c8da7f119b 100644 --- a/gcc/config/aarch64/aarch64-cost-tables.h +++ b/gcc/config/aarch64/aarch64-cost-tables.h @@ -882,4 +882,111 @@ const struct cpu_cost_table ampere1a_extra_costs = } }; +const struct cpu_cost_table ampere1b_extra_costs = +{ + /* ALU */ + { + 0, /* arith. */ + 0, /* logical. */ + 0, /* shift. */ + COSTS_N_INSNS (1), /* shift_reg. */ + 0, /* arith_shift. */ + COSTS_N_INSNS (1), /* arith_shift_reg. */ + 0, /* log_shift. */ + COSTS_N_INSNS (1), /* log_shift_reg. */ + 0, /* extend. */ + COSTS_N_INSNS (1), /* extend_arith. */ + 0, /* bfi. */ + 0, /* bfx. */ + 0, /* clz. */ + 0, /* rev. */ + 0, /* non_exec. */ + true /* non_exec_costs_exec. */ + }, + { + /* MULT SImode */ + { + COSTS_N_INSNS (2), /* simple. */ + COSTS_N_INSNS (2), /* flag_setting. */ + COSTS_N_INSNS (2), /* extend. */ + COSTS_N_INSNS (3), /* add. */ + COSTS_N_INSNS (3), /* extend_add. */ + COSTS_N_INSNS (12) /* idiv. */ + }, + /* MULT DImode */ + { + COSTS_N_INSNS (2), /* simple. */ + 0, /* flag_setting (N/A). 
*/ + COSTS_N_INSNS (2), /* extend. */ + COSTS_N_INSNS (3), /* add. */ + COSTS_N_INSNS (3), /* extend_add. */ + COSTS_N_INSNS (18) /* idiv. */ + } + }, + /* LD/ST */ + { + COSTS_N_INSNS (2), /* load. */ + COSTS_N_INSNS (2), /* load_sign_extend. */ + 0, /* ldrd (n/a). */ + 0, /* ldm_1st. */ + 0, /* ldm_regs_per_insn_1st. */ + 0, /* ldm_regs_per_insn_subsequent. */ + COSTS_N_INSNS (3), /* loadf. */ + COSTS_N_INSNS (3), /* loadd. */ + COSTS_N_INSNS (3), /* load_unaligned. */ + 0, /* store. */ + 0, /* strd. */ + 0, /* stm_1st. */ + 0, /* stm_regs_per_insn_1st. */ + 0, /* stm_regs_per_insn_subsequent. */ + COSTS_N_INSNS (1), /* storef. */ + COSTS_N_INSNS (1), /* stored. */ + COSTS_N_INSNS (1), /* store_unaligned. */ + COSTS_N_INSNS (3), /* loadv. */ + COSTS_N_INSNS (3) /* storev. */ + }, + { + /* FP SFmode */ + { + COSTS_N_INSNS (18), /* div. */ + COSTS_N_INSNS (3), /* mult. */ + COSTS_N_INSNS (3), /* mult_addsub. */ + COSTS_N_INSNS (3), /* fma. */ + COSTS_N_INSNS (2), /* addsub. */ + COSTS_N_INSNS (1), /* fpconst. */ + COSTS_N_INSNS (2), /* neg. */ + COSTS_N_INSNS (2), /* compare. */ + COSTS_N_INSNS (2), /* widen. */ + COSTS_N_INSNS (2), /* narrow. */ + COSTS_N_INSNS (6), /* toint. */ + COSTS_N_INSNS (4), /* fromint. */ + COSTS_N_INSNS (2) /* roundint. */ + }, + /* FP DFmode */ + { + COSTS_N_INSNS (18), /* div. */ + COSTS_N_INSNS (3), /* mult. */ + COSTS_N_INSNS (3), /* mult_addsub. */ + COSTS_N_INSNS (3), /* fma. */ + COSTS_N_INSNS (2), /* addsub. */ + COSTS_N_INSNS (1), /* fpconst. */ + COSTS_N_INSNS (2), /* neg. */ + COSTS_N_INSNS (2), /* compare. */ + COSTS_N_INSNS (2), /* widen. */ + COSTS_N_INSNS (2), /* narrow. */ + COSTS_N_INSNS (6), /* toint. */ + COSTS_N_INSNS (4), /* fromint. */ + COSTS_N_INSNS (2) /* roundint. */ + } + }, + /* Vector */ + { + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (2), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. 
*/ + } +}; + #endif diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md index 61bb8521125..98e6882d432 100644 --- a/gcc/config/aarch64/aarch64-tune.md +++ b/gcc/config/aarch64/aarch64-tune.md @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from aarch64-cores.def (define_attr "tune" - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,neoversev2,demeter,generic,generic_armv8_a,generic_armv9_a" + 
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,ampere1b,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,neoversev2,demeter,generic,generic_armv8_a,generic_armv9_a" (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index f6f6f94bf43..6a6cceb52c7 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -375,6 +375,7 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] = #include "tuning_models/neoversen1.h" #include "tuning_models/ampere1.h" #include "tuning_models/ampere1a.h" +#include "tuning_models/ampere1b.h" #include "tuning_models/neoversev1.h" #include "tuning_models/neoverse512tvb.h" #include "tuning_models/neoversen2.h" diff --git a/gcc/config/aarch64/tuning_models/ampere1b.h b/gcc/config/aarch64/tuning_models/ampere1b.h new file mode 100644 index 00000000000..c4928f50d29 --- /dev/null +++ b/gcc/config/aarch64/tuning_models/ampere1b.h @@ -0,0 +1,114 @@ +/* Tuning model description for the Ampere1B core. + Copyright (C) 2023 Free Software Foundation, Inc. + + This file is part of GCC. 
+ + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +#ifndef GCC_AARCH64_H_AMPERE1B +#define GCC_AARCH64_H_AMPERE1B + +#include "generic.h" + +static const cpu_prefetch_tune ampere1b_prefetch_tune = +{ + 48, /* num_slots */ + 64, /* l1_cache_size */ + 64, /* l1_cache_line_size */ + 2048, /* l2_cache_size */ + true, /* prefetch_dynamic_strides */ + -1, /* minimum_stride */ + -1 /* default_opt_level */ +}; + +static const advsimd_vec_cost ampere1b_advsimd_vector_cost = +{ + 1, /* int_stmt_cost */ + 3, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ + 2, /* permute_cost */ + 8, /* reduc_i8_cost */ + 6, /* reduc_i16_cost */ + 4, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 9, /* reduc_f16_cost */ + 6, /* reduc_f32_cost */ + 3, /* reduc_f64_cost */ + 5, /* store_elt_extra_cost */ + 5, /* vec_to_scalar_cost */ + 5, /* scalar_to_vec_cost */ + 4, /* align_load_cost */ + 4, /* unalign_load_cost */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ +}; + +/* Ampere-1B costs for vector insn classes. 
*/ +static const struct cpu_vector_cost ampere1b_vector_cost = +{ + 1, /* scalar_int_stmt_cost */ + 3, /* scalar_fp_stmt_cost */ + 4, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 1, /* cond_taken_branch_cost */ + 1, /* cond_not_taken_branch_cost */ + &ampere1b_advsimd_vector_cost, /* advsimd */ + nullptr, /* sve */ + nullptr /* issue_info */ +}; + +static const struct tune_params ampere1b_tunings = +{ + &ampere1b_extra_costs, + &generic_addrcost_table, + &generic_regmove_cost, + &ampere1b_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ + { 3, /* load_int. */ + 1, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ + 4, /* issue_rate */ + (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC | + AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK | + AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ | + AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ | + AARCH64_FUSE_ADDSUB_2REG_CONST1), + /* fusible_ops */ + "32", /* function_align. */ + "4", /* jump_align. */ + "32:16", /* loop_align. */ + 2, /* int_reassoc_width. */ + 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ + 2, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ + &ampere1b_prefetch_tune, + AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */ + AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */ +}; + +#endif /* GCC_AARCH64_H_AMPERE1B */ diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 7666de1e9b2..54e4101e7a2 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -20851,7 +20851,7 @@ performance of the code. 
Permissible values for this option are: @samp{cortex-r82}, @samp{cortex-x1}, @samp{cortex-x1c}, @samp{cortex-x2}, @samp{cortex-x3}, @samp{cortex-x4}, @samp{cortex-a510}, @samp{cortex-a520}, @samp{cortex-a710}, @samp{cortex-a715}, @samp{cortex-a720}, @samp{ampere1}, -@samp{ampere1a}, and @samp{native}. +@samp{ampere1a}, @samp{ampere1b}, and @samp{native}. The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53}, @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53},
This patch adds initial support for Ampere-1B core. The Ampere-1B core implements ARMv8.7 with the following (compiler visible) extensions: - CSSC (Common Short Sequence Compression instructions), - MTE (Memory Tagging Extension) - SM3/SM4 gcc/ChangeLog: * config/aarch64/aarch64-cores.def (AARCH64_CORE): Add ampere-1b * config/aarch64/aarch64-cost-tables.h: Add ampere1b_extra_costs * config/aarch64/aarch64-tune.md: Regenerate * config/aarch64/aarch64.cc: Include ampere1b tuning model * doc/invoke.texi: Document -mcpu=ampere1b * config/aarch64/tuning_models/ampere1b.h: New file. Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu> --- Changes in v2: - moved ampere1b model to a separated file - regenerated aarch64-tune.md after rebase gcc/config/aarch64/aarch64-cores.def | 1 + gcc/config/aarch64/aarch64-cost-tables.h | 107 ++++++++++++++++++ gcc/config/aarch64/aarch64-tune.md | 2 +- gcc/config/aarch64/aarch64.cc | 1 + gcc/config/aarch64/tuning_models/ampere1b.h | 114 ++++++++++++++++++++ gcc/doc/invoke.texi | 2 +- 6 files changed, 225 insertions(+), 2 deletions(-) create mode 100644 gcc/config/aarch64/tuning_models/ampere1b.h