
[1/6] AArch64: Refactor costs models to different files.

Message ID: patch-17815-tamar@arm.com
State: New
Series: [1/6] AArch64: Refactor costs models to different files.

Commit Message

Tamar Christina Nov. 15, 2023, 5:06 p.m. UTC
Hi All,

This patch series attempts to move the generic cost model in AArch64 to a new
and more modern standard.  The current standard is quite old and generates very
suboptimal code out of the box for users of GCC.

The goal is for the new cost model to be beneficial on newer/current Arm
microarchitectures while not regressing too much on older ones.

It does not change any core-specific optimizations.  The final changes reflect
both performance and code-size optimizations.

This first patch just re-organizes the cost structures into their own files.
The aarch64.cc file has grown very large and is hard to follow.

No functional changes are expected from this change.  Note that since all the
structures have private visibility, I've put them in header files instead.
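
As an illustration of the intended layout, here is a sketch of one of the new
headers.  The include-guard name and the exact placement of the includes in
aarch64.cc are assumptions based on the usual GCC conventions; the table body
is simply the generic_addrcost_table being moved out of aarch64.cc.

/* config/aarch64/tuning_models/generic.h (sketch; guard name assumed).  */
#ifndef GCC_AARCH64_H_GENERIC
#define GCC_AARCH64_H_GENERIC

static const struct cpu_addrcost_table generic_addrcost_table =
{
    {
      1, /* hi  */
      0, /* si  */
      0, /* di  */
      1, /* ti  */
    },
  0, /* pre_modify  */
  0, /* post_modify  */
  0, /* post_modify_ld3_st3  */
  0, /* post_modify_ld4_st4  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
  0 /* imm_offset  */
};

/* ... remaining generic tables (regmove, vector, branch, tunings) ...  */

#endif /* GCC_AARCH64_H_GENERIC.  */

/* In aarch64.cc, once the shared cost/tuning types are in scope:  */
#include "tuning_models/generic.h"
#include "tuning_models/neoversen1.h"
/* ... one include per tuning model listed in the ChangeLog ...  */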

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR target/111370
	* config/aarch64/aarch64.cc (generic_addrcost_table,
	exynosm1_addrcost_table,
	xgene1_addrcost_table,
	thunderx2t99_addrcost_table,
	thunderx3t110_addrcost_table,
	tsv110_addrcost_table,
	qdf24xx_addrcost_table,
	a64fx_addrcost_table,
	neoversev1_addrcost_table,
	neoversen2_addrcost_table,
	neoversev2_addrcost_table,
	generic_regmove_cost,
	cortexa57_regmove_cost,
	cortexa53_regmove_cost,
	exynosm1_regmove_cost,
	thunderx_regmove_cost,
	xgene1_regmove_cost,
	qdf24xx_regmove_cost,
	thunderx2t99_regmove_cost,
	thunderx3t110_regmove_cost,
	tsv110_regmove_cost,
	a64fx_regmove_cost,
	neoversen2_regmove_cost,
	neoversev1_regmove_cost,
	neoversev2_regmove_cost,
	generic_vector_cost,
	a64fx_vector_cost,
	qdf24xx_vector_cost,
	thunderx_vector_cost,
	tsv110_vector_cost,
	cortexa57_vector_cost,
	exynosm1_vector_cost,
	xgene1_vector_cost,
	thunderx2t99_vector_cost,
	thunderx3t110_vector_cost,
	ampere1_vector_cost,
	generic_branch_cost,
	generic_tunings,
	cortexa35_tunings,
	cortexa53_tunings,
	cortexa57_tunings,
	cortexa72_tunings,
	cortexa73_tunings,
	exynosm1_tunings,
	thunderxt88_tunings,
	thunderx_tunings,
	tsv110_tunings,
	xgene1_tunings,
	emag_tunings,
	qdf24xx_tunings,
	saphira_tunings,
	thunderx2t99_tunings,
	thunderx3t110_tunings,
	neoversen1_tunings,
	ampere1_tunings,
	ampere1a_tunings,
	neoversev1_vector_cost,
	neoversev1_tunings,
	neoverse512tvb_vector_cost,
	neoverse512tvb_tunings,
	neoversen2_vector_cost,
	neoversen2_tunings,
	neoversev2_vector_cost,
	neoversev2_tunings,
	a64fx_tunings): Split into own files.
	* config/aarch64/tuning_models/a64fx.h: New file.
	* config/aarch64/tuning_models/ampere1.h: New file.
	* config/aarch64/tuning_models/ampere1a.h: New file.
	* config/aarch64/tuning_models/cortexa35.h: New file.
	* config/aarch64/tuning_models/cortexa53.h: New file.
	* config/aarch64/tuning_models/cortexa57.h: New file.
	* config/aarch64/tuning_models/cortexa72.h: New file.
	* config/aarch64/tuning_models/cortexa73.h: New file.
	* config/aarch64/tuning_models/emag.h: New file.
	* config/aarch64/tuning_models/exynosm1.h: New file.
	* config/aarch64/tuning_models/generic.h: New file.
	* config/aarch64/tuning_models/neoverse512tvb.h: New file.
	* config/aarch64/tuning_models/neoversen1.h: New file.
	* config/aarch64/tuning_models/neoversen2.h: New file.
	* config/aarch64/tuning_models/neoversev1.h: New file.
	* config/aarch64/tuning_models/neoversev2.h: New file.
	* config/aarch64/tuning_models/qdf24xx.h: New file.
	* config/aarch64/tuning_models/saphira.h: New file.
	* config/aarch64/tuning_models/thunderx.h: New file.
	* config/aarch64/tuning_models/thunderx2t99.h: New file.
	* config/aarch64/tuning_models/thunderx3t110.h: New file.
	* config/aarch64/tuning_models/thunderxt88.h: New file.
	* config/aarch64/tuning_models/tsv110.h: New file.
	* config/aarch64/tuning_models/xgene1.h: New file.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9fbfc548a891f5d11940c6fd3c49a14bfbdec886..07b1cde39209f5c7740e336b499e9aed31e4c515 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -354,2405 +354,30 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 };
 
 /* Tuning parameters.  */
-
-static const struct cpu_addrcost_table generic_addrcost_table =
-{
-    {
-      1, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      1, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* post_modify_ld3_st3  */
-  0, /* post_modify_ld4_st4  */
-  0, /* register_offset  */
-  0, /* register_sextend  */
-  0, /* register_zextend  */
-  0 /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table exynosm1_addrcost_table =
-{
-    {
-      0, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      2, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* post_modify_ld3_st3  */
-  0, /* post_modify_ld4_st4  */
-  1, /* register_offset  */
-  1, /* register_sextend  */
-  2, /* register_zextend  */
-  0, /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table xgene1_addrcost_table =
-{
-    {
-      1, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      1, /* ti  */
-    },
-  1, /* pre_modify  */
-  1, /* post_modify  */
-  1, /* post_modify_ld3_st3  */
-  1, /* post_modify_ld4_st4  */
-  0, /* register_offset  */
-  1, /* register_sextend  */
-  1, /* register_zextend  */
-  0, /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
-{
-    {
-      1, /* hi  */
-      1, /* si  */
-      1, /* di  */
-      2, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* post_modify_ld3_st3  */
-  0, /* post_modify_ld4_st4  */
-  2, /* register_offset  */
-  3, /* register_sextend  */
-  3, /* register_zextend  */
-  0, /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
-{
-    {
-      1, /* hi  */
-      1, /* si  */
-      1, /* di  */
-      2, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* post_modify_ld3_st3  */
-  0, /* post_modify_ld4_st4  */
-  2, /* register_offset  */
-  3, /* register_sextend  */
-  3, /* register_zextend  */
-  0, /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table tsv110_addrcost_table =
-{
-    {
-      1, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      1, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* post_modify_ld3_st3  */
-  0, /* post_modify_ld4_st4  */
-  0, /* register_offset  */
-  1, /* register_sextend  */
-  1, /* register_zextend  */
-  0, /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table qdf24xx_addrcost_table =
-{
-    {
-      1, /* hi  */
-      1, /* si  */
-      1, /* di  */
-      2, /* ti  */
-    },
-  1, /* pre_modify  */
-  1, /* post_modify  */
-  1, /* post_modify_ld3_st3  */
-  1, /* post_modify_ld4_st4  */
-  3, /* register_offset  */
-  3, /* register_sextend  */
-  3, /* register_zextend  */
-  2, /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table a64fx_addrcost_table =
-{
-    {
-      1, /* hi  */
-      1, /* si  */
-      1, /* di  */
-      2, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* post_modify_ld3_st3  */
-  0, /* post_modify_ld4_st4  */
-  2, /* register_offset  */
-  3, /* register_sextend  */
-  3, /* register_zextend  */
-  0, /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table neoversev1_addrcost_table =
-{
-    {
-      1, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      1, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  3, /* post_modify_ld3_st3  */
-  3, /* post_modify_ld4_st4  */
-  0, /* register_offset  */
-  0, /* register_sextend  */
-  0, /* register_zextend  */
-  0 /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table neoversen2_addrcost_table =
-{
-    {
-      1, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      1, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  2, /* post_modify_ld3_st3  */
-  2, /* post_modify_ld4_st4  */
-  0, /* register_offset  */
-  0, /* register_sextend  */
-  0, /* register_zextend  */
-  0 /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table neoversev2_addrcost_table =
-{
-    {
-      1, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      1, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  2, /* post_modify_ld3_st3  */
-  2, /* post_modify_ld4_st4  */
-  0, /* register_offset  */
-  0, /* register_sextend  */
-  0, /* register_zextend  */
-  0 /* imm_offset  */
-};
-
-static const struct cpu_regmove_cost generic_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost.  */
-  5, /* GP2FP  */
-  5, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost cortexa57_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost.  */
-  5, /* GP2FP  */
-  5, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost cortexa53_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost.  */
-  5, /* GP2FP  */
-  5, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost exynosm1_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost (actual, 4 and 9).  */
-  9, /* GP2FP  */
-  9, /* FP2GP  */
-  1 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost thunderx_regmove_cost =
-{
-  2, /* GP2GP  */
-  2, /* GP2FP  */
-  6, /* FP2GP  */
-  4 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost xgene1_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost.  */
-  8, /* GP2FP  */
-  8, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost qdf24xx_regmove_cost =
-{
-  2, /* GP2GP  */
-  /* Avoid the use of int<->fp moves for spilling.  */
-  6, /* GP2FP  */
-  6, /* FP2GP  */
-  4 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of int<->fp moves for spilling.  */
-  5, /* GP2FP  */
-  6, /* FP2GP  */
-  3, /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of int<->fp moves for spilling.  */
-  4, /* GP2FP  */
-  5, /* FP2GP  */
-  4  /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost tsv110_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost.  */
-  2, /* GP2FP  */
-  3, /* FP2GP  */
-  2  /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost a64fx_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost.  */
-  5, /* GP2FP  */
-  7, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost neoversen2_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Spilling to int<->fp instead of memory is recommended so set
-     realistic costs compared to memmov_cost.  */
-  3, /* GP2FP  */
-  2, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost neoversev1_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Spilling to int<->fp instead of memory is recommended so set
-     realistic costs compared to memmov_cost.  */
-  3, /* GP2FP  */
-  2, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost neoversev2_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Spilling to int<->fp instead of memory is recommended so set
-     realistic costs compared to memmov_cost.  */
-  3, /* GP2FP  */
-  2, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-/* Generic costs for Advanced SIMD vector operations.   */
-static const advsimd_vec_cost generic_advsimd_vector_cost =
-{
-  1, /* int_stmt_cost  */
-  1, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  2, /* permute_cost  */
-  2, /* reduc_i8_cost  */
-  2, /* reduc_i16_cost  */
-  2, /* reduc_i32_cost  */
-  2, /* reduc_i64_cost  */
-  2, /* reduc_f16_cost  */
-  2, /* reduc_f32_cost  */
-  2, /* reduc_f64_cost  */
-  2, /* store_elt_extra_cost  */
-  2, /* vec_to_scalar_cost  */
-  1, /* scalar_to_vec_cost  */
-  1, /* align_load_cost  */
-  1, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-/* Generic costs for SVE vector operations.  */
-static const sve_vec_cost generic_sve_vector_cost =
-{
-  {
-    1, /* int_stmt_cost  */
-    1, /* fp_stmt_cost  */
-    0, /* ld2_st2_permute_cost  */
-    0, /* ld3_st3_permute_cost  */
-    0, /* ld4_st4_permute_cost  */
-    2, /* permute_cost  */
-    2, /* reduc_i8_cost  */
-    2, /* reduc_i16_cost  */
-    2, /* reduc_i32_cost  */
-    2, /* reduc_i64_cost  */
-    2, /* reduc_f16_cost  */
-    2, /* reduc_f32_cost  */
-    2, /* reduc_f64_cost  */
-    2, /* store_elt_extra_cost  */
-    2, /* vec_to_scalar_cost  */
-    1, /* scalar_to_vec_cost  */
-    1, /* align_load_cost  */
-    1, /* unalign_load_cost  */
-    1, /* unalign_store_cost  */
-    1  /* store_cost  */
-  },
-  2, /* clast_cost  */
-  2, /* fadda_f16_cost  */
-  2, /* fadda_f32_cost  */
-  2, /* fadda_f64_cost  */
-  4, /* gather_load_x32_cost  */
-  2, /* gather_load_x64_cost  */
-  1 /* scatter_store_elt_cost  */
-};
-
-/* Generic costs for vector insn classes.  */
-static const struct cpu_vector_cost generic_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  1, /* scalar_fp_stmt_cost  */
-  1, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  3, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &generic_advsimd_vector_cost, /* advsimd  */
-  &generic_sve_vector_cost, /* sve */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost a64fx_advsimd_vector_cost =
-{
-  2, /* int_stmt_cost  */
-  5, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
-  13, /* reduc_i8_cost  */
-  13, /* reduc_i16_cost  */
-  13, /* reduc_i32_cost  */
-  13, /* reduc_i64_cost  */
-  13, /* reduc_f16_cost  */
-  13, /* reduc_f32_cost  */
-  13, /* reduc_f64_cost  */
-  13, /* store_elt_extra_cost  */
-  13, /* vec_to_scalar_cost  */
-  4, /* scalar_to_vec_cost  */
-  6, /* align_load_cost  */
-  6, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-static const sve_vec_cost a64fx_sve_vector_cost =
-{
-  {
-    2, /* int_stmt_cost  */
-    5, /* fp_stmt_cost  */
-    0, /* ld2_st2_permute_cost  */
-    0, /* ld3_st3_permute_cost  */
-    0, /* ld4_st4_permute_cost  */
-    3, /* permute_cost  */
-    13, /* reduc_i8_cost  */
-    13, /* reduc_i16_cost  */
-    13, /* reduc_i32_cost  */
-    13, /* reduc_i64_cost  */
-    13, /* reduc_f16_cost  */
-    13, /* reduc_f32_cost  */
-    13, /* reduc_f64_cost  */
-    13, /* store_elt_extra_cost  */
-    13, /* vec_to_scalar_cost  */
-    4, /* scalar_to_vec_cost  */
-    6, /* align_load_cost  */
-    6, /* unalign_load_cost  */
-    1, /* unalign_store_cost  */
-    1  /* store_cost  */
-  },
-  13, /* clast_cost  */
-  13, /* fadda_f16_cost  */
-  13, /* fadda_f32_cost  */
-  13, /* fadda_f64_cost  */
-  64, /* gather_load_x32_cost  */
-  32, /* gather_load_x64_cost  */
-  1 /* scatter_store_elt_cost  */
-};
-
-static const struct cpu_vector_cost a64fx_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  5, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  3, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &a64fx_advsimd_vector_cost, /* advsimd  */
-  &a64fx_sve_vector_cost, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
-{
-  1, /* int_stmt_cost  */
-  3, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  2, /* permute_cost  */
-  1, /* reduc_i8_cost  */
-  1, /* reduc_i16_cost  */
-  1, /* reduc_i32_cost  */
-  1, /* reduc_i64_cost  */
-  1, /* reduc_f16_cost  */
-  1, /* reduc_f32_cost  */
-  1, /* reduc_f64_cost  */
-  1, /* store_elt_extra_cost  */
-  1, /* vec_to_scalar_cost  */
-  1, /* scalar_to_vec_cost  */
-  1, /* align_load_cost  */
-  1, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-/* QDF24XX costs for vector insn classes.  */
-static const struct cpu_vector_cost qdf24xx_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  1, /* scalar_fp_stmt_cost  */
-  1, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  3, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &qdf24xx_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-
-static const advsimd_vec_cost thunderx_advsimd_vector_cost =
-{
-  4, /* int_stmt_cost  */
-  1, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  4, /* permute_cost  */
-  2, /* reduc_i8_cost  */
-  2, /* reduc_i16_cost  */
-  2, /* reduc_i32_cost  */
-  2, /* reduc_i64_cost  */
-  2, /* reduc_f16_cost  */
-  2, /* reduc_f32_cost  */
-  2, /* reduc_f64_cost  */
-  2, /* store_elt_extra_cost  */
-  2, /* vec_to_scalar_cost  */
-  2, /* scalar_to_vec_cost  */
-  3, /* align_load_cost  */
-  5, /* unalign_load_cost  */
-  5, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-/* ThunderX costs for vector insn classes.  */
-static const struct cpu_vector_cost thunderx_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  1, /* scalar_fp_stmt_cost  */
-  3, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  3, /* cond_taken_branch_cost  */
-  3, /* cond_not_taken_branch_cost  */
-  &thunderx_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost tsv110_advsimd_vector_cost =
-{
-  2, /* int_stmt_cost  */
-  2, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  2, /* permute_cost  */
-  3, /* reduc_i8_cost  */
-  3, /* reduc_i16_cost  */
-  3, /* reduc_i32_cost  */
-  3, /* reduc_i64_cost  */
-  3, /* reduc_f16_cost  */
-  3, /* reduc_f32_cost  */
-  3, /* reduc_f64_cost  */
-  3, /* store_elt_extra_cost  */
-  3, /* vec_to_scalar_cost  */
-  2, /* scalar_to_vec_cost  */
-  5, /* align_load_cost  */
-  5, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-static const struct cpu_vector_cost tsv110_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  1, /* scalar_fp_stmt_cost  */
-  5, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &tsv110_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
-{
-  2, /* int_stmt_cost  */
-  2, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
-  8, /* reduc_i8_cost  */
-  8, /* reduc_i16_cost  */
-  8, /* reduc_i32_cost  */
-  8, /* reduc_i64_cost  */
-  8, /* reduc_f16_cost  */
-  8, /* reduc_f32_cost  */
-  8, /* reduc_f64_cost  */
-  8, /* store_elt_extra_cost  */
-  8, /* vec_to_scalar_cost  */
-  8, /* scalar_to_vec_cost  */
-  4, /* align_load_cost  */
-  4, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-/* Cortex-A57 costs for vector insn classes.  */
-static const struct cpu_vector_cost cortexa57_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  1, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &cortexa57_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
-{
-  3, /* int_stmt_cost  */
-  3, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
-  3, /* reduc_i8_cost  */
-  3, /* reduc_i16_cost  */
-  3, /* reduc_i32_cost  */
-  3, /* reduc_i64_cost  */
-  3, /* reduc_f16_cost  */
-  3, /* reduc_f32_cost  */
-  3, /* reduc_f64_cost  */
-  3, /* store_elt_extra_cost  */
-  3, /* vec_to_scalar_cost  */
-  3, /* scalar_to_vec_cost  */
-  5, /* align_load_cost  */
-  5, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-static const struct cpu_vector_cost exynosm1_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  1, /* scalar_fp_stmt_cost  */
-  5, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &exynosm1_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost xgene1_advsimd_vector_cost =
-{
-  2, /* int_stmt_cost  */
-  2, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  2, /* permute_cost  */
-  4, /* reduc_i8_cost  */
-  4, /* reduc_i16_cost  */
-  4, /* reduc_i32_cost  */
-  4, /* reduc_i64_cost  */
-  4, /* reduc_f16_cost  */
-  4, /* reduc_f32_cost  */
-  4, /* reduc_f64_cost  */
-  4, /* store_elt_extra_cost  */
-  4, /* vec_to_scalar_cost  */
-  4, /* scalar_to_vec_cost  */
-  10, /* align_load_cost  */
-  10, /* unalign_load_cost  */
-  2, /* unalign_store_cost  */
-  2  /* store_cost  */
-};
-
-/* Generic costs for vector insn classes.  */
-static const struct cpu_vector_cost xgene1_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  1, /* scalar_fp_stmt_cost  */
-  5, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  2, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &xgene1_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
-{
-  4, /* int_stmt_cost  */
-  5, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  10, /* permute_cost  */
-  6, /* reduc_i8_cost  */
-  6, /* reduc_i16_cost  */
-  6, /* reduc_i32_cost  */
-  6, /* reduc_i64_cost  */
-  6, /* reduc_f16_cost  */
-  6, /* reduc_f32_cost  */
-  6, /* reduc_f64_cost  */
-  6, /* store_elt_extra_cost  */
-  6, /* vec_to_scalar_cost  */
-  5, /* scalar_to_vec_cost  */
-  4, /* align_load_cost  */
-  4, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-/* Costs for vector insn classes for Vulcan.  */
-static const struct cpu_vector_cost thunderx2t99_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  6, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  2, /* cond_taken_branch_cost  */
-  1,  /* cond_not_taken_branch_cost  */
-  &thunderx2t99_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
-{
-  5, /* int_stmt_cost  */
-  5, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  10, /* permute_cost  */
-  5, /* reduc_i8_cost  */
-  5, /* reduc_i16_cost  */
-  5, /* reduc_i32_cost  */
-  5, /* reduc_i64_cost  */
-  5, /* reduc_f16_cost  */
-  5, /* reduc_f32_cost  */
-  5, /* reduc_f64_cost  */
-  5, /* store_elt_extra_cost  */
-  5, /* vec_to_scalar_cost  */
-  5, /* scalar_to_vec_cost  */
-  4, /* align_load_cost  */
-  4, /* unalign_load_cost  */
-  4, /* unalign_store_cost  */
-  4  /* store_cost  */
-};
-
-static const struct cpu_vector_cost thunderx3t110_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  5, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  2, /* cond_taken_branch_cost  */
-  1,  /* cond_not_taken_branch_cost  */
-  &thunderx3t110_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost ampere1_advsimd_vector_cost =
-{
-  1, /* int_stmt_cost  */
-  3, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  2, /* permute_cost  */
-  12, /* reduc_i8_cost  */
-  9, /* reduc_i16_cost  */
-  6, /* reduc_i32_cost  */
-  5, /* reduc_i64_cost  */
-  9, /* reduc_f16_cost  */
-  6, /* reduc_f32_cost  */
-  5, /* reduc_f64_cost  */
-  8, /* store_elt_extra_cost  */
-  6, /* vec_to_scalar_cost  */
-  7, /* scalar_to_vec_cost  */
-  4, /* align_load_cost  */
-  4, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-/* Ampere-1 costs for vector insn classes.  */
-static const struct cpu_vector_cost ampere1_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  3, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &ampere1_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr  /* issue_info  */
-};
-
-/* Generic costs for branch instructions.  */
-static const struct cpu_branch_cost generic_branch_cost =
-{
-  1,  /* Predictable.  */
-  3   /* Unpredictable.  */
-};
-
-/* Generic approximation modes.  */
-static const cpu_approx_modes generic_approx_modes =
-{
-  AARCH64_APPROX_NONE,	/* division  */
-  AARCH64_APPROX_NONE,	/* sqrt  */
-  AARCH64_APPROX_NONE	/* recip_sqrt  */
-};
-
-/* Approximation modes for Exynos M1.  */
-static const cpu_approx_modes exynosm1_approx_modes =
-{
-  AARCH64_APPROX_NONE,	/* division  */
-  AARCH64_APPROX_ALL,	/* sqrt  */
-  AARCH64_APPROX_ALL	/* recip_sqrt  */
-};
-
-/* Approximation modes for X-Gene 1.  */
-static const cpu_approx_modes xgene1_approx_modes =
-{
-  AARCH64_APPROX_NONE,	/* division  */
-  AARCH64_APPROX_NONE,	/* sqrt  */
-  AARCH64_APPROX_ALL	/* recip_sqrt  */
-};
-
-/* Generic prefetch settings (which disable prefetch).  */
-static const cpu_prefetch_tune generic_prefetch_tune =
-{
-  0,			/* num_slots  */
-  -1,			/* l1_cache_size  */
-  -1,			/* l1_cache_line_size  */
-  -1,			/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune exynosm1_prefetch_tune =
-{
-  0,			/* num_slots  */
-  -1,			/* l1_cache_size  */
-  64,			/* l1_cache_line_size  */
-  -1,			/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune qdf24xx_prefetch_tune =
-{
-  4,			/* num_slots  */
-  32,			/* l1_cache_size  */
-  64,			/* l1_cache_line_size  */
-  512,			/* l2_cache_size  */
-  false,		/* prefetch_dynamic_strides */
-  2048,			/* minimum_stride */
-  3			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune thunderxt88_prefetch_tune =
-{
-  8,			/* num_slots  */
-  32,			/* l1_cache_size  */
-  128,			/* l1_cache_line_size  */
-  16*1024,		/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  3			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune thunderx_prefetch_tune =
-{
-  8,			/* num_slots  */
-  32,			/* l1_cache_size  */
-  128,			/* l1_cache_line_size  */
-  -1,			/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
-{
-  8,			/* num_slots  */
-  32,			/* l1_cache_size  */
-  64,			/* l1_cache_line_size  */
-  256,			/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
-{
-  8,			/* num_slots  */
-  32,			/* l1_cache_size  */
-  64,			/* l1_cache_line_size  */
-  256,			/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune tsv110_prefetch_tune =
-{
-  0,                    /* num_slots  */
-  64,                   /* l1_cache_size  */
-  64,                   /* l1_cache_line_size  */
-  512,                  /* l2_cache_size  */
-  true,                 /* prefetch_dynamic_strides */
-  -1,                   /* minimum_stride */
-  -1                    /* default_opt_level  */
-};
-
-static const cpu_prefetch_tune xgene1_prefetch_tune =
-{
-  8,			/* num_slots  */
-  32,			/* l1_cache_size  */
-  64,			/* l1_cache_line_size  */
-  256,			/* l2_cache_size  */
-  true,                 /* prefetch_dynamic_strides */
-  -1,                   /* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune a64fx_prefetch_tune =
-{
-  8,			/* num_slots  */
-  64,			/* l1_cache_size  */
-  256,			/* l1_cache_line_size  */
-  32768,		/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune ampere1_prefetch_tune =
-{
-  0,			/* num_slots  */
-  64,			/* l1_cache_size  */
-  64,			/* l1_cache_line_size  */
-  2048,			/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const struct tune_params generic_tunings =
-{
-  &cortexa57_extra_costs,
-  &generic_addrcost_table,
-  &generic_regmove_cost,
-  &generic_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  2, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "16:12",	/* function_align.  */
-  "4",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
-     Neoverse V1.  It does not have a noticeable effect on A64FX and should
-     have at most a very minor effect on SVE2 cores.  */
-  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params cortexa35_tunings =
-{
-  &cortexa53_extra_costs,
-  &generic_addrcost_table,
-  &cortexa53_regmove_cost,
-  &generic_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  1, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
-  "16",	/* function_align.  */
-  "4",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params cortexa53_tunings =
-{
-  &cortexa53_extra_costs,
-  &generic_addrcost_table,
-  &cortexa53_regmove_cost,
-  &generic_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  2, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
-  "16",	/* function_align.  */
-  "4",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params cortexa57_tunings =
-{
-  &cortexa57_extra_costs,
-  &generic_addrcost_table,
-  &cortexa57_regmove_cost,
-  &cortexa57_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  3, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
-  "16",	/* function_align.  */
-  "4",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params cortexa72_tunings =
-{
-  &cortexa57_extra_costs,
-  &generic_addrcost_table,
-  &cortexa57_regmove_cost,
-  &cortexa57_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  3, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
-  "16",	/* function_align.  */
-  "4",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params cortexa73_tunings =
-{
-  &cortexa57_extra_costs,
-  &generic_addrcost_table,
-  &cortexa57_regmove_cost,
-  &cortexa57_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  2, /* issue_rate.  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
-  "16",	/* function_align.  */
-  "4",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params exynosm1_tunings =
-{
-  &exynosm1_extra_costs,
-  &exynosm1_addrcost_table,
-  &exynosm1_regmove_cost,
-  &exynosm1_vector_cost,
-  &generic_branch_cost,
-  &exynosm1_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  3,	/* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
-  "4",	/* function_align.  */
-  "4",	/* jump_align.  */
-  "4",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  48,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
-  &exynosm1_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params thunderxt88_tunings =
-{
-  &thunderx_extra_costs,
-  &generic_addrcost_table,
-  &thunderx_regmove_cost,
-  &thunderx_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 6, /* load_int.  */
-    6, /* store_int.  */
-    6, /* load_fp.  */
-    6, /* store_fp.  */
-    6, /* load_pred.  */
-    6 /* store_pred.  */
-  }, /* memmov_cost.  */
-  2, /* issue_rate  */
-  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
-  "8",	/* function_align.  */
-  "8",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &thunderxt88_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
-};
-
-static const struct tune_params thunderx_tunings =
-{
-  &thunderx_extra_costs,
-  &generic_addrcost_table,
-  &thunderx_regmove_cost,
-  &thunderx_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 6, /* load_int.  */
-    6, /* store_int.  */
-    6, /* load_fp.  */
-    6, /* store_fp.  */
-    6, /* load_pred.  */
-    6 /* store_pred.  */
-  }, /* memmov_cost.  */
-  2, /* issue_rate  */
-  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
-  "8",	/* function_align.  */
-  "8",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
-  &thunderx_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
-};
-
-static const struct tune_params tsv110_tunings =
-{
-  &tsv110_extra_costs,
-  &tsv110_addrcost_table,
-  &tsv110_regmove_cost,
-  &tsv110_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4,    /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
-   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
-  "16", /* function_align.  */
-  "4",  /* jump_align.  */
-  "8",  /* loop_align.  */
-  2,    /* int_reassoc_width.  */
-  4,    /* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,    /* vec_reassoc_width.  */
-  2,    /* min_div_recip_mul_sf.  */
-  2,    /* min_div_recip_mul_df.  */
-  0,    /* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
-  &tsv110_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params xgene1_tunings =
-{
-  &xgene1_extra_costs,
-  &xgene1_addrcost_table,
-  &xgene1_regmove_cost,
-  &xgene1_vector_cost,
-  &generic_branch_cost,
-  &xgene1_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 6, /* load_int.  */
-    6, /* store_int.  */
-    6, /* load_fp.  */
-    6, /* store_fp.  */
-    6, /* load_pred.  */
-    6 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4, /* issue_rate  */
-  AARCH64_FUSE_NOTHING, /* fusible_ops  */
-  "16",	/* function_align.  */
-  "16",	/* jump_align.  */
-  "16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  17,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
-  &xgene1_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params emag_tunings =
-{
-  &xgene1_extra_costs,
-  &xgene1_addrcost_table,
-  &xgene1_regmove_cost,
-  &xgene1_vector_cost,
-  &generic_branch_cost,
-  &xgene1_approx_modes,
-  SVE_NOT_IMPLEMENTED,
-  { 6, /* load_int.  */
-    6, /* store_int.  */
-    6, /* load_fp.  */
-    6, /* store_fp.  */
-    6, /* load_pred.  */
-    6 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4, /* issue_rate  */
-  AARCH64_FUSE_NOTHING, /* fusible_ops  */
-  "16",	/* function_align.  */
-  "16",	/* jump_align.  */
-  "16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  17,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
-  &xgene1_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params qdf24xx_tunings =
-{
-  &qdf24xx_extra_costs,
-  &qdf24xx_addrcost_table,
-  &qdf24xx_regmove_cost,
-  &qdf24xx_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4, /* issue_rate  */
-  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
-  "16",	/* function_align.  */
-  "8",	/* jump_align.  */
-  "16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
-  &qdf24xx_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
-   for now.  */
-static const struct tune_params saphira_tunings =
-{
-  &generic_extra_costs,
-  &generic_addrcost_table,
-  &generic_regmove_cost,
-  &generic_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4, /* issue_rate  */
-  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
-  "16",	/* function_align.  */
-  "8",	/* jump_align.  */
-  "16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params thunderx2t99_tunings =
-{
-  &thunderx2t99_extra_costs,
-  &thunderx2t99_addrcost_table,
-  &thunderx2t99_regmove_cost,
-  &thunderx2t99_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4, /* issue_rate.  */
-  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
-   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
-  "16",	/* function_align.  */
-  "8",	/* jump_align.  */
-  "16",	/* loop_align.  */
-  3,	/* int_reassoc_width.  */
-  2,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &thunderx2t99_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params thunderx3t110_tunings =
-{
-  &thunderx3t110_extra_costs,
-  &thunderx3t110_addrcost_table,
-  &thunderx3t110_regmove_cost,
-  &thunderx3t110_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  6, /* issue_rate.  */
-  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
-   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
-  "16",	/* function_align.  */
-  "8",	/* jump_align.  */
-  "16",	/* loop_align.  */
-  3,	/* int_reassoc_width.  */
-  2,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &thunderx3t110_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params neoversen1_tunings =
-{
-  &cortexa76_extra_costs,
-  &generic_addrcost_table,
-  &generic_regmove_cost,
-  &cortexa57_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    2, /* store_int.  */
-    5, /* load_fp.  */
-    2, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  3, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32:16",	/* function_align.  */
-  "4",		/* jump_align.  */
-  "32:16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params ampere1_tunings =
-{
-  &ampere1_extra_costs,
-  &generic_addrcost_table,
-  &generic_regmove_cost,
-  &ampere1_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4, /* issue_rate  */
-  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
-   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
-   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
-   AARCH64_FUSE_CMP_BRANCH),
-  /* fusible_ops  */
-  "32",		/* function_align.  */
-  "4",		/* jump_align.  */
-  "32:16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  4,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &ampere1_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
-};
-
-static const struct tune_params ampere1a_tunings =
-{
-  &ampere1a_extra_costs,
-  &generic_addrcost_table,
-  &generic_regmove_cost,
-  &ampere1_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4, /* issue_rate  */
-  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
-   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
-   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
-   AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
-   AARCH64_FUSE_ADDSUB_2REG_CONST1),
-  /* fusible_ops  */
-  "32",		/* function_align.  */
-  "4",		/* jump_align.  */
-  "32:16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &ampere1_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
-};
-
-static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
-{
-  2, /* int_stmt_cost  */
-  2, /* fp_stmt_cost  */
-  4, /* ld2_st2_permute_cost */
-  4, /* ld3_st3_permute_cost  */
-  5, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
-  4, /* reduc_i8_cost  */
-  4, /* reduc_i16_cost  */
-  2, /* reduc_i32_cost  */
-  2, /* reduc_i64_cost  */
-  6, /* reduc_f16_cost  */
-  3, /* reduc_f32_cost  */
-  2, /* reduc_f64_cost  */
-  2, /* store_elt_extra_cost  */
-  /* This value is just inherited from the Cortex-A57 table.  */
-  8, /* vec_to_scalar_cost  */
-  /* This depends very much on what the scalar value is and
-     where it comes from.  E.g. some constants take two dependent
-     instructions or a load, while others might be moved from a GPR.
-     4 seems to be a reasonable compromise in practice.  */
-  4, /* scalar_to_vec_cost  */
-  4, /* align_load_cost  */
-  4, /* unalign_load_cost  */
-  /* Although stores have a latency of 2 and compete for the
-     vector pipes, in practice it's better not to model that.  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-static const sve_vec_cost neoversev1_sve_vector_cost =
-{
-  {
-    2, /* int_stmt_cost  */
-    2, /* fp_stmt_cost  */
-    4, /* ld2_st2_permute_cost  */
-    7, /* ld3_st3_permute_cost  */
-    8, /* ld4_st4_permute_cost  */
-    3, /* permute_cost  */
-    /* Theoretically, a reduction involving 31 scalar ADDs could
-       complete in ~9 cycles and would have a cost of 31.  [SU]ADDV
-       completes in 14 cycles, so give it a cost of 31 + 5.  */
-    36, /* reduc_i8_cost  */
-    /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7.  */
-    22, /* reduc_i16_cost  */
-    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7.  */
-    14, /* reduc_i32_cost  */
-    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8.  */
-    11, /* reduc_i64_cost  */
-    /* Theoretically, a reduction involving 15 scalar FADDs could
-       complete in ~9 cycles and would have a cost of 30.  FADDV
-       completes in 13 cycles, so give it a cost of 30 + 4.  */
-    34, /* reduc_f16_cost  */
-    /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5.  */
-    19, /* reduc_f32_cost  */
-    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5.  */
-    11, /* reduc_f64_cost  */
-    2, /* store_elt_extra_cost  */
-    /* This value is just inherited from the Cortex-A57 table.  */
-    8, /* vec_to_scalar_cost  */
-    /* See the comment above the Advanced SIMD versions.  */
-    4, /* scalar_to_vec_cost  */
-    4, /* align_load_cost  */
-    4, /* unalign_load_cost  */
-    /* Although stores have a latency of 2 and compete for the
-       vector pipes, in practice it's better not to model that.  */
-    1, /* unalign_store_cost  */
-    1  /* store_cost  */
-  },
-  3, /* clast_cost  */
-  19, /* fadda_f16_cost  */
-  11, /* fadda_f32_cost  */
-  8, /* fadda_f64_cost  */
-  32, /* gather_load_x32_cost  */
-  16, /* gather_load_x64_cost  */
-  3 /* scatter_store_elt_cost  */
-};
-
-static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
-{
-  3, /* loads_stores_per_cycle  */
-  2, /* stores_per_cycle  */
-  4, /* general_ops_per_cycle  */
-  0, /* fp_simd_load_general_ops  */
-  1 /* fp_simd_store_general_ops  */
-};
-
-static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
-{
-  {
-    3, /* loads_stores_per_cycle  */
-    2, /* stores_per_cycle  */
-    4, /* general_ops_per_cycle  */
-    0, /* fp_simd_load_general_ops  */
-    1 /* fp_simd_store_general_ops  */
-  },
-  2, /* ld2_st2_general_ops  */
-  2, /* ld3_st3_general_ops  */
-  3 /* ld4_st4_general_ops  */
-};
-
-static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
-{
-  {
-    {
-      2, /* loads_per_cycle  */
-      2, /* stores_per_cycle  */
-      2, /* general_ops_per_cycle  */
-      0, /* fp_simd_load_general_ops  */
-      1 /* fp_simd_store_general_ops  */
-    },
-    2, /* ld2_st2_general_ops  */
-    2, /* ld3_st3_general_ops  */
-    3 /* ld4_st4_general_ops  */
-  },
-  1, /* pred_ops_per_cycle  */
-  2, /* while_pred_ops  */
-  2, /* int_cmp_pred_ops  */
-  1, /* fp_cmp_pred_ops  */
-  1, /* gather_scatter_pair_general_ops  */
-  1 /* gather_scatter_pair_pred_ops  */
-};
-
-static const aarch64_vec_issue_info neoversev1_vec_issue_info =
-{
-  &neoversev1_scalar_issue_info,
-  &neoversev1_advsimd_issue_info,
-  &neoversev1_sve_issue_info
-};
-
-/* Neoverse V1 costs for vector insn classes.  */
-static const struct cpu_vector_cost neoversev1_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  2, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &neoversev1_advsimd_vector_cost, /* advsimd  */
-  &neoversev1_sve_vector_cost, /* sve  */
-  &neoversev1_vec_issue_info /* issue_info  */
-};
-
-static const struct tune_params neoversev1_tunings =
-{
-  &cortexa76_extra_costs,
-  &neoversev1_addrcost_table,
-  &neoversev1_regmove_cost,
-  &neoversev1_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_256, /* sve_width  */
-  { 4, /* load_int.  */
-    2, /* store_int.  */
-    6, /* load_fp.  */
-    2, /* store_fp.  */
-    6, /* load_pred.  */
-    1 /* store_pred.  */
-  }, /* memmov_cost.  */
-  3, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32:16",	/* function_align.  */
-  "4",		/* jump_align.  */
-  "32:16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  4,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
-   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const sve_vec_cost neoverse512tvb_sve_vector_cost =
-{
-  {
-    2, /* int_stmt_cost  */
-    2, /* fp_stmt_cost  */
-    4, /* ld2_st2_permute_cost  */
-    5, /* ld3_st3_permute_cost  */
-    5, /* ld4_st4_permute_cost  */
-    3, /* permute_cost  */
-    /* Theoretically, a reduction involving 15 scalar ADDs could
-       complete in ~5 cycles and would have a cost of 15.  Assume that
-       [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6.  */
-    21, /* reduc_i8_cost  */
-    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
-    13, /* reduc_i16_cost  */
-    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
-    9, /* reduc_i32_cost  */
-    /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7.  */
-    8, /* reduc_i64_cost  */
-    /* Theoretically, a reduction involving 7 scalar FADDs could
-       complete in ~6 cycles and would have a cost of 14.  Assume that
-       FADDV completes in 8 cycles and so give it a cost of 14 + 2.  */
-    16, /* reduc_f16_cost  */
-    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
-    8, /* reduc_f32_cost  */
-    /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2.  */
-    4, /* reduc_f64_cost  */
-    2, /* store_elt_extra_cost  */
-    /* This value is just inherited from the Cortex-A57 table.  */
-    8, /* vec_to_scalar_cost  */
-    /* This depends very much on what the scalar value is and
-       where it comes from.  E.g. some constants take two dependent
-       instructions or a load, while others might be moved from a GPR.
-       4 seems to be a reasonable compromise in practice.  */
-    4, /* scalar_to_vec_cost  */
-    4, /* align_load_cost  */
-    4, /* unalign_load_cost  */
-    /* Although stores generally have a latency of 2 and compete for the
-       vector pipes, in practice it's better not to model that.  */
-    1, /* unalign_store_cost  */
-    1  /* store_cost  */
-  },
-  3, /* clast_cost  */
-  10, /* fadda_f16_cost  */
-  6, /* fadda_f32_cost  */
-  4, /* fadda_f64_cost  */
-  /* A strided Advanced SIMD x64 load would take two parallel FP loads
-     (6 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
-     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
-     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
-     (cost 2) to that, to avoid the difference being lost in rounding.
-
-     There is no easy comparison between a strided Advanced SIMD x32 load
-     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
-     operation more than a 64-bit gather.  */
-  14, /* gather_load_x32_cost  */
-  12, /* gather_load_x64_cost  */
-  3 /* scatter_store_elt_cost  */
-};
-
-static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
-{
-  {
-    {
-      3, /* loads_per_cycle  */
-      2, /* stores_per_cycle  */
-      4, /* general_ops_per_cycle  */
-      0, /* fp_simd_load_general_ops  */
-      1 /* fp_simd_store_general_ops  */
-    },
-    2, /* ld2_st2_general_ops  */
-    2, /* ld3_st3_general_ops  */
-    3 /* ld4_st4_general_ops  */
-  },
-  2, /* pred_ops_per_cycle  */
-  2, /* while_pred_ops  */
-  2, /* int_cmp_pred_ops  */
-  1, /* fp_cmp_pred_ops  */
-  1, /* gather_scatter_pair_general_ops  */
-  1 /* gather_scatter_pair_pred_ops  */
-};
-
-static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
-{
-  &neoversev1_scalar_issue_info,
-  &neoversev1_advsimd_issue_info,
-  &neoverse512tvb_sve_issue_info
-};
-
-static const struct cpu_vector_cost neoverse512tvb_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  2, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &neoversev1_advsimd_vector_cost, /* advsimd  */
-  &neoverse512tvb_sve_vector_cost, /* sve  */
-  &neoverse512tvb_vec_issue_info /* issue_info  */
-};
-
-static const struct tune_params neoverse512tvb_tunings =
-{
-  &cortexa76_extra_costs,
-  &neoversev1_addrcost_table,
-  &neoversev1_regmove_cost,
-  &neoverse512tvb_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_128 | SVE_256, /* sve_width  */
-  { 4, /* load_int.  */
-    2, /* store_int.  */
-    6, /* load_fp.  */
-    2, /* store_fp.  */
-    6, /* load_pred.  */
-    1 /* store_pred.  */
-  }, /* memmov_cost.  */
-  3, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32:16",	/* function_align.  */
-  "4",		/* jump_align.  */
-  "32:16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  4,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
-   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
-};
-
-static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
-{
-  2, /* int_stmt_cost  */
-  2, /* fp_stmt_cost  */
-  2, /* ld2_st2_permute_cost */
-  2, /* ld3_st3_permute_cost  */
-  3, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
-  4, /* reduc_i8_cost  */
-  4, /* reduc_i16_cost  */
-  2, /* reduc_i32_cost  */
-  2, /* reduc_i64_cost  */
-  6, /* reduc_f16_cost  */
-  4, /* reduc_f32_cost  */
-  2, /* reduc_f64_cost  */
-  2, /* store_elt_extra_cost  */
-  /* This value is just inherited from the Cortex-A57 table.  */
-  8, /* vec_to_scalar_cost  */
-  /* This depends very much on what the scalar value is and
-     where it comes from.  E.g. some constants take two dependent
-     instructions or a load, while others might be moved from a GPR.
-     4 seems to be a reasonable compromise in practice.  */
-  4, /* scalar_to_vec_cost  */
-  4, /* align_load_cost  */
-  4, /* unalign_load_cost  */
-  /* Although stores have a latency of 2 and compete for the
-     vector pipes, in practice it's better not to model that.  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-static const sve_vec_cost neoversen2_sve_vector_cost =
-{
-  {
-    2, /* int_stmt_cost  */
-    2, /* fp_stmt_cost  */
-    3, /* ld2_st2_permute_cost  */
-    4, /* ld3_st3_permute_cost  */
-    4, /* ld4_st4_permute_cost  */
-    3, /* permute_cost  */
-    /* Theoretically, a reduction involving 15 scalar ADDs could
-       complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
-       completes in 11 cycles, so give it a cost of 15 + 6.  */
-    21, /* reduc_i8_cost  */
-    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
-    13, /* reduc_i16_cost  */
-    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
-    9, /* reduc_i32_cost  */
-    /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
-    2, /* reduc_i64_cost  */
-    /* Theoretically, a reduction involving 7 scalar FADDs could
-       complete in ~8 cycles and would have a cost of 14.  FADDV
-       completes in 6 cycles, so give it a cost of 14 - 2.  */
-    12, /* reduc_f16_cost  */
-    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0.  */
-    6, /* reduc_f32_cost  */
-    /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0.  */
-    2, /* reduc_f64_cost  */
-    2, /* store_elt_extra_cost  */
-    /* This value is just inherited from the Cortex-A57 table.  */
-    8, /* vec_to_scalar_cost  */
-    /* See the comment above the Advanced SIMD versions.  */
-    4, /* scalar_to_vec_cost  */
-    4, /* align_load_cost  */
-    4, /* unalign_load_cost  */
-    /* Although stores have a latency of 2 and compete for the
-       vector pipes, in practice it's better not to model that.  */
-    1, /* unalign_store_cost  */
-    1  /* store_cost  */
-  },
-  3, /* clast_cost  */
-  10, /* fadda_f16_cost  */
-  6, /* fadda_f32_cost  */
-  4, /* fadda_f64_cost  */
-  /* A strided Advanced SIMD x64 load would take two parallel FP loads
-     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
-     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
-     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
-     (cost 2) to that, to avoid the difference being lost in rounding.
-
-     There is no easy comparison between a strided Advanced SIMD x32 load
-     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
-     operation more than a 64-bit gather.  */
-  14, /* gather_load_x32_cost  */
-  12, /* gather_load_x64_cost  */
-  3 /* scatter_store_elt_cost  */
-};
-
-static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
-{
-  3, /* loads_stores_per_cycle  */
-  2, /* stores_per_cycle  */
-  4, /* general_ops_per_cycle  */
-  0, /* fp_simd_load_general_ops  */
-  1 /* fp_simd_store_general_ops  */
-};
-
-static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
-{
-  {
-    3, /* loads_stores_per_cycle  */
-    2, /* stores_per_cycle  */
-    2, /* general_ops_per_cycle  */
-    0, /* fp_simd_load_general_ops  */
-    1 /* fp_simd_store_general_ops  */
-  },
-  2, /* ld2_st2_general_ops  */
-  2, /* ld3_st3_general_ops  */
-  3 /* ld4_st4_general_ops  */
-};
-
-static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
-{
-  {
-    {
-      3, /* loads_per_cycle  */
-      2, /* stores_per_cycle  */
-      2, /* general_ops_per_cycle  */
-      0, /* fp_simd_load_general_ops  */
-      1 /* fp_simd_store_general_ops  */
-    },
-    2, /* ld2_st2_general_ops  */
-    3, /* ld3_st3_general_ops  */
-    3 /* ld4_st4_general_ops  */
-  },
-  2, /* pred_ops_per_cycle  */
-  2, /* while_pred_ops  */
-  2, /* int_cmp_pred_ops  */
-  1, /* fp_cmp_pred_ops  */
-  1, /* gather_scatter_pair_general_ops  */
-  1 /* gather_scatter_pair_pred_ops  */
-};
-
-static const aarch64_vec_issue_info neoversen2_vec_issue_info =
-{
-  &neoversen2_scalar_issue_info,
-  &neoversen2_advsimd_issue_info,
-  &neoversen2_sve_issue_info
-};
-
-/* Neoverse N2 costs for vector insn classes.  */
-static const struct cpu_vector_cost neoversen2_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  2, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &neoversen2_advsimd_vector_cost, /* advsimd  */
-  &neoversen2_sve_vector_cost, /* sve  */
-  &neoversen2_vec_issue_info /* issue_info  */
-};
-
-static const struct tune_params neoversen2_tunings =
-{
-  &cortexa76_extra_costs,
-  &neoversen2_addrcost_table,
-  &neoversen2_regmove_cost,
-  &neoversen2_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_128, /* sve_width  */
-  { 4, /* load_int.  */
-    1, /* store_int.  */
-    6, /* load_fp.  */
-    2, /* store_fp.  */
-    6, /* load_pred.  */
-    1 /* store_pred.  */
-  }, /* memmov_cost.  */
-  3, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32:16",	/* function_align.  */
-  "4",		/* jump_align.  */
-  "32:16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
-   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
-   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
-};
-
-static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
-{
-  2, /* int_stmt_cost  */
-  2, /* fp_stmt_cost  */
-  2, /* ld2_st2_permute_cost */
-  2, /* ld3_st3_permute_cost  */
-  3, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
-  4, /* reduc_i8_cost  */
-  4, /* reduc_i16_cost  */
-  2, /* reduc_i32_cost  */
-  2, /* reduc_i64_cost  */
-  6, /* reduc_f16_cost  */
-  3, /* reduc_f32_cost  */
-  2, /* reduc_f64_cost  */
-  2, /* store_elt_extra_cost  */
-  /* This value is just inherited from the Cortex-A57 table.  */
-  8, /* vec_to_scalar_cost  */
-  /* This depends very much on what the scalar value is and
-     where it comes from.  E.g. some constants take two dependent
-     instructions or a load, while others might be moved from a GPR.
-     4 seems to be a reasonable compromise in practice.  */
-  4, /* scalar_to_vec_cost  */
-  4, /* align_load_cost  */
-  4, /* unalign_load_cost  */
-  /* Although stores have a latency of 2 and compete for the
-     vector pipes, in practice it's better not to model that.  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-static const sve_vec_cost neoversev2_sve_vector_cost =
-{
-  {
-    2, /* int_stmt_cost  */
-    2, /* fp_stmt_cost  */
-    3, /* ld2_st2_permute_cost  */
-    3, /* ld3_st3_permute_cost  */
-    4, /* ld4_st4_permute_cost  */
-    3, /* permute_cost  */
-    /* Theoretically, a reduction involving 15 scalar ADDs could
-       complete in ~3 cycles and would have a cost of 15.  [SU]ADDV
-       completes in 11 cycles, so give it a cost of 15 + 8.  */
-    21, /* reduc_i8_cost  */
-    /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7.  */
-    14, /* reduc_i16_cost  */
-    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4.  */
-    7, /* reduc_i32_cost  */
-    /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
-    2, /* reduc_i64_cost  */
-    /* Theoretically, a reduction involving 7 scalar FADDs could
-       complete in ~6 cycles and would have a cost of 14.  FADDV
-       completes in 8 cycles, so give it a cost of 14 + 2.  */
-    16, /* reduc_f16_cost  */
-    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
-    8, /* reduc_f32_cost  */
-    /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2.  */
-    4, /* reduc_f64_cost  */
-    2, /* store_elt_extra_cost  */
-    /* This value is just inherited from the Cortex-A57 table.  */
-    8, /* vec_to_scalar_cost  */
-    /* See the comment above the Advanced SIMD versions.  */
-    4, /* scalar_to_vec_cost  */
-    4, /* align_load_cost  */
-    4, /* unalign_load_cost  */
-    /* Although stores have a latency of 2 and compete for the
-       vector pipes, in practice it's better not to model that.  */
-    1, /* unalign_store_cost  */
-    1  /* store_cost  */
-  },
-  3, /* clast_cost  */
-  10, /* fadda_f16_cost  */
-  6, /* fadda_f32_cost  */
-  4, /* fadda_f64_cost  */
-  /* A strided Advanced SIMD x64 load would take two parallel FP loads
-     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
-     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
-     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
-     (cost 2) to that, to avoid the difference being lost in rounding.
-
-     There is no easy comparison between a strided Advanced SIMD x32 load
-     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
-     operation more than a 64-bit gather.  */
-  14, /* gather_load_x32_cost  */
-  12, /* gather_load_x64_cost  */
-  3 /* scatter_store_elt_cost  */
-};
-
-static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
-{
-  3, /* loads_stores_per_cycle  */
-  2, /* stores_per_cycle  */
-  6, /* general_ops_per_cycle  */
-  0, /* fp_simd_load_general_ops  */
-  1 /* fp_simd_store_general_ops  */
-};
-
-static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
-{
-  {
-    3, /* loads_stores_per_cycle  */
-    2, /* stores_per_cycle  */
-    4, /* general_ops_per_cycle  */
-    0, /* fp_simd_load_general_ops  */
-    1 /* fp_simd_store_general_ops  */
-  },
-  2, /* ld2_st2_general_ops  */
-  2, /* ld3_st3_general_ops  */
-  3 /* ld4_st4_general_ops  */
-};
-
-static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
-{
-  {
-    {
-      3, /* loads_per_cycle  */
-      2, /* stores_per_cycle  */
-      4, /* general_ops_per_cycle  */
-      0, /* fp_simd_load_general_ops  */
-      1 /* fp_simd_store_general_ops  */
-    },
-    2, /* ld2_st2_general_ops  */
-    3, /* ld3_st3_general_ops  */
-    3 /* ld4_st4_general_ops  */
-  },
-  2, /* pred_ops_per_cycle  */
-  2, /* while_pred_ops  */
-  2, /* int_cmp_pred_ops  */
-  1, /* fp_cmp_pred_ops  */
-  1, /* gather_scatter_pair_general_ops  */
-  1 /* gather_scatter_pair_pred_ops  */
-};
-
-static const aarch64_vec_issue_info neoversev2_vec_issue_info =
-{
-  &neoversev2_scalar_issue_info,
-  &neoversev2_advsimd_issue_info,
-  &neoversev2_sve_issue_info
-};
-
-/* Demeter costs for vector insn classes.  */
-static const struct cpu_vector_cost neoversev2_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  2, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &neoversev2_advsimd_vector_cost, /* advsimd  */
-  &neoversev2_sve_vector_cost, /* sve  */
-  &neoversev2_vec_issue_info /* issue_info  */
-};
-
-static const struct tune_params neoversev2_tunings =
-{
-  &cortexa76_extra_costs,
-  &neoversev2_addrcost_table,
-  &neoversev2_regmove_cost,
-  &neoversev2_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_128, /* sve_width  */
-  { 4, /* load_int.  */
-    2, /* store_int.  */
-    6, /* load_fp.  */
-    1, /* store_fp.  */
-    6, /* load_pred.  */
-    2 /* store_pred.  */
-  }, /* memmov_cost.  */
-  5, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32:16",	/* function_align.  */
-  "4",		/* jump_align.  */
-  "32:16",	/* loop_align.  */
-  3,	/* int_reassoc_width.  */
-  6,	/* fp_reassoc_width.  */
-  4,	/* fma_reassoc_width.  */
-  3,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
-   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
-   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
-};
-
-static const struct tune_params a64fx_tunings =
-{
-  &a64fx_extra_costs,
-  &a64fx_addrcost_table,
-  &a64fx_regmove_cost,
-  &a64fx_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_512, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  7, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32",	/* function_align.  */
-  "16",	/* jump_align.  */
-  "32",	/* loop_align.  */
-  4,	/* int_reassoc_width.  */
-  2,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &a64fx_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
+#include "tuning_models/generic.h"
+#include "tuning_models/cortexa35.h"
+#include "tuning_models/cortexa53.h"
+#include "tuning_models/cortexa57.h"
+#include "tuning_models/cortexa72.h"
+#include "tuning_models/cortexa73.h"
+#include "tuning_models/exynosm1.h"
+#include "tuning_models/thunderxt88.h"
+#include "tuning_models/thunderx.h"
+#include "tuning_models/tsv110.h"
+#include "tuning_models/xgene1.h"
+#include "tuning_models/emag.h"
+#include "tuning_models/qdf24xx.h"
+#include "tuning_models/saphira.h"
+#include "tuning_models/thunderx2t99.h"
+#include "tuning_models/thunderx3t110.h"
+#include "tuning_models/neoversen1.h"
+#include "tuning_models/ampere1.h"
+#include "tuning_models/ampere1a.h"
+#include "tuning_models/neoversev1.h"
+#include "tuning_models/neoverse512tvb.h"
+#include "tuning_models/neoversen2.h"
+#include "tuning_models/neoversev2.h"
+#include "tuning_models/a64fx.h"
 
 /* Support for fine-grained override of the tuning structures.  */
 struct aarch64_tuning_override_function
diff --git a/gcc/config/aarch64/tuning_models/a64fx.h b/gcc/config/aarch64/tuning_models/a64fx.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b06c27eba1e4de01738bdfdc077460f9135fb41
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/a64fx.h
@@ -0,0 +1,169 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_A64FX
+#define GCC_AARCH64_H_A64FX
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table a64fx_addrcost_table =
+{
+    {
+      1, /* hi  */
+      1, /* si  */
+      1, /* di  */
+      2, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* post_modify_ld3_st3  */
+  0, /* post_modify_ld4_st4  */
+  2, /* register_offset  */
+  3, /* register_sextend  */
+  3, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost a64fx_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  5, /* GP2FP  */
+  7, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost a64fx_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  5, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  13, /* reduc_i8_cost  */
+  13, /* reduc_i16_cost  */
+  13, /* reduc_i32_cost  */
+  13, /* reduc_i64_cost  */
+  13, /* reduc_f16_cost  */
+  13, /* reduc_f32_cost  */
+  13, /* reduc_f64_cost  */
+  13, /* store_elt_extra_cost  */
+  13, /* vec_to_scalar_cost  */
+  4, /* scalar_to_vec_cost  */
+  6, /* align_load_cost  */
+  6, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const sve_vec_cost a64fx_sve_vector_cost =
+{
+  {
+    2, /* int_stmt_cost  */
+    5, /* fp_stmt_cost  */
+    0, /* ld2_st2_permute_cost  */
+    0, /* ld3_st3_permute_cost  */
+    0, /* ld4_st4_permute_cost  */
+    3, /* permute_cost  */
+    13, /* reduc_i8_cost  */
+    13, /* reduc_i16_cost  */
+    13, /* reduc_i32_cost  */
+    13, /* reduc_i64_cost  */
+    13, /* reduc_f16_cost  */
+    13, /* reduc_f32_cost  */
+    13, /* reduc_f64_cost  */
+    13, /* store_elt_extra_cost  */
+    13, /* vec_to_scalar_cost  */
+    4, /* scalar_to_vec_cost  */
+    6, /* align_load_cost  */
+    6, /* unalign_load_cost  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  13, /* clast_cost  */
+  13, /* fadda_f16_cost  */
+  13, /* fadda_f32_cost  */
+  13, /* fadda_f64_cost  */
+  64, /* gather_load_x32_cost  */
+  32, /* gather_load_x64_cost  */
+  1 /* scatter_store_elt_cost  */
+};
+
+static const struct cpu_vector_cost a64fx_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  5, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  3, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &a64fx_advsimd_vector_cost, /* advsimd  */
+  &a64fx_sve_vector_cost, /* sve  */
+  nullptr /* issue_info  */
+};
+
+static const cpu_prefetch_tune a64fx_prefetch_tune =
+{
+  8,			/* num_slots  */
+  64,			/* l1_cache_size  */
+  256,			/* l1_cache_line_size  */
+  32768,		/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params a64fx_tunings =
+{
+  &a64fx_extra_costs,
+  &a64fx_addrcost_table,
+  &a64fx_regmove_cost,
+  &a64fx_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_512, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  7, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "32",	/* function_align.  */
+  "16",	/* jump_align.  */
+  "32",	/* loop_align.  */
+  4,	/* int_reassoc_width.  */
+  2,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &a64fx_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_A64FX.  */
diff --git a/gcc/config/aarch64/tuning_models/ampere1.h b/gcc/config/aarch64/tuning_models/ampere1.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d2a1c696103259f23cf73df26cef9d4fa05ac73
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/ampere1.h
@@ -0,0 +1,113 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_AMPERE1
+#define GCC_AARCH64_H_AMPERE1
+
+#include "generic.h"
+
+static const advsimd_vec_cost ampere1_advsimd_vector_cost =
+{
+  1, /* int_stmt_cost  */
+  3, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  2, /* permute_cost  */
+  12, /* reduc_i8_cost  */
+  9, /* reduc_i16_cost  */
+  6, /* reduc_i32_cost  */
+  5, /* reduc_i64_cost  */
+  9, /* reduc_f16_cost  */
+  6, /* reduc_f32_cost  */
+  5, /* reduc_f64_cost  */
+  8, /* store_elt_extra_cost  */
+  6, /* vec_to_scalar_cost  */
+  7, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+/* Ampere-1 costs for vector insn classes.  */
+static const struct cpu_vector_cost ampere1_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  3, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &ampere1_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr  /* issue_info  */
+};
+
+static const cpu_prefetch_tune ampere1_prefetch_tune =
+{
+  0,			/* num_slots  */
+  64,			/* l1_cache_size  */
+  64,			/* l1_cache_line_size  */
+  2048,			/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params ampere1_tunings =
+{
+  &ampere1_extra_costs,
+  &generic_addrcost_table,
+  &generic_regmove_cost,
+  &ampere1_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
+   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
+   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
+   AARCH64_FUSE_CMP_BRANCH),
+  /* fusible_ops  */
+  "32",		/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  4,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &ampere1_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_AMPERE1.  */
diff --git a/gcc/config/aarch64/tuning_models/ampere1a.h b/gcc/config/aarch64/tuning_models/ampere1a.h
new file mode 100644
index 0000000000000000000000000000000000000000..c419ffb3c1a936a01690ad157c6c71dc645273c8
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/ampere1a.h
@@ -0,0 +1,65 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_AMPERE1A
+#define GCC_AARCH64_H_AMPERE1A
+
+#include "generic.h"
+
+static const struct tune_params ampere1a_tunings =
+{
+  &ampere1a_extra_costs,
+  &generic_addrcost_table,
+  &generic_regmove_cost,
+  &ampere1_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
+   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
+   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
+   AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
+   AARCH64_FUSE_ADDSUB_2REG_CONST1),
+  /* fusible_ops  */
+  "32",		/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &ampere1_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_AMPERE1A.  */
diff --git a/gcc/config/aarch64/tuning_models/cortexa35.h b/gcc/config/aarch64/tuning_models/cortexa35.h
new file mode 100644
index 0000000000000000000000000000000000000000..5534335348db96cc57fc9eccd7ff79a624cb528a
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa35.h
@@ -0,0 +1,62 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_CORTEXA35
+#define GCC_AARCH64_H_CORTEXA35
+
+#include "generic.h"
+#include "cortexa53.h"
+
+static const struct tune_params cortexa35_tunings =
+{
+  &cortexa53_extra_costs,
+  &generic_addrcost_table,
+  &cortexa53_regmove_cost,
+  &generic_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  1, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "4",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_CORTEXA35.  */
diff --git a/gcc/config/aarch64/tuning_models/cortexa53.h b/gcc/config/aarch64/tuning_models/cortexa53.h
new file mode 100644
index 0000000000000000000000000000000000000000..9dfdccc5968e7f062af5c78f153bfe3838263b0a
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa53.h
@@ -0,0 +1,71 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_CORTEXA53
+#define GCC_AARCH64_H_CORTEXA53
+
+#include "generic.h"
+
+static const struct cpu_regmove_cost cortexa53_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  5, /* GP2FP  */
+  5, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const struct tune_params cortexa53_tunings =
+{
+  &cortexa53_extra_costs,
+  &generic_addrcost_table,
+  &cortexa53_regmove_cost,
+  &generic_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  2, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "4",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_CORTEXA53.  */
diff --git a/gcc/config/aarch64/tuning_models/cortexa57.h b/gcc/config/aarch64/tuning_models/cortexa57.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c4789d57833a5879dda8e2fe454ac5f56cb0601
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa57.h
@@ -0,0 +1,109 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_CORTEXA57
+#define GCC_AARCH64_H_CORTEXA57
+
+#include "generic.h"
+
+static const struct cpu_regmove_cost cortexa57_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  5, /* GP2FP  */
+  5, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  8, /* reduc_i8_cost  */
+  8, /* reduc_i16_cost  */
+  8, /* reduc_i32_cost  */
+  8, /* reduc_i64_cost  */
+  8, /* reduc_f16_cost  */
+  8, /* reduc_f32_cost  */
+  8, /* reduc_f64_cost  */
+  8, /* store_elt_extra_cost  */
+  8, /* vec_to_scalar_cost  */
+  8, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+/* Cortex-A57 costs for vector insn classes.  */
+static const struct cpu_vector_cost cortexa57_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &cortexa57_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+static const struct tune_params cortexa57_tunings =
+{
+  &cortexa57_extra_costs,
+  &generic_addrcost_table,
+  &cortexa57_regmove_cost,
+  &cortexa57_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "4",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_CORTEXA57.  */
diff --git a/gcc/config/aarch64/tuning_models/cortexa72.h b/gcc/config/aarch64/tuning_models/cortexa72.h
new file mode 100644
index 0000000000000000000000000000000000000000..968171c9b2e898d7479dbcb462e33fe3905e183d
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa72.h
@@ -0,0 +1,61 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_CORTEXA72
+#define GCC_AARCH64_H_CORTEXA72
+
+#include "generic.h"
+
+static const struct tune_params cortexa72_tunings =
+{
+  &cortexa57_extra_costs,
+  &generic_addrcost_table,
+  &cortexa57_regmove_cost,
+  &cortexa57_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "4",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_CORTEXA72.  */
diff --git a/gcc/config/aarch64/tuning_models/cortexa73.h b/gcc/config/aarch64/tuning_models/cortexa73.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d1a504ddac39604dd193ce0f434fd2f5145c129
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa73.h
@@ -0,0 +1,62 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_CORTEXA73
+#define GCC_AARCH64_H_CORTEXA73
+
+#include "generic.h"
+
+static const struct tune_params cortexa73_tunings =
+{
+  &cortexa57_extra_costs,
+  &generic_addrcost_table,
+  &cortexa57_regmove_cost,
+  &cortexa57_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  2, /* issue_rate.  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "4",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+
+#endif /* GCC_AARCH64_H_CORTEXA73.  */
diff --git a/gcc/config/aarch64/tuning_models/emag.h b/gcc/config/aarch64/tuning_models/emag.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f3402c3fc2a94704eeaf9223ecb0ca1c057cace
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/emag.h
@@ -0,0 +1,60 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_EMAG
+#define GCC_AARCH64_H_EMAG
+
+#include "generic.h"
+
+static const struct tune_params emag_tunings =
+{
+  &xgene1_extra_costs,
+  &xgene1_addrcost_table,
+  &xgene1_regmove_cost,
+  &xgene1_vector_cost,
+  &generic_branch_cost,
+  &xgene1_approx_modes,
+  SVE_NOT_IMPLEMENTED,
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  AARCH64_FUSE_NOTHING, /* fusible_ops  */
+  "16",	/* function_align.  */
+  "16",	/* jump_align.  */
+  "16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  17,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
+  &xgene1_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_EMAG.  */
diff --git a/gcc/config/aarch64/tuning_models/exynosm1.h b/gcc/config/aarch64/tuning_models/exynosm1.h
new file mode 100644
index 0000000000000000000000000000000000000000..a42ea4df97f3f048c41481c304fd3684a69d743b
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/exynosm1.h
@@ -0,0 +1,144 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_EXYNOSM1
+#define GCC_AARCH64_H_EXYNOSM1
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table exynosm1_addrcost_table =
+{
+    {
+      0, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      2, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* post_modify_ld3_st3  */
+  0, /* post_modify_ld4_st4  */
+  1, /* register_offset  */
+  1, /* register_sextend  */
+  2, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost exynosm1_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost (actual, 4 and 9).  */
+  9, /* GP2FP  */
+  9, /* FP2GP  */
+  1 /* FP2FP  */
+};
+
+static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
+{
+  3, /* int_stmt_cost  */
+  3, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  3, /* reduc_i8_cost  */
+  3, /* reduc_i16_cost  */
+  3, /* reduc_i32_cost  */
+  3, /* reduc_i64_cost  */
+  3, /* reduc_f16_cost  */
+  3, /* reduc_f32_cost  */
+  3, /* reduc_f64_cost  */
+  3, /* store_elt_extra_cost  */
+  3, /* vec_to_scalar_cost  */
+  3, /* scalar_to_vec_cost  */
+  5, /* align_load_cost  */
+  5, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const struct cpu_vector_cost exynosm1_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  5, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &exynosm1_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+/* Approximation modes for Exynos M1.  */
+static const cpu_approx_modes exynosm1_approx_modes =
+{
+  AARCH64_APPROX_NONE,	/* division  */
+  AARCH64_APPROX_ALL,	/* sqrt  */
+  AARCH64_APPROX_ALL	/* recip_sqrt  */
+};
+
+static const cpu_prefetch_tune exynosm1_prefetch_tune =
+{
+  0,			/* num_slots  */
+  -1,			/* l1_cache_size  */
+  64,			/* l1_cache_line_size  */
+  -1,			/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params exynosm1_tunings =
+{
+  &exynosm1_extra_costs,
+  &exynosm1_addrcost_table,
+  &exynosm1_regmove_cost,
+  &exynosm1_vector_cost,
+  &generic_branch_cost,
+  &exynosm1_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3,	/* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
+  "4",	/* function_align.  */
+  "4",	/* jump_align.  */
+  "4",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  48,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
+  &exynosm1_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_EXYNOSM1.  */
diff --git a/gcc/config/aarch64/tuning_models/generic.h b/gcc/config/aarch64/tuning_models/generic.h
new file mode 100644
index 0000000000000000000000000000000000000000..deb2c1cffe255bddcb5be571b12086442782da60
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/generic.h
@@ -0,0 +1,190 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+   Contributed by ARM Ltd.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_GENERIC
+#define GCC_AARCH64_H_GENERIC
+
+static const struct cpu_addrcost_table generic_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* post_modify_ld3_st3  */
+  0, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost generic_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  5, /* GP2FP  */
+  5, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+/* Generic costs for Advanced SIMD vector operations.   */
+static const advsimd_vec_cost generic_advsimd_vector_cost =
+{
+  1, /* int_stmt_cost  */
+  1, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  2, /* permute_cost  */
+  2, /* reduc_i8_cost  */
+  2, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  2, /* reduc_f16_cost  */
+  2, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
+  2, /* store_elt_extra_cost  */
+  2, /* vec_to_scalar_cost  */
+  1, /* scalar_to_vec_cost  */
+  1, /* align_load_cost  */
+  1, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+/* Generic costs for SVE vector operations.  */
+static const sve_vec_cost generic_sve_vector_cost =
+{
+  {
+    1, /* int_stmt_cost  */
+    1, /* fp_stmt_cost  */
+    0, /* ld2_st2_permute_cost  */
+    0, /* ld3_st3_permute_cost  */
+    0, /* ld4_st4_permute_cost  */
+    2, /* permute_cost  */
+    2, /* reduc_i8_cost  */
+    2, /* reduc_i16_cost  */
+    2, /* reduc_i32_cost  */
+    2, /* reduc_i64_cost  */
+    2, /* reduc_f16_cost  */
+    2, /* reduc_f32_cost  */
+    2, /* reduc_f64_cost  */
+    2, /* store_elt_extra_cost  */
+    2, /* vec_to_scalar_cost  */
+    1, /* scalar_to_vec_cost  */
+    1, /* align_load_cost  */
+    1, /* unalign_load_cost  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  2, /* clast_cost  */
+  2, /* fadda_f16_cost  */
+  2, /* fadda_f32_cost  */
+  2, /* fadda_f64_cost  */
+  4, /* gather_load_x32_cost  */
+  2, /* gather_load_x64_cost  */
+  1 /* scatter_store_elt_cost  */
+};
+
+/* Generic costs for vector insn classes.  */
+static const struct cpu_vector_cost generic_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  1, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  3, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &generic_advsimd_vector_cost, /* advsimd  */
+  &generic_sve_vector_cost, /* sve */
+  nullptr /* issue_info  */
+};
+
+/* Generic costs for branch instructions.  */
+static const struct cpu_branch_cost generic_branch_cost =
+{
+  1,  /* Predictable.  */
+  3   /* Unpredictable.  */
+};
+
+/* Generic approximation modes.  */
+static const cpu_approx_modes generic_approx_modes =
+{
+  AARCH64_APPROX_NONE,	/* division  */
+  AARCH64_APPROX_NONE,	/* sqrt  */
+  AARCH64_APPROX_NONE	/* recip_sqrt  */
+};
+
+/* Generic prefetch settings (which disable prefetch).  */
+static const cpu_prefetch_tune generic_prefetch_tune =
+{
+  0,			/* num_slots  */
+  -1,			/* l1_cache_size  */
+  -1,			/* l1_cache_line_size  */
+  -1,			/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params generic_tunings =
+{
+  &cortexa57_extra_costs,
+  &generic_addrcost_table,
+  &generic_regmove_cost,
+  &generic_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  2, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "16:12",	/* function_align.  */
+  "4",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
+     Neoverse V1.  It does not have a noticeable effect on A64FX and should
+     have at most a very minor effect on SVE2 cores.  */
+  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_GENERIC.  */
diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
new file mode 100644
index 0000000000000000000000000000000000000000..50d7b23712cc6a8be8f35246657ec5d86d6d4191
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
@@ -0,0 +1,164 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_NEOVERSE512TVB
+#define GCC_AARCH64_H_NEOVERSE512TVB
+
+#include "generic.h"
+
+static const sve_vec_cost neoverse512tvb_sve_vector_cost =
+{
+  {
+    2, /* int_stmt_cost  */
+    2, /* fp_stmt_cost  */
+    4, /* ld2_st2_permute_cost  */
+    5, /* ld3_st3_permute_cost  */
+    5, /* ld4_st4_permute_cost  */
+    3, /* permute_cost  */
+    /* Theoretically, a reduction involving 15 scalar ADDs could
+       complete in ~5 cycles and would have a cost of 15.  Assume that
+       [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6.  */
+    21, /* reduc_i8_cost  */
+    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
+    13, /* reduc_i16_cost  */
+    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
+    9, /* reduc_i32_cost  */
+    /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7.  */
+    8, /* reduc_i64_cost  */
+    /* Theoretically, a reduction involving 7 scalar FADDs could
+       complete in ~6 cycles and would have a cost of 14.  Assume that
+       FADDV completes in 8 cycles and so give it a cost of 14 + 2.  */
+    16, /* reduc_f16_cost  */
+    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
+    8, /* reduc_f32_cost  */
+    /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2.  */
+    4, /* reduc_f64_cost  */
+    2, /* store_elt_extra_cost  */
+    /* This value is just inherited from the Cortex-A57 table.  */
+    8, /* vec_to_scalar_cost  */
+    /* This depends very much on what the scalar value is and
+       where it comes from.  E.g. some constants take two dependent
+       instructions or a load, while others might be moved from a GPR.
+       4 seems to be a reasonable compromise in practice.  */
+    4, /* scalar_to_vec_cost  */
+    4, /* align_load_cost  */
+    4, /* unalign_load_cost  */
+    /* Although stores generally have a latency of 2 and compete for the
+       vector pipes, in practice it's better not to model that.  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  3, /* clast_cost  */
+  10, /* fadda_f16_cost  */
+  6, /* fadda_f32_cost  */
+  4, /* fadda_f64_cost  */
+  /* A strided Advanced SIMD x64 load would take two parallel FP loads
+     (6 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
+     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
+     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
+     (cost 2) to that, to avoid the difference being lost in rounding.
+
+     There is no easy comparison between a strided Advanced SIMD x32 load
+     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
+     operation more than a 64-bit gather.  */
+  14, /* gather_load_x32_cost  */
+  12, /* gather_load_x64_cost  */
+  3 /* scatter_store_elt_cost  */
+};
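+
+/* A worked example of the reduction-cost scheme used in the comments above,
+   assuming the latencies they quote: each reduc_* entry is the cost of the
+   equivalent scalar reduction tree plus the difference between the vector
+   reduction latency and that tree's latency.  For reduc_i8_cost this gives
+   15 + (11 - 5) = 21, and for reduc_f16_cost 14 + (8 - 6) = 16.  */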
+
+static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
+{
+  {
+    {
+      3, /* loads_per_cycle  */
+      2, /* stores_per_cycle  */
+      4, /* general_ops_per_cycle  */
+      0, /* fp_simd_load_general_ops  */
+      1 /* fp_simd_store_general_ops  */
+    },
+    2, /* ld2_st2_general_ops  */
+    2, /* ld3_st3_general_ops  */
+    3 /* ld4_st4_general_ops  */
+  },
+  2, /* pred_ops_per_cycle  */
+  2, /* while_pred_ops  */
+  2, /* int_cmp_pred_ops  */
+  1, /* fp_cmp_pred_ops  */
+  1, /* gather_scatter_pair_general_ops  */
+  1 /* gather_scatter_pair_pred_ops  */
+};
+
+static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
+{
+  &neoversev1_scalar_issue_info,
+  &neoversev1_advsimd_issue_info,
+  &neoverse512tvb_sve_issue_info
+};
+
+static const struct cpu_vector_cost neoverse512tvb_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  2, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &neoversev1_advsimd_vector_cost, /* advsimd  */
+  &neoverse512tvb_sve_vector_cost, /* sve  */
+  &neoverse512tvb_vec_issue_info /* issue_info  */
+};
+
+static const struct tune_params neoverse512tvb_tunings =
+{
+  &cortexa76_extra_costs,
+  &neoversev1_addrcost_table,
+  &neoversev1_regmove_cost,
+  &neoverse512tvb_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_128 | SVE_256, /* sve_width  */
+  { 4, /* load_int.  */
+    2, /* store_int.  */
+    6, /* load_fp.  */
+    2, /* store_fp.  */
+    6, /* load_pred.  */
+    1 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "32:16",	/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  4,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_NEOVERSE512TVB.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversen1.h b/gcc/config/aarch64/tuning_models/neoversen1.h
new file mode 100644
index 0000000000000000000000000000000000000000..132166d3d06430b725e4448937332cc159c11cda
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoversen1.h
@@ -0,0 +1,60 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_NEOVERSEN1
+#define GCC_AARCH64_H_NEOVERSEN1
+
+#include "generic.h"
+
+static const struct tune_params neoversen1_tunings =
+{
+  &cortexa76_extra_costs,
+  &generic_addrcost_table,
+  &generic_regmove_cost,
+  &cortexa57_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    2, /* store_int.  */
+    5, /* load_fp.  */
+    2, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "32:16",	/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_NEOVERSEN1.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h
new file mode 100644
index 0000000000000000000000000000000000000000..395a6d82b8403e586bf179cade055543cf9b9eb0
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoversen2.h
@@ -0,0 +1,245 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_NEOVERSEN2
+#define GCC_AARCH64_H_NEOVERSEN2
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table neoversen2_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  2, /* post_modify_ld3_st3  */
+  2, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost neoversen2_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Spilling to int<->fp instead of memory is recommended so set
+     realistic costs compared to memmov_cost.  */
+  3, /* GP2FP  */
+  2, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  2, /* ld2_st2_permute_cost */
+  2, /* ld3_st3_permute_cost  */
+  3, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  4, /* reduc_i8_cost  */
+  4, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  6, /* reduc_f16_cost  */
+  4, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
+  2, /* store_elt_extra_cost  */
+  /* This value is just inherited from the Cortex-A57 table.  */
+  8, /* vec_to_scalar_cost  */
+  /* This depends very much on what the scalar value is and
+     where it comes from.  E.g. some constants take two dependent
+     instructions or a load, while others might be moved from a GPR.
+     4 seems to be a reasonable compromise in practice.  */
+  4, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  /* Although stores have a latency of 2 and compete for the
+     vector pipes, in practice it's better not to model that.  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const sve_vec_cost neoversen2_sve_vector_cost =
+{
+  {
+    2, /* int_stmt_cost  */
+    2, /* fp_stmt_cost  */
+    3, /* ld2_st2_permute_cost  */
+    4, /* ld3_st3_permute_cost  */
+    4, /* ld4_st4_permute_cost  */
+    3, /* permute_cost  */
+    /* Theoretically, a reduction involving 15 scalar ADDs could
+       complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
+       completes in 11 cycles, so give it a cost of 15 + 6.  */
+    21, /* reduc_i8_cost  */
+    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
+    13, /* reduc_i16_cost  */
+    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
+    9, /* reduc_i32_cost  */
+    /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1.  */
+    2, /* reduc_i64_cost  */
+    /* Theoretically, a reduction involving 7 scalar FADDs could
+       complete in ~8 cycles and would have a cost of 14.  FADDV
+       completes in 6 cycles, so give it a cost of 14 - 2.  */
+    12, /* reduc_f16_cost  */
+    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0.  */
+    6, /* reduc_f32_cost  */
+    /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0.  */
+    2, /* reduc_f64_cost  */
+    2, /* store_elt_extra_cost  */
+    /* This value is just inherited from the Cortex-A57 table.  */
+    8, /* vec_to_scalar_cost  */
+    /* See the comment above the Advanced SIMD versions.  */
+    4, /* scalar_to_vec_cost  */
+    4, /* align_load_cost  */
+    4, /* unalign_load_cost  */
+    /* Although stores have a latency of 2 and compete for the
+       vector pipes, in practice it's better not to model that.  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  3, /* clast_cost  */
+  10, /* fadda_f16_cost  */
+  6, /* fadda_f32_cost  */
+  4, /* fadda_f64_cost  */
+  /* A strided Advanced SIMD x64 load would take two parallel FP loads
+     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
+     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
+     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
+     (cost 2) to that, to avoid the difference being lost in rounding.
+
+     There is no easy comparison between a strided Advanced SIMD x32 load
+     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
+     operation more than a 64-bit gather.  */
+  14, /* gather_load_x32_cost  */
+  12, /* gather_load_x64_cost  */
+  3 /* scatter_store_elt_cost  */
+};
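+
+/* Spelling the gather comment above out as arithmetic, using the scalar
+   load cost of 4 given below: the strided Advanced SIMD x64 load is 2
+   scalar loads (8) plus a vec_construct (2) plus one full vector operation
+   (2), i.e. 8 + 2 + 2 = 12 = gather_load_x64_cost, and the 32-bit gather
+   is one vector operation more, 12 + 2 = 14 = gather_load_x32_cost.  */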
+
+static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
+{
+  3, /* loads_stores_per_cycle  */
+  2, /* stores_per_cycle  */
+  4, /* general_ops_per_cycle  */
+  0, /* fp_simd_load_general_ops  */
+  1 /* fp_simd_store_general_ops  */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
+{
+  {
+    3, /* loads_stores_per_cycle  */
+    2, /* stores_per_cycle  */
+    2, /* general_ops_per_cycle  */
+    0, /* fp_simd_load_general_ops  */
+    1 /* fp_simd_store_general_ops  */
+  },
+  2, /* ld2_st2_general_ops  */
+  2, /* ld3_st3_general_ops  */
+  3 /* ld4_st4_general_ops  */
+};
+
+static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
+{
+  {
+    {
+      3, /* loads_per_cycle  */
+      2, /* stores_per_cycle  */
+      2, /* general_ops_per_cycle  */
+      0, /* fp_simd_load_general_ops  */
+      1 /* fp_simd_store_general_ops  */
+    },
+    2, /* ld2_st2_general_ops  */
+    3, /* ld3_st3_general_ops  */
+    3 /* ld4_st4_general_ops  */
+  },
+  2, /* pred_ops_per_cycle  */
+  2, /* while_pred_ops  */
+  2, /* int_cmp_pred_ops  */
+  1, /* fp_cmp_pred_ops  */
+  1, /* gather_scatter_pair_general_ops  */
+  1 /* gather_scatter_pair_pred_ops  */
+};
+
+static const aarch64_vec_issue_info neoversen2_vec_issue_info =
+{
+  &neoversen2_scalar_issue_info,
+  &neoversen2_advsimd_issue_info,
+  &neoversen2_sve_issue_info
+};
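+
+/* Roughly speaking, these issue tables only matter when the
+   AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS and
+   AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT flags below are set: the
+   backend counts the loads, stores, predicate ops and general ops needed
+   per loop iteration and divides by the per-cycle rates here to estimate
+   an issue-limited cycle count for the scalar, Advanced SIMD and SVE
+   versions of the loop when comparing them.  */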
+
+/* Neoverse N2 costs for vector insn classes.  */
+static const struct cpu_vector_cost neoversen2_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  2, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &neoversen2_advsimd_vector_cost, /* advsimd  */
+  &neoversen2_sve_vector_cost, /* sve  */
+  &neoversen2_vec_issue_info /* issue_info  */
+};
+
+static const struct tune_params neoversen2_tunings =
+{
+  &cortexa76_extra_costs,
+  &neoversen2_addrcost_table,
+  &neoversen2_regmove_cost,
+  &neoversen2_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_128, /* sve_width  */
+  { 4, /* load_int.  */
+    1, /* store_int.  */
+    6, /* load_fp.  */
+    2, /* store_fp.  */
+    6, /* load_pred.  */
+    1 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "32:16",	/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
+   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_NEOVERSEN2.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
new file mode 100644
index 0000000000000000000000000000000000000000..584a5000e06f598dcdd3bcc533dc6dbc642223ca
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoversev1.h
@@ -0,0 +1,237 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_NEOVERSEV1
+#define GCC_AARCH64_H_NEOVERSEV1
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table neoversev1_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  3, /* post_modify_ld3_st3  */
+  3, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost neoversev1_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Spilling to int<->fp instead of memory is recommended so set
+     realistic costs compared to memmov_cost.  */
+  3, /* GP2FP  */
+  2, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  4, /* ld2_st2_permute_cost */
+  4, /* ld3_st3_permute_cost  */
+  5, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  4, /* reduc_i8_cost  */
+  4, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  6, /* reduc_f16_cost  */
+  3, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
+  2, /* store_elt_extra_cost  */
+  /* This value is just inherited from the Cortex-A57 table.  */
+  8, /* vec_to_scalar_cost  */
+  /* This depends very much on what the scalar value is and
+     where it comes from.  E.g. some constants take two dependent
+     instructions or a load, while others might be moved from a GPR.
+     4 seems to be a reasonable compromise in practice.  */
+  4, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  /* Although stores have a latency of 2 and compete for the
+     vector pipes, in practice it's better not to model that.  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const sve_vec_cost neoversev1_sve_vector_cost =
+{
+  {
+    2, /* int_stmt_cost  */
+    2, /* fp_stmt_cost  */
+    4, /* ld2_st2_permute_cost  */
+    7, /* ld3_st3_permute_cost  */
+    8, /* ld4_st4_permute_cost  */
+    3, /* permute_cost  */
+    /* Theoretically, a reduction involving 31 scalar ADDs could
+       complete in ~9 cycles and would have a cost of 31.  [SU]ADDV
+       completes in 14 cycles, so give it a cost of 31 + 5.  */
+    36, /* reduc_i8_cost  */
+    /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7.  */
+    22, /* reduc_i16_cost  */
+    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7.  */
+    14, /* reduc_i32_cost  */
+    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8.  */
+    11, /* reduc_i64_cost  */
+    /* Theoretically, a reduction involving 15 scalar FADDs could
+       complete in ~9 cycles and would have a cost of 30.  FADDV
+       completes in 13 cycles, so give it a cost of 30 + 4.  */
+    34, /* reduc_f16_cost  */
+    /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5.  */
+    19, /* reduc_f32_cost  */
+    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5.  */
+    11, /* reduc_f64_cost  */
+    2, /* store_elt_extra_cost  */
+    /* This value is just inherited from the Cortex-A57 table.  */
+    8, /* vec_to_scalar_cost  */
+    /* See the comment above the Advanced SIMD versions.  */
+    4, /* scalar_to_vec_cost  */
+    4, /* align_load_cost  */
+    4, /* unalign_load_cost  */
+    /* Although stores have a latency of 2 and compete for the
+       vector pipes, in practice it's better not to model that.  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  3, /* clast_cost  */
+  19, /* fadda_f16_cost  */
+  11, /* fadda_f32_cost  */
+  8, /* fadda_f64_cost  */
+  32, /* gather_load_x32_cost  */
+  16, /* gather_load_x64_cost  */
+  3 /* scatter_store_elt_cost  */
+};
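+
+/* The fadda_* entries model FADDA, the strictly-ordered floating-point
+   add reduction.  Because it accumulates element by element, its cost is
+   assumed to grow with the element count, hence F16 (most elements per
+   vector) is the most expensive and F64 the cheapest.  */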
+
+static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
+{
+  3, /* loads_stores_per_cycle  */
+  2, /* stores_per_cycle  */
+  4, /* general_ops_per_cycle  */
+  0, /* fp_simd_load_general_ops  */
+  1 /* fp_simd_store_general_ops  */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
+{
+  {
+    3, /* loads_stores_per_cycle  */
+    2, /* stores_per_cycle  */
+    4, /* general_ops_per_cycle  */
+    0, /* fp_simd_load_general_ops  */
+    1 /* fp_simd_store_general_ops  */
+  },
+  2, /* ld2_st2_general_ops  */
+  2, /* ld3_st3_general_ops  */
+  3 /* ld4_st4_general_ops  */
+};
+
+static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
+{
+  {
+    {
+      2, /* loads_per_cycle  */
+      2, /* stores_per_cycle  */
+      2, /* general_ops_per_cycle  */
+      0, /* fp_simd_load_general_ops  */
+      1 /* fp_simd_store_general_ops  */
+    },
+    2, /* ld2_st2_general_ops  */
+    2, /* ld3_st3_general_ops  */
+    3 /* ld4_st4_general_ops  */
+  },
+  1, /* pred_ops_per_cycle  */
+  2, /* while_pred_ops  */
+  2, /* int_cmp_pred_ops  */
+  1, /* fp_cmp_pred_ops  */
+  1, /* gather_scatter_pair_general_ops  */
+  1 /* gather_scatter_pair_pred_ops  */
+};
+
+static const aarch64_vec_issue_info neoversev1_vec_issue_info =
+{
+  &neoversev1_scalar_issue_info,
+  &neoversev1_advsimd_issue_info,
+  &neoversev1_sve_issue_info
+};
+
+/* Neoverse V1 costs for vector insn classes.  */
+static const struct cpu_vector_cost neoversev1_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  2, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &neoversev1_advsimd_vector_cost, /* advsimd  */
+  &neoversev1_sve_vector_cost, /* sve  */
+  &neoversev1_vec_issue_info /* issue_info  */
+};
+
+static const struct tune_params neoversev1_tunings =
+{
+  &cortexa76_extra_costs,
+  &neoversev1_addrcost_table,
+  &neoversev1_regmove_cost,
+  &neoversev1_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_256, /* sve_width  */
+  { 4, /* load_int.  */
+    2, /* store_int.  */
+    6, /* load_fp.  */
+    2, /* store_fp.  */
+    6, /* load_pred.  */
+    1 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "32:16",	/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  4,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
+   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_NEOVERSEV1.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
new file mode 100644
index 0000000000000000000000000000000000000000..28d4244ef4c99ecdffb7408e39dc21bc191223de
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -0,0 +1,245 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_NEOVERSEV2
+#define GCC_AARCH64_H_NEOVERSEV2
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table neoversev2_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  2, /* post_modify_ld3_st3  */
+  2, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost neoversev2_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Spilling to int<->fp instead of memory is recommended so set
+     realistic costs compared to memmov_cost.  */
+  3, /* GP2FP  */
+  2, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  2, /* ld2_st2_permute_cost */
+  2, /* ld3_st3_permute_cost  */
+  3, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  4, /* reduc_i8_cost  */
+  4, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  6, /* reduc_f16_cost  */
+  3, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
+  2, /* store_elt_extra_cost  */
+  /* This value is just inherited from the Cortex-A57 table.  */
+  8, /* vec_to_scalar_cost  */
+  /* This depends very much on what the scalar value is and
+     where it comes from.  E.g. some constants take two dependent
+     instructions or a load, while others might be moved from a GPR.
+     4 seems to be a reasonable compromise in practice.  */
+  4, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  /* Although stores have a latency of 2 and compete for the
+     vector pipes, in practice it's better not to model that.  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const sve_vec_cost neoversev2_sve_vector_cost =
+{
+  {
+    2, /* int_stmt_cost  */
+    2, /* fp_stmt_cost  */
+    3, /* ld2_st2_permute_cost  */
+    3, /* ld3_st3_permute_cost  */
+    4, /* ld4_st4_permute_cost  */
+    3, /* permute_cost  */
+    /* Theoretically, a reduction involving 15 scalar ADDs could
+       complete in ~3 cycles and would have a cost of 15.  [SU]ADDV
+       completes in 11 cycles, so give it a cost of 15 + 8.  */
+    21, /* reduc_i8_cost  */
+    /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7.  */
+    14, /* reduc_i16_cost  */
+    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4.  */
+    7, /* reduc_i32_cost  */
+    /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1.  */
+    2, /* reduc_i64_cost  */
+    /* Theoretically, a reduction involving 7 scalar FADDs could
+       complete in ~6 cycles and would have a cost of 14.  FADDV
+       completes in 8 cycles, so give it a cost of 14 + 2.  */
+    16, /* reduc_f16_cost  */
+    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
+    8, /* reduc_f32_cost  */
+    /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2.  */
+    4, /* reduc_f64_cost  */
+    2, /* store_elt_extra_cost  */
+    /* This value is just inherited from the Cortex-A57 table.  */
+    8, /* vec_to_scalar_cost  */
+    /* See the comment above the Advanced SIMD versions.  */
+    4, /* scalar_to_vec_cost  */
+    4, /* align_load_cost  */
+    4, /* unalign_load_cost  */
+    /* Although stores have a latency of 2 and compete for the
+       vector pipes, in practice it's better not to model that.  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  3, /* clast_cost  */
+  10, /* fadda_f16_cost  */
+  6, /* fadda_f32_cost  */
+  4, /* fadda_f64_cost  */
+  /* A strided Advanced SIMD x64 load would take two parallel FP loads
+     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
+     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
+     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
+     (cost 2) to that, to avoid the difference being lost in rounding.
+
+     There is no easy comparison between a strided Advanced SIMD x32 load
+     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
+     operation more than a 64-bit gather.  */
+  14, /* gather_load_x32_cost  */
+  12, /* gather_load_x64_cost  */
+  3 /* scatter_store_elt_cost  */
+};
+
+static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
+{
+  3, /* loads_stores_per_cycle  */
+  2, /* stores_per_cycle  */
+  6, /* general_ops_per_cycle  */
+  0, /* fp_simd_load_general_ops  */
+  1 /* fp_simd_store_general_ops  */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
+{
+  {
+    3, /* loads_stores_per_cycle  */
+    2, /* stores_per_cycle  */
+    4, /* general_ops_per_cycle  */
+    0, /* fp_simd_load_general_ops  */
+    1 /* fp_simd_store_general_ops  */
+  },
+  2, /* ld2_st2_general_ops  */
+  2, /* ld3_st3_general_ops  */
+  3 /* ld4_st4_general_ops  */
+};
+
+static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
+{
+  {
+    {
+      3, /* loads_per_cycle  */
+      2, /* stores_per_cycle  */
+      4, /* general_ops_per_cycle  */
+      0, /* fp_simd_load_general_ops  */
+      1 /* fp_simd_store_general_ops  */
+    },
+    2, /* ld2_st2_general_ops  */
+    3, /* ld3_st3_general_ops  */
+    3 /* ld4_st4_general_ops  */
+  },
+  2, /* pred_ops_per_cycle  */
+  2, /* while_pred_ops  */
+  2, /* int_cmp_pred_ops  */
+  1, /* fp_cmp_pred_ops  */
+  1, /* gather_scatter_pair_general_ops  */
+  1 /* gather_scatter_pair_pred_ops  */
+};
+
+static const aarch64_vec_issue_info neoversev2_vec_issue_info =
+{
+  &neoversev2_scalar_issue_info,
+  &neoversev2_advsimd_issue_info,
+  &neoversev2_sve_issue_info
+};
+
+/* Neoverse V2 (Demeter) costs for vector insn classes.  */
+static const struct cpu_vector_cost neoversev2_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  2, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &neoversev2_advsimd_vector_cost, /* advsimd  */
+  &neoversev2_sve_vector_cost, /* sve  */
+  &neoversev2_vec_issue_info /* issue_info  */
+};
+
+static const struct tune_params neoversev2_tunings =
+{
+  &cortexa76_extra_costs,
+  &neoversev2_addrcost_table,
+  &neoversev2_regmove_cost,
+  &neoversev2_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_128, /* sve_width  */
+  { 4, /* load_int.  */
+    2, /* store_int.  */
+    6, /* load_fp.  */
+    1, /* store_fp.  */
+    6, /* load_pred.  */
+    2 /* store_pred.  */
+  }, /* memmov_cost.  */
+  5, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "32:16",	/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  3,	/* int_reassoc_width.  */
+  6,	/* fp_reassoc_width.  */
+  4,	/* fma_reassoc_width.  */
+  3,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
+   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_NEOVERSEV2.  */
diff --git a/gcc/config/aarch64/tuning_models/qdf24xx.h b/gcc/config/aarch64/tuning_models/qdf24xx.h
new file mode 100644
index 0000000000000000000000000000000000000000..29c9b9f5843acc15450a2492b141c02ee48a3f13
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/qdf24xx.h
@@ -0,0 +1,137 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_QDF24XX
+#define GCC_AARCH64_H_QDF24XX
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table qdf24xx_addrcost_table =
+{
+    {
+      1, /* hi  */
+      1, /* si  */
+      1, /* di  */
+      2, /* ti  */
+    },
+  1, /* pre_modify  */
+  1, /* post_modify  */
+  1, /* post_modify_ld3_st3  */
+  1, /* post_modify_ld4_st4  */
+  3, /* register_offset  */
+  3, /* register_sextend  */
+  3, /* register_zextend  */
+  2, /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost qdf24xx_regmove_cost =
+{
+  2, /* GP2GP  */
+  /* Avoid the use of int<->fp moves for spilling.  */
+  6, /* GP2FP  */
+  6, /* FP2GP  */
+  4 /* FP2FP  */
+};
+
+static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
+{
+  1, /* int_stmt_cost  */
+  3, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  2, /* permute_cost  */
+  1, /* reduc_i8_cost  */
+  1, /* reduc_i16_cost  */
+  1, /* reduc_i32_cost  */
+  1, /* reduc_i64_cost  */
+  1, /* reduc_f16_cost  */
+  1, /* reduc_f32_cost  */
+  1, /* reduc_f64_cost  */
+  1, /* store_elt_extra_cost  */
+  1, /* vec_to_scalar_cost  */
+  1, /* scalar_to_vec_cost  */
+  1, /* align_load_cost  */
+  1, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+/* QDF24XX costs for vector insn classes.  */
+static const struct cpu_vector_cost qdf24xx_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  1, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  3, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &qdf24xx_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+static const cpu_prefetch_tune qdf24xx_prefetch_tune =
+{
+  4,			/* num_slots  */
+  32,			/* l1_cache_size  */
+  64,			/* l1_cache_line_size  */
+  512,			/* l2_cache_size  */
+  false,		/* prefetch_dynamic_strides */
+  2048,			/* minimum_stride */
+  3			/* default_opt_level  */
+};
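+
+/* Unlike generic_prefetch_tune (which leaves software prefetching
+   disabled), these values are expected to seed the prefetch-related
+   --param defaults: num_slots for the number of simultaneous prefetches,
+   the cache fields for the L1/L2 sizes and line size, and a
+   default_opt_level of 3 turning -fprefetch-loop-arrays on by default at
+   -O3 for this core, limited to known strides of at least 2048 bytes.  */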
+
+static const struct tune_params qdf24xx_tunings =
+{
+  &qdf24xx_extra_costs,
+  &qdf24xx_addrcost_table,
+  &qdf24xx_regmove_cost,
+  &qdf24xx_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "8",	/* jump_align.  */
+  "16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
+  &qdf24xx_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_QDF24XX.  */
diff --git a/gcc/config/aarch64/tuning_models/saphira.h b/gcc/config/aarch64/tuning_models/saphira.h
new file mode 100644
index 0000000000000000000000000000000000000000..e584d316bb7c3c2d232cf7623a92100ad261f07d
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/saphira.h
@@ -0,0 +1,63 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_SAPHIRA
+#define GCC_AARCH64_H_SAPHIRA
+
+#include "generic.h"
+
+/* Tuning structure for the Qualcomm Saphira core.  Default to Falkor values
+   for now.  */
+static const struct tune_params saphira_tunings =
+{
+  &generic_extra_costs,
+  &generic_addrcost_table,
+  &generic_regmove_cost,
+  &generic_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "8",	/* jump_align.  */
+  "16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_SAPHIRA.  */
diff --git a/gcc/config/aarch64/tuning_models/thunderx.h b/gcc/config/aarch64/tuning_models/thunderx.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd4b9d539fc5cf2bd20d84e91d6b72fa7237f99f
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/thunderx.h
@@ -0,0 +1,117 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_THUNDERX
+#define GCC_AARCH64_H_THUNDERX
+
+#include "generic.h"
+
+static const struct cpu_regmove_cost thunderx_regmove_cost =
+{
+  2, /* GP2GP  */
+  2, /* GP2FP  */
+  6, /* FP2GP  */
+  4 /* FP2FP  */
+};
+
+static const advsimd_vec_cost thunderx_advsimd_vector_cost =
+{
+  4, /* int_stmt_cost  */
+  1, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  4, /* permute_cost  */
+  2, /* reduc_i8_cost  */
+  2, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  2, /* reduc_f16_cost  */
+  2, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
+  2, /* store_elt_extra_cost  */
+  2, /* vec_to_scalar_cost  */
+  2, /* scalar_to_vec_cost  */
+  3, /* align_load_cost  */
+  5, /* unalign_load_cost  */
+  5, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+/* ThunderX costs for vector insn classes.  */
+static const struct cpu_vector_cost thunderx_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  3, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  3, /* cond_taken_branch_cost  */
+  3, /* cond_not_taken_branch_cost  */
+  &thunderx_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+static const cpu_prefetch_tune thunderx_prefetch_tune =
+{
+  8,			/* num_slots  */
+  32,			/* l1_cache_size  */
+  128,			/* l1_cache_line_size  */
+  -1,			/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params thunderx_tunings =
+{
+  &thunderx_extra_costs,
+  &generic_addrcost_table,
+  &thunderx_regmove_cost,
+  &thunderx_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
+  2, /* issue_rate  */
+  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
+  "8",	/* function_align.  */
+  "8",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
+  &thunderx_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
+};
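+
+/* Note that the ThunderX tunings (and the ThunderX T88 ones below, which
+   reuse these cost tables) request the ALIGNED load/store-pair policy
+   rather than the ALWAYS policy used by most other models in this series,
+   i.e. LDP/STP formation is presumably only wanted when the accesses are
+   known to be suitably aligned.  */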
+
+#endif /* GCC_AARCH64_H_THUNDERX.  */
diff --git a/gcc/config/aarch64/tuning_models/thunderx2t99.h b/gcc/config/aarch64/tuning_models/thunderx2t99.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a376e0bab37b0b5bc1ea23de0e96a9245846fd7
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/thunderx2t99.h
@@ -0,0 +1,137 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_THUNDERX2T99
+#define GCC_AARCH64_H_THUNDERX2T99
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
+{
+    {
+      1, /* hi  */
+      1, /* si  */
+      1, /* di  */
+      2, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* post_modify_ld3_st3  */
+  0, /* post_modify_ld4_st4  */
+  2, /* register_offset  */
+  3, /* register_sextend  */
+  3, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of int<->fp moves for spilling.  */
+  5, /* GP2FP  */
+  6, /* FP2GP  */
+  3, /* FP2FP  */
+};
+
+static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
+{
+  4, /* int_stmt_cost  */
+  5, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  10, /* permute_cost  */
+  6, /* reduc_i8_cost  */
+  6, /* reduc_i16_cost  */
+  6, /* reduc_i32_cost  */
+  6, /* reduc_i64_cost  */
+  6, /* reduc_f16_cost  */
+  6, /* reduc_f32_cost  */
+  6, /* reduc_f64_cost  */
+  6, /* store_elt_extra_cost  */
+  6, /* vec_to_scalar_cost  */
+  5, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+/* ThunderX2 T99 (Vulcan) costs for vector insn classes.  */
+static const struct cpu_vector_cost thunderx2t99_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  6, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  2, /* cond_taken_branch_cost  */
+  1,  /* cond_not_taken_branch_cost  */
+  &thunderx2t99_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
+{
+  8,			/* num_slots  */
+  32,			/* l1_cache_size  */
+  64,			/* l1_cache_line_size  */
+  256,			/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params thunderx2t99_tunings =
+{
+  &thunderx2t99_extra_costs,
+  &thunderx2t99_addrcost_table,
+  &thunderx2t99_regmove_cost,
+  &thunderx2t99_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate.  */
+  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
+   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "8",	/* jump_align.  */
+  "16",	/* loop_align.  */
+  3,	/* int_reassoc_width.  */
+  2,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &thunderx2t99_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_THUNDERX2T99.  */
diff --git a/gcc/config/aarch64/tuning_models/thunderx3t110.h b/gcc/config/aarch64/tuning_models/thunderx3t110.h
new file mode 100644
index 0000000000000000000000000000000000000000..65203b4af132e12e4994013fbab228bd3873b756
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/thunderx3t110.h
@@ -0,0 +1,136 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_THUNDERX3T110
+#define GCC_AARCH64_H_THUNDERX3T110
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
+{
+    {
+      1, /* hi  */
+      1, /* si  */
+      1, /* di  */
+      2, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* post_modify_ld3_st3  */
+  0, /* post_modify_ld4_st4  */
+  2, /* register_offset  */
+  3, /* register_sextend  */
+  3, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of int<->fp moves for spilling.  */
+  4, /* GP2FP  */
+  5, /* FP2GP  */
+  4  /* FP2FP  */
+};
+
+static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
+{
+  5, /* int_stmt_cost  */
+  5, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  10, /* permute_cost  */
+  5, /* reduc_i8_cost  */
+  5, /* reduc_i16_cost  */
+  5, /* reduc_i32_cost  */
+  5, /* reduc_i64_cost  */
+  5, /* reduc_f16_cost  */
+  5, /* reduc_f32_cost  */
+  5, /* reduc_f64_cost  */
+  5, /* store_elt_extra_cost  */
+  5, /* vec_to_scalar_cost  */
+  5, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  4, /* unalign_store_cost  */
+  4  /* store_cost  */
+};
+
+static const struct cpu_vector_cost thunderx3t110_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  5, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  2, /* cond_taken_branch_cost  */
+  1,  /* cond_not_taken_branch_cost  */
+  &thunderx3t110_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
+{
+  8,			/* num_slots  */
+  32,			/* l1_cache_size  */
+  64,			/* l1_cache_line_size  */
+  256,			/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params thunderx3t110_tunings =
+{
+  &thunderx3t110_extra_costs,
+  &thunderx3t110_addrcost_table,
+  &thunderx3t110_regmove_cost,
+  &thunderx3t110_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  6, /* issue_rate.  */
+  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
+   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "8",	/* jump_align.  */
+  "16",	/* loop_align.  */
+  3,	/* int_reassoc_width.  */
+  2,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &thunderx3t110_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_THUNDERX3T110.  */
diff --git a/gcc/config/aarch64/tuning_models/thunderxt88.h b/gcc/config/aarch64/tuning_models/thunderxt88.h
new file mode 100644
index 0000000000000000000000000000000000000000..dcc74d31484ee6b99d37920dbfe7b1d59377d074
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/thunderxt88.h
@@ -0,0 +1,72 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_THUNDERXT88
+#define GCC_AARCH64_H_THUNDERXT88
+
+#include "generic.h"
+#include "thunderx.h"
+
+static const cpu_prefetch_tune thunderxt88_prefetch_tune =
+{
+  8,			/* num_slots  */
+  32,			/* l1_cache_size  */
+  128,			/* l1_cache_line_size  */
+  16*1024,		/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  3			/* default_opt_level  */
+};
+
+static const struct tune_params thunderxt88_tunings =
+{
+  &thunderx_extra_costs,
+  &generic_addrcost_table,
+  &thunderx_regmove_cost,
+  &thunderx_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
+  2, /* issue_rate  */
+  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
+  "8",	/* function_align.  */
+  "8",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &thunderxt88_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_THUNDERXT88.  */
diff --git a/gcc/config/aarch64/tuning_models/tsv110.h b/gcc/config/aarch64/tuning_models/tsv110.h
new file mode 100644
index 0000000000000000000000000000000000000000..42aeafce652fff34e3277194993dd4aa1f0383a1
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/tsv110.h
@@ -0,0 +1,137 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_TSV110
+#define GCC_AARCH64_H_TSV110
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table tsv110_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* post_modify_ld3_st3  */
+  0, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  1, /* register_sextend  */
+  1, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost tsv110_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  2, /* GP2FP  */
+  3, /* FP2GP  */
+  2  /* FP2FP  */
+};
+
+static const advsimd_vec_cost tsv110_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  2, /* permute_cost  */
+  3, /* reduc_i8_cost  */
+  3, /* reduc_i16_cost  */
+  3, /* reduc_i32_cost  */
+  3, /* reduc_i64_cost  */
+  3, /* reduc_f16_cost  */
+  3, /* reduc_f32_cost  */
+  3, /* reduc_f64_cost  */
+  3, /* store_elt_extra_cost  */
+  3, /* vec_to_scalar_cost  */
+  2, /* scalar_to_vec_cost  */
+  5, /* align_load_cost  */
+  5, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const struct cpu_vector_cost tsv110_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  5, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &tsv110_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+static const cpu_prefetch_tune tsv110_prefetch_tune =
+{
+  0,                    /* num_slots  */
+  64,                   /* l1_cache_size  */
+  64,                   /* l1_cache_line_size  */
+  512,                  /* l2_cache_size  */
+  true,                 /* prefetch_dynamic_strides */
+  -1,                   /* minimum_stride */
+  -1                    /* default_opt_level  */
+};
+
+static const struct tune_params tsv110_tunings =
+{
+  &tsv110_extra_costs,
+  &tsv110_addrcost_table,
+  &tsv110_regmove_cost,
+  &tsv110_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4,    /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
+   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
+  "16", /* function_align.  */
+  "4",  /* jump_align.  */
+  "8",  /* loop_align.  */
+  2,    /* int_reassoc_width.  */
+  4,    /* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,    /* vec_reassoc_width.  */
+  2,    /* min_div_recip_mul_sf.  */
+  2,    /* min_div_recip_mul_df.  */
+  0,    /* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
+  &tsv110_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_TSV110.  */
diff --git a/gcc/config/aarch64/tuning_models/xgene1.h b/gcc/config/aarch64/tuning_models/xgene1.h
new file mode 100644
index 0000000000000000000000000000000000000000..53a3eb0ddeb80a9735cc988e242a70e87dc90655
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/xgene1.h
@@ -0,0 +1,145 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_XGENE1
+#define GCC_AARCH64_H_XGENE1
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table xgene1_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  1, /* pre_modify  */
+  1, /* post_modify  */
+  1, /* post_modify_ld3_st3  */
+  1, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  1, /* register_sextend  */
+  1, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost xgene1_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  8, /* GP2FP  */
+  8, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost xgene1_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  2, /* permute_cost  */
+  4, /* reduc_i8_cost  */
+  4, /* reduc_i16_cost  */
+  4, /* reduc_i32_cost  */
+  4, /* reduc_i64_cost  */
+  4, /* reduc_f16_cost  */
+  4, /* reduc_f32_cost  */
+  4, /* reduc_f64_cost  */
+  4, /* store_elt_extra_cost  */
+  4, /* vec_to_scalar_cost  */
+  4, /* scalar_to_vec_cost  */
+  10, /* align_load_cost  */
+  10, /* unalign_load_cost  */
+  2, /* unalign_store_cost  */
+  2  /* store_cost  */
+};
+
+/* Costs for vector insn classes for X-Gene 1.  */
+static const struct cpu_vector_cost xgene1_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  5, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  2, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &xgene1_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+/* Approximation modes for X-Gene 1.  */
+static const cpu_approx_modes xgene1_approx_modes =
+{
+  AARCH64_APPROX_NONE,	/* division  */
+  AARCH64_APPROX_NONE,	/* sqrt  */
+  AARCH64_APPROX_ALL	/* recip_sqrt  */
+};
+
+static const cpu_prefetch_tune xgene1_prefetch_tune =
+{
+  8,			/* num_slots  */
+  32,			/* l1_cache_size  */
+  64,			/* l1_cache_line_size  */
+  256,			/* l2_cache_size  */
+  true,                 /* prefetch_dynamic_strides */
+  -1,                   /* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params xgene1_tunings =
+{
+  &xgene1_extra_costs,
+  &xgene1_addrcost_table,
+  &xgene1_regmove_cost,
+  &xgene1_vector_cost,
+  &generic_branch_cost,
+  &xgene1_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  AARCH64_FUSE_NOTHING, /* fusible_ops  */
+  "16",	/* function_align.  */
+  "16",	/* jump_align.  */
+  "16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  17,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
+  &xgene1_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_XGENE1.  */
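
Several of the regmove tables in this series (xgene1 above, tsv110, and the generic table quoted further down) carry the comment that slow int<->fp moves should be discouraged for spilling by setting their cost higher than memmov_cost.  The following standalone sketch is not GCC code: the struct shapes and the comparison are simplified here, and the real trade-off is made by the register allocator through the target cost hooks.  It just plugs in the xgene1 numbers from this patch to show the intent of those values:

/* Toy illustration only; simplified from the cpu_regmove_cost and
   memmov_cost tables above, using the xgene1 values.  */
#include <stdio.h>

struct toy_regmove_cost { int gp2gp, gp2fp, fp2gp, fp2fp; };
struct toy_memmov_cost  { int load_int, store_int; };

static const struct toy_regmove_cost xgene1_regmove = { 1, 8, 8, 2 };
static const struct toy_memmov_cost  xgene1_memmov  = { 6, 6 };

int
main (void)
{
  /* Keeping a GP value alive across a register shortage: either bounce
     it through an FP register and back, or spill it to the stack.  */
  int via_fp_regs = xgene1_regmove.gp2fp + xgene1_regmove.fp2gp;      /* 16 */
  int via_memory  = xgene1_memmov.store_int + xgene1_memmov.load_int; /* 12 */

  printf ("via FP regs: %d, via memory: %d -> %s wins\n",
	  via_fp_regs, via_memory,
	  via_memory < via_fp_regs ? "memory" : "FP regs");
  return 0;
}

With GP2FP and FP2GP at 8 against a memmov cost of 6, the round trip through a vector register costs 16 versus 12 for a store/reload pair, which is the behaviour the in-line comments ask for.  Compare the neoverse regmove tables quoted below, whose comments instead recommend spilling through int<->fp registers and therefore keep those costs low relative to memmov_cost.
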

Comments

Richard Earnshaw Nov. 16, 2023, 9:13 a.m. UTC | #1
On 15/11/2023 17:06, Tamar Christina wrote:
> Hi All,
> 
> This patch series attempts to move the generic cost model in AArch64 to a new
> and modern generic standard.  The current standard is quite old and generates
> very suboptimal code out of the box for user of GCC.
> 
> The goal is for the new cost model to be beneficial on newer/current Arm
> Microarchitectures while not being too negative for older ones.
> 
> It does not change any core specific optimization.  The final changes reflect
> both performance optimizations and size optimizations.
> 
> This first patch just re-organizes the cost structures to their own files.
> The AArch64.cc file has gotten very big and it's hard to follow.
> 
> No functional changes are expected from this change.  Note that since all the
> structures have private visibility I've put them in header files instead.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	PR target/111370
> 	* config/aarch64/aarch64.cc (generic_addrcost_table,
> 	exynosm1_addrcost_table,
> 	xgene1_addrcost_table,
> 	thunderx2t99_addrcost_table,
> 	thunderx3t110_addrcost_table,
> 	tsv110_addrcost_table,
> 	qdf24xx_addrcost_table,
> 	a64fx_addrcost_table,
> 	neoversev1_addrcost_table,
> 	neoversen2_addrcost_table,
> 	neoversev2_addrcost_table,
> 	generic_regmove_cost,
> 	cortexa57_regmove_cost,
> 	cortexa53_regmove_cost,
> 	exynosm1_regmove_cost,
> 	thunderx_regmove_cost,
> 	xgene1_regmove_cost,
> 	qdf24xx_regmove_cost,
> 	thunderx2t99_regmove_cost,
> 	thunderx3t110_regmove_cost,
> 	tsv110_regmove_cost,
> 	a64fx_regmove_cost,
> 	neoversen2_regmove_cost,
> 	neoversev1_regmove_cost,
> 	neoversev2_regmove_cost,
> 	generic_vector_cost,
> 	a64fx_vector_cost,
> 	qdf24xx_vector_cost,
> 	thunderx_vector_cost,
> 	tsv110_vector_cost,
> 	cortexa57_vector_cost,
> 	exynosm1_vector_cost,
> 	xgene1_vector_cost,
> 	thunderx2t99_vector_cost,
> 	thunderx3t110_vector_cost,
> 	ampere1_vector_cost,
> 	generic_branch_cost,
> 	generic_tunings,
> 	cortexa35_tunings,
> 	cortexa53_tunings,
> 	cortexa57_tunings,
> 	cortexa72_tunings,
> 	cortexa73_tunings,
> 	exynosm1_tunings,
> 	thunderxt88_tunings,
> 	thunderx_tunings,
> 	tsv110_tunings,
> 	xgene1_tunings,
> 	emag_tunings,
> 	qdf24xx_tunings,
> 	saphira_tunings,
> 	thunderx2t99_tunings,
> 	thunderx3t110_tunings,
> 	neoversen1_tunings,
> 	ampere1_tunings,
> 	ampere1a_tunings,
> 	neoversev1_vector_cost,
> 	neoversev1_tunings,
> 	neoverse512tvb_vector_cost,
> 	neoverse512tvb_tunings,
> 	neoversen2_vector_cost,
> 	neoversen2_tunings,
> 	neoversev2_vector_cost,
> 	neoversev2_tunings
> 	a64fx_tunings): Split into own files.

I think the official way of writing this is

	* config/aarch64/aarch64.cc (generic_addrcost_table)
	(exynosm1_addrcost_table, xgene1_addrcost_table)
	(thunderx2t99_addrcost_table, thunderx3t110_addrcost_table)
         ...
	(a64fx_tunings): Split into own files.


> 	* config/aarch64/tuning_models/a64fx.h: New file.
> 	* config/aarch64/tuning_models/ampere1.h: New file.
> 	* config/aarch64/tuning_models/ampere1a.h: New file.
> 	* config/aarch64/tuning_models/cortexa35.h: New file.
> 	* config/aarch64/tuning_models/cortexa53.h: New file.
> 	* config/aarch64/tuning_models/cortexa57.h: New file.
> 	* config/aarch64/tuning_models/cortexa72.h: New file.
> 	* config/aarch64/tuning_models/cortexa73.h: New file.
> 	* config/aarch64/tuning_models/emag.h: New file.
> 	* config/aarch64/tuning_models/exynosm1.h: New file.
> 	* config/aarch64/tuning_models/generic.h: New file.
> 	* config/aarch64/tuning_models/neoverse512tvb.h: New file.
> 	* config/aarch64/tuning_models/neoversen1.h: New file.
> 	* config/aarch64/tuning_models/neoversen2.h: New file.
> 	* config/aarch64/tuning_models/neoversev1.h: New file.
> 	* config/aarch64/tuning_models/neoversev2.h: New file.
> 	* config/aarch64/tuning_models/qdf24xx.h: New file.
> 	* config/aarch64/tuning_models/saphira.h: New file.
> 	* config/aarch64/tuning_models/thunderx.h: New file.
> 	* config/aarch64/tuning_models/thunderx2t99.h: New file.
> 	* config/aarch64/tuning_models/thunderx3t110.h: New file.
> 	* config/aarch64/tuning_models/thunderxt88.h: New file.
> 	* config/aarch64/tuning_models/tsv110.h: New file.
> 	* config/aarch64/tuning_models/xgene1.h: New file.
> 

Otherwise, OK.

R.

> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 9fbfc548a891f5d11940c6fd3c49a14bfbdec886..07b1cde39209f5c7740e336b499e9aed31e4c515 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -354,2405 +354,30 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] =
>   };
>   
>   /* Tuning parameters.  */
> -
> -static const struct cpu_addrcost_table generic_addrcost_table =
> -{
> -    {
> -      1, /* hi  */
> -      0, /* si  */
> -      0, /* di  */
> -      1, /* ti  */
> -    },
> -  0, /* pre_modify  */
> -  0, /* post_modify  */
> -  0, /* post_modify_ld3_st3  */
> -  0, /* post_modify_ld4_st4  */
> -  0, /* register_offset  */
> -  0, /* register_sextend  */
> -  0, /* register_zextend  */
> -  0 /* imm_offset  */
> -};
> -
> -static const struct cpu_addrcost_table exynosm1_addrcost_table =
> -{
> -    {
> -      0, /* hi  */
> -      0, /* si  */
> -      0, /* di  */
> -      2, /* ti  */
> -    },
> -  0, /* pre_modify  */
> -  0, /* post_modify  */
> -  0, /* post_modify_ld3_st3  */
> -  0, /* post_modify_ld4_st4  */
> -  1, /* register_offset  */
> -  1, /* register_sextend  */
> -  2, /* register_zextend  */
> -  0, /* imm_offset  */
> -};
> -
> -static const struct cpu_addrcost_table xgene1_addrcost_table =
> -{
> -    {
> -      1, /* hi  */
> -      0, /* si  */
> -      0, /* di  */
> -      1, /* ti  */
> -    },
> -  1, /* pre_modify  */
> -  1, /* post_modify  */
> -  1, /* post_modify_ld3_st3  */
> -  1, /* post_modify_ld4_st4  */
> -  0, /* register_offset  */
> -  1, /* register_sextend  */
> -  1, /* register_zextend  */
> -  0, /* imm_offset  */
> -};
> -
> -static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
> -{
> -    {
> -      1, /* hi  */
> -      1, /* si  */
> -      1, /* di  */
> -      2, /* ti  */
> -    },
> -  0, /* pre_modify  */
> -  0, /* post_modify  */
> -  0, /* post_modify_ld3_st3  */
> -  0, /* post_modify_ld4_st4  */
> -  2, /* register_offset  */
> -  3, /* register_sextend  */
> -  3, /* register_zextend  */
> -  0, /* imm_offset  */
> -};
> -
> -static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
> -{
> -    {
> -      1, /* hi  */
> -      1, /* si  */
> -      1, /* di  */
> -      2, /* ti  */
> -    },
> -  0, /* pre_modify  */
> -  0, /* post_modify  */
> -  0, /* post_modify_ld3_st3  */
> -  0, /* post_modify_ld4_st4  */
> -  2, /* register_offset  */
> -  3, /* register_sextend  */
> -  3, /* register_zextend  */
> -  0, /* imm_offset  */
> -};
> -
> -static const struct cpu_addrcost_table tsv110_addrcost_table =
> -{
> -    {
> -      1, /* hi  */
> -      0, /* si  */
> -      0, /* di  */
> -      1, /* ti  */
> -    },
> -  0, /* pre_modify  */
> -  0, /* post_modify  */
> -  0, /* post_modify_ld3_st3  */
> -  0, /* post_modify_ld4_st4  */
> -  0, /* register_offset  */
> -  1, /* register_sextend  */
> -  1, /* register_zextend  */
> -  0, /* imm_offset  */
> -};
> -
> -static const struct cpu_addrcost_table qdf24xx_addrcost_table =
> -{
> -    {
> -      1, /* hi  */
> -      1, /* si  */
> -      1, /* di  */
> -      2, /* ti  */
> -    },
> -  1, /* pre_modify  */
> -  1, /* post_modify  */
> -  1, /* post_modify_ld3_st3  */
> -  1, /* post_modify_ld4_st4  */
> -  3, /* register_offset  */
> -  3, /* register_sextend  */
> -  3, /* register_zextend  */
> -  2, /* imm_offset  */
> -};
> -
> -static const struct cpu_addrcost_table a64fx_addrcost_table =
> -{
> -    {
> -      1, /* hi  */
> -      1, /* si  */
> -      1, /* di  */
> -      2, /* ti  */
> -    },
> -  0, /* pre_modify  */
> -  0, /* post_modify  */
> -  0, /* post_modify_ld3_st3  */
> -  0, /* post_modify_ld4_st4  */
> -  2, /* register_offset  */
> -  3, /* register_sextend  */
> -  3, /* register_zextend  */
> -  0, /* imm_offset  */
> -};
> -
> -static const struct cpu_addrcost_table neoversev1_addrcost_table =
> -{
> -    {
> -      1, /* hi  */
> -      0, /* si  */
> -      0, /* di  */
> -      1, /* ti  */
> -    },
> -  0, /* pre_modify  */
> -  0, /* post_modify  */
> -  3, /* post_modify_ld3_st3  */
> -  3, /* post_modify_ld4_st4  */
> -  0, /* register_offset  */
> -  0, /* register_sextend  */
> -  0, /* register_zextend  */
> -  0 /* imm_offset  */
> -};
> -
> -static const struct cpu_addrcost_table neoversen2_addrcost_table =
> -{
> -    {
> -      1, /* hi  */
> -      0, /* si  */
> -      0, /* di  */
> -      1, /* ti  */
> -    },
> -  0, /* pre_modify  */
> -  0, /* post_modify  */
> -  2, /* post_modify_ld3_st3  */
> -  2, /* post_modify_ld4_st4  */
> -  0, /* register_offset  */
> -  0, /* register_sextend  */
> -  0, /* register_zextend  */
> -  0 /* imm_offset  */
> -};
> -
> -static const struct cpu_addrcost_table neoversev2_addrcost_table =
> -{
> -    {
> -      1, /* hi  */
> -      0, /* si  */
> -      0, /* di  */
> -      1, /* ti  */
> -    },
> -  0, /* pre_modify  */
> -  0, /* post_modify  */
> -  2, /* post_modify_ld3_st3  */
> -  2, /* post_modify_ld4_st4  */
> -  0, /* register_offset  */
> -  0, /* register_sextend  */
> -  0, /* register_zextend  */
> -  0 /* imm_offset  */
> -};
> -
> -static const struct cpu_regmove_cost generic_regmove_cost =
> -{
> -  1, /* GP2GP  */
> -  /* Avoid the use of slow int<->fp moves for spilling by setting
> -     their cost higher than memmov_cost.  */
> -  5, /* GP2FP  */
> -  5, /* FP2GP  */
> -  2 /* FP2FP  */
> -};
> -
> -static const struct cpu_regmove_cost cortexa57_regmove_cost =
> -{
> -  1, /* GP2GP  */
> -  /* Avoid the use of slow int<->fp moves for spilling by setting
> -     their cost higher than memmov_cost.  */
> -  5, /* GP2FP  */
> -  5, /* FP2GP  */
> -  2 /* FP2FP  */
> -};
> -
> -static const struct cpu_regmove_cost cortexa53_regmove_cost =
> -{
> -  1, /* GP2GP  */
> -  /* Avoid the use of slow int<->fp moves for spilling by setting
> -     their cost higher than memmov_cost.  */
> -  5, /* GP2FP  */
> -  5, /* FP2GP  */
> -  2 /* FP2FP  */
> -};
> -
> -static const struct cpu_regmove_cost exynosm1_regmove_cost =
> -{
> -  1, /* GP2GP  */
> -  /* Avoid the use of slow int<->fp moves for spilling by setting
> -     their cost higher than memmov_cost (actual, 4 and 9).  */
> -  9, /* GP2FP  */
> -  9, /* FP2GP  */
> -  1 /* FP2FP  */
> -};
> -
> -static const struct cpu_regmove_cost thunderx_regmove_cost =
> -{
> -  2, /* GP2GP  */
> -  2, /* GP2FP  */
> -  6, /* FP2GP  */
> -  4 /* FP2FP  */
> -};
> -
> -static const struct cpu_regmove_cost xgene1_regmove_cost =
> -{
> -  1, /* GP2GP  */
> -  /* Avoid the use of slow int<->fp moves for spilling by setting
> -     their cost higher than memmov_cost.  */
> -  8, /* GP2FP  */
> -  8, /* FP2GP  */
> -  2 /* FP2FP  */
> -};
> -
> -static const struct cpu_regmove_cost qdf24xx_regmove_cost =
> -{
> -  2, /* GP2GP  */
> -  /* Avoid the use of int<->fp moves for spilling.  */
> -  6, /* GP2FP  */
> -  6, /* FP2GP  */
> -  4 /* FP2FP  */
> -};
> -
> -static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
> -{
> -  1, /* GP2GP  */
> -  /* Avoid the use of int<->fp moves for spilling.  */
> -  5, /* GP2FP  */
> -  6, /* FP2GP  */
> -  3, /* FP2FP  */
> -};
> -
> -static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
> -{
> -  1, /* GP2GP  */
> -  /* Avoid the use of int<->fp moves for spilling.  */
> -  4, /* GP2FP  */
> -  5, /* FP2GP  */
> -  4  /* FP2FP  */
> -};
> -
> -static const struct cpu_regmove_cost tsv110_regmove_cost =
> -{
> -  1, /* GP2GP  */
> -  /* Avoid the use of slow int<->fp moves for spilling by setting
> -     their cost higher than memmov_cost.  */
> -  2, /* GP2FP  */
> -  3, /* FP2GP  */
> -  2  /* FP2FP  */
> -};
> -
> -static const struct cpu_regmove_cost a64fx_regmove_cost =
> -{
> -  1, /* GP2GP  */
> -  /* Avoid the use of slow int<->fp moves for spilling by setting
> -     their cost higher than memmov_cost.  */
> -  5, /* GP2FP  */
> -  7, /* FP2GP  */
> -  2 /* FP2FP  */
> -};
> -
> -static const struct cpu_regmove_cost neoversen2_regmove_cost =
> -{
> -  1, /* GP2GP  */
> -  /* Spilling to int<->fp instead of memory is recommended so set
> -     realistic costs compared to memmov_cost.  */
> -  3, /* GP2FP  */
> -  2, /* FP2GP  */
> -  2 /* FP2FP  */
> -};
> -
> -static const struct cpu_regmove_cost neoversev1_regmove_cost =
> -{
> -  1, /* GP2GP  */
> -  /* Spilling to int<->fp instead of memory is recommended so set
> -     realistic costs compared to memmov_cost.  */
> -  3, /* GP2FP  */
> -  2, /* FP2GP  */
> -  2 /* FP2FP  */
> -};
> -
> -static const struct cpu_regmove_cost neoversev2_regmove_cost =
> -{
> -  1, /* GP2GP  */
> -  /* Spilling to int<->fp instead of memory is recommended so set
> -     realistic costs compared to memmov_cost.  */
> -  3, /* GP2FP  */
> -  2, /* FP2GP  */
> -  2 /* FP2FP  */
> -};
> -
> -/* Generic costs for Advanced SIMD vector operations.   */
> -static const advsimd_vec_cost generic_advsimd_vector_cost =
> -{
> -  1, /* int_stmt_cost  */
> -  1, /* fp_stmt_cost  */
> -  0, /* ld2_st2_permute_cost  */
> -  0, /* ld3_st3_permute_cost  */
> -  0, /* ld4_st4_permute_cost  */
> -  2, /* permute_cost  */
> -  2, /* reduc_i8_cost  */
> -  2, /* reduc_i16_cost  */
> -  2, /* reduc_i32_cost  */
> -  2, /* reduc_i64_cost  */
> -  2, /* reduc_f16_cost  */
> -  2, /* reduc_f32_cost  */
> -  2, /* reduc_f64_cost  */
> -  2, /* store_elt_extra_cost  */
> -  2, /* vec_to_scalar_cost  */
> -  1, /* scalar_to_vec_cost  */
> -  1, /* align_load_cost  */
> -  1, /* unalign_load_cost  */
> -  1, /* unalign_store_cost  */
> -  1  /* store_cost  */
> -};
> -
> -/* Generic costs for SVE vector operations.  */
> -static const sve_vec_cost generic_sve_vector_cost =
> -{
> -  {
> -    1, /* int_stmt_cost  */
> -    1, /* fp_stmt_cost  */
> -    0, /* ld2_st2_permute_cost  */
> -    0, /* ld3_st3_permute_cost  */
> -    0, /* ld4_st4_permute_cost  */
> -    2, /* permute_cost  */
> -    2, /* reduc_i8_cost  */
> -    2, /* reduc_i16_cost  */
> -    2, /* reduc_i32_cost  */
> -    2, /* reduc_i64_cost  */
> -    2, /* reduc_f16_cost  */
> -    2, /* reduc_f32_cost  */
> -    2, /* reduc_f64_cost  */
> -    2, /* store_elt_extra_cost  */
> -    2, /* vec_to_scalar_cost  */
> -    1, /* scalar_to_vec_cost  */
> -    1, /* align_load_cost  */
> -    1, /* unalign_load_cost  */
> -    1, /* unalign_store_cost  */
> -    1  /* store_cost  */
> -  },
> -  2, /* clast_cost  */
> -  2, /* fadda_f16_cost  */
> -  2, /* fadda_f32_cost  */
> -  2, /* fadda_f64_cost  */
> -  4, /* gather_load_x32_cost  */
> -  2, /* gather_load_x64_cost  */
> -  1 /* scatter_store_elt_cost  */
> -};
> -
> -/* Generic costs for vector insn classes.  */
> -static const struct cpu_vector_cost generic_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  1, /* scalar_fp_stmt_cost  */
> -  1, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  3, /* cond_taken_branch_cost  */
> -  1, /* cond_not_taken_branch_cost  */
> -  &generic_advsimd_vector_cost, /* advsimd  */
> -  &generic_sve_vector_cost, /* sve */
> -  nullptr /* issue_info  */
> -};
> -
> -static const advsimd_vec_cost a64fx_advsimd_vector_cost =
> -{
> -  2, /* int_stmt_cost  */
> -  5, /* fp_stmt_cost  */
> -  0, /* ld2_st2_permute_cost  */
> -  0, /* ld3_st3_permute_cost  */
> -  0, /* ld4_st4_permute_cost  */
> -  3, /* permute_cost  */
> -  13, /* reduc_i8_cost  */
> -  13, /* reduc_i16_cost  */
> -  13, /* reduc_i32_cost  */
> -  13, /* reduc_i64_cost  */
> -  13, /* reduc_f16_cost  */
> -  13, /* reduc_f32_cost  */
> -  13, /* reduc_f64_cost  */
> -  13, /* store_elt_extra_cost  */
> -  13, /* vec_to_scalar_cost  */
> -  4, /* scalar_to_vec_cost  */
> -  6, /* align_load_cost  */
> -  6, /* unalign_load_cost  */
> -  1, /* unalign_store_cost  */
> -  1  /* store_cost  */
> -};
> -
> -static const sve_vec_cost a64fx_sve_vector_cost =
> -{
> -  {
> -    2, /* int_stmt_cost  */
> -    5, /* fp_stmt_cost  */
> -    0, /* ld2_st2_permute_cost  */
> -    0, /* ld3_st3_permute_cost  */
> -    0, /* ld4_st4_permute_cost  */
> -    3, /* permute_cost  */
> -    13, /* reduc_i8_cost  */
> -    13, /* reduc_i16_cost  */
> -    13, /* reduc_i32_cost  */
> -    13, /* reduc_i64_cost  */
> -    13, /* reduc_f16_cost  */
> -    13, /* reduc_f32_cost  */
> -    13, /* reduc_f64_cost  */
> -    13, /* store_elt_extra_cost  */
> -    13, /* vec_to_scalar_cost  */
> -    4, /* scalar_to_vec_cost  */
> -    6, /* align_load_cost  */
> -    6, /* unalign_load_cost  */
> -    1, /* unalign_store_cost  */
> -    1  /* store_cost  */
> -  },
> -  13, /* clast_cost  */
> -  13, /* fadda_f16_cost  */
> -  13, /* fadda_f32_cost  */
> -  13, /* fadda_f64_cost  */
> -  64, /* gather_load_x32_cost  */
> -  32, /* gather_load_x64_cost  */
> -  1 /* scatter_store_elt_cost  */
> -};
> -
> -static const struct cpu_vector_cost a64fx_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  5, /* scalar_fp_stmt_cost  */
> -  4, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  3, /* cond_taken_branch_cost  */
> -  1, /* cond_not_taken_branch_cost  */
> -  &a64fx_advsimd_vector_cost, /* advsimd  */
> -  &a64fx_sve_vector_cost, /* sve  */
> -  nullptr /* issue_info  */
> -};
> -
> -static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
> -{
> -  1, /* int_stmt_cost  */
> -  3, /* fp_stmt_cost  */
> -  0, /* ld2_st2_permute_cost  */
> -  0, /* ld3_st3_permute_cost  */
> -  0, /* ld4_st4_permute_cost  */
> -  2, /* permute_cost  */
> -  1, /* reduc_i8_cost  */
> -  1, /* reduc_i16_cost  */
> -  1, /* reduc_i32_cost  */
> -  1, /* reduc_i64_cost  */
> -  1, /* reduc_f16_cost  */
> -  1, /* reduc_f32_cost  */
> -  1, /* reduc_f64_cost  */
> -  1, /* store_elt_extra_cost  */
> -  1, /* vec_to_scalar_cost  */
> -  1, /* scalar_to_vec_cost  */
> -  1, /* align_load_cost  */
> -  1, /* unalign_load_cost  */
> -  1, /* unalign_store_cost  */
> -  1  /* store_cost  */
> -};
> -
> -/* QDF24XX costs for vector insn classes.  */
> -static const struct cpu_vector_cost qdf24xx_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  1, /* scalar_fp_stmt_cost  */
> -  1, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  3, /* cond_taken_branch_cost  */
> -  1, /* cond_not_taken_branch_cost  */
> -  &qdf24xx_advsimd_vector_cost, /* advsimd  */
> -  nullptr, /* sve  */
> -  nullptr /* issue_info  */
> -};
> -
> -
> -static const advsimd_vec_cost thunderx_advsimd_vector_cost =
> -{
> -  4, /* int_stmt_cost  */
> -  1, /* fp_stmt_cost  */
> -  0, /* ld2_st2_permute_cost  */
> -  0, /* ld3_st3_permute_cost  */
> -  0, /* ld4_st4_permute_cost  */
> -  4, /* permute_cost  */
> -  2, /* reduc_i8_cost  */
> -  2, /* reduc_i16_cost  */
> -  2, /* reduc_i32_cost  */
> -  2, /* reduc_i64_cost  */
> -  2, /* reduc_f16_cost  */
> -  2, /* reduc_f32_cost  */
> -  2, /* reduc_f64_cost  */
> -  2, /* store_elt_extra_cost  */
> -  2, /* vec_to_scalar_cost  */
> -  2, /* scalar_to_vec_cost  */
> -  3, /* align_load_cost  */
> -  5, /* unalign_load_cost  */
> -  5, /* unalign_store_cost  */
> -  1  /* store_cost  */
> -};
> -
> -/* ThunderX costs for vector insn classes.  */
> -static const struct cpu_vector_cost thunderx_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  1, /* scalar_fp_stmt_cost  */
> -  3, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  3, /* cond_taken_branch_cost  */
> -  3, /* cond_not_taken_branch_cost  */
> -  &thunderx_advsimd_vector_cost, /* advsimd  */
> -  nullptr, /* sve  */
> -  nullptr /* issue_info  */
> -};
> -
> -static const advsimd_vec_cost tsv110_advsimd_vector_cost =
> -{
> -  2, /* int_stmt_cost  */
> -  2, /* fp_stmt_cost  */
> -  0, /* ld2_st2_permute_cost  */
> -  0, /* ld3_st3_permute_cost  */
> -  0, /* ld4_st4_permute_cost  */
> -  2, /* permute_cost  */
> -  3, /* reduc_i8_cost  */
> -  3, /* reduc_i16_cost  */
> -  3, /* reduc_i32_cost  */
> -  3, /* reduc_i64_cost  */
> -  3, /* reduc_f16_cost  */
> -  3, /* reduc_f32_cost  */
> -  3, /* reduc_f64_cost  */
> -  3, /* store_elt_extra_cost  */
> -  3, /* vec_to_scalar_cost  */
> -  2, /* scalar_to_vec_cost  */
> -  5, /* align_load_cost  */
> -  5, /* unalign_load_cost  */
> -  1, /* unalign_store_cost  */
> -  1  /* store_cost  */
> -};
> -
> -static const struct cpu_vector_cost tsv110_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  1, /* scalar_fp_stmt_cost  */
> -  5, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  1, /* cond_taken_branch_cost  */
> -  1, /* cond_not_taken_branch_cost  */
> -  &tsv110_advsimd_vector_cost, /* advsimd  */
> -  nullptr, /* sve  */
> -  nullptr /* issue_info  */
> -};
> -
> -static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
> -{
> -  2, /* int_stmt_cost  */
> -  2, /* fp_stmt_cost  */
> -  0, /* ld2_st2_permute_cost  */
> -  0, /* ld3_st3_permute_cost  */
> -  0, /* ld4_st4_permute_cost  */
> -  3, /* permute_cost  */
> -  8, /* reduc_i8_cost  */
> -  8, /* reduc_i16_cost  */
> -  8, /* reduc_i32_cost  */
> -  8, /* reduc_i64_cost  */
> -  8, /* reduc_f16_cost  */
> -  8, /* reduc_f32_cost  */
> -  8, /* reduc_f64_cost  */
> -  8, /* store_elt_extra_cost  */
> -  8, /* vec_to_scalar_cost  */
> -  8, /* scalar_to_vec_cost  */
> -  4, /* align_load_cost  */
> -  4, /* unalign_load_cost  */
> -  1, /* unalign_store_cost  */
> -  1  /* store_cost  */
> -};
> -
> -/* Cortex-A57 costs for vector insn classes.  */
> -static const struct cpu_vector_cost cortexa57_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  1, /* scalar_fp_stmt_cost  */
> -  4, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  1, /* cond_taken_branch_cost  */
> -  1, /* cond_not_taken_branch_cost  */
> -  &cortexa57_advsimd_vector_cost, /* advsimd  */
> -  nullptr, /* sve  */
> -  nullptr /* issue_info  */
> -};
> -
> -static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
> -{
> -  3, /* int_stmt_cost  */
> -  3, /* fp_stmt_cost  */
> -  0, /* ld2_st2_permute_cost  */
> -  0, /* ld3_st3_permute_cost  */
> -  0, /* ld4_st4_permute_cost  */
> -  3, /* permute_cost  */
> -  3, /* reduc_i8_cost  */
> -  3, /* reduc_i16_cost  */
> -  3, /* reduc_i32_cost  */
> -  3, /* reduc_i64_cost  */
> -  3, /* reduc_f16_cost  */
> -  3, /* reduc_f32_cost  */
> -  3, /* reduc_f64_cost  */
> -  3, /* store_elt_extra_cost  */
> -  3, /* vec_to_scalar_cost  */
> -  3, /* scalar_to_vec_cost  */
> -  5, /* align_load_cost  */
> -  5, /* unalign_load_cost  */
> -  1, /* unalign_store_cost  */
> -  1  /* store_cost  */
> -};
> -
> -static const struct cpu_vector_cost exynosm1_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  1, /* scalar_fp_stmt_cost  */
> -  5, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  1, /* cond_taken_branch_cost  */
> -  1, /* cond_not_taken_branch_cost  */
> -  &exynosm1_advsimd_vector_cost, /* advsimd  */
> -  nullptr, /* sve  */
> -  nullptr /* issue_info  */
> -};
> -
> -static const advsimd_vec_cost xgene1_advsimd_vector_cost =
> -{
> -  2, /* int_stmt_cost  */
> -  2, /* fp_stmt_cost  */
> -  0, /* ld2_st2_permute_cost  */
> -  0, /* ld3_st3_permute_cost  */
> -  0, /* ld4_st4_permute_cost  */
> -  2, /* permute_cost  */
> -  4, /* reduc_i8_cost  */
> -  4, /* reduc_i16_cost  */
> -  4, /* reduc_i32_cost  */
> -  4, /* reduc_i64_cost  */
> -  4, /* reduc_f16_cost  */
> -  4, /* reduc_f32_cost  */
> -  4, /* reduc_f64_cost  */
> -  4, /* store_elt_extra_cost  */
> -  4, /* vec_to_scalar_cost  */
> -  4, /* scalar_to_vec_cost  */
> -  10, /* align_load_cost  */
> -  10, /* unalign_load_cost  */
> -  2, /* unalign_store_cost  */
> -  2  /* store_cost  */
> -};
> -
> -/* Generic costs for vector insn classes.  */
> -static const struct cpu_vector_cost xgene1_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  1, /* scalar_fp_stmt_cost  */
> -  5, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  2, /* cond_taken_branch_cost  */
> -  1, /* cond_not_taken_branch_cost  */
> -  &xgene1_advsimd_vector_cost, /* advsimd  */
> -  nullptr, /* sve  */
> -  nullptr /* issue_info  */
> -};
> -
> -static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
> -{
> -  4, /* int_stmt_cost  */
> -  5, /* fp_stmt_cost  */
> -  0, /* ld2_st2_permute_cost  */
> -  0, /* ld3_st3_permute_cost  */
> -  0, /* ld4_st4_permute_cost  */
> -  10, /* permute_cost  */
> -  6, /* reduc_i8_cost  */
> -  6, /* reduc_i16_cost  */
> -  6, /* reduc_i32_cost  */
> -  6, /* reduc_i64_cost  */
> -  6, /* reduc_f16_cost  */
> -  6, /* reduc_f32_cost  */
> -  6, /* reduc_f64_cost  */
> -  6, /* store_elt_extra_cost  */
> -  6, /* vec_to_scalar_cost  */
> -  5, /* scalar_to_vec_cost  */
> -  4, /* align_load_cost  */
> -  4, /* unalign_load_cost  */
> -  1, /* unalign_store_cost  */
> -  1  /* store_cost  */
> -};
> -
> -/* Costs for vector insn classes for Vulcan.  */
> -static const struct cpu_vector_cost thunderx2t99_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  6, /* scalar_fp_stmt_cost  */
> -  4, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  2, /* cond_taken_branch_cost  */
> -  1,  /* cond_not_taken_branch_cost  */
> -  &thunderx2t99_advsimd_vector_cost, /* advsimd  */
> -  nullptr, /* sve  */
> -  nullptr /* issue_info  */
> -};
> -
> -static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
> -{
> -  5, /* int_stmt_cost  */
> -  5, /* fp_stmt_cost  */
> -  0, /* ld2_st2_permute_cost  */
> -  0, /* ld3_st3_permute_cost  */
> -  0, /* ld4_st4_permute_cost  */
> -  10, /* permute_cost  */
> -  5, /* reduc_i8_cost  */
> -  5, /* reduc_i16_cost  */
> -  5, /* reduc_i32_cost  */
> -  5, /* reduc_i64_cost  */
> -  5, /* reduc_f16_cost  */
> -  5, /* reduc_f32_cost  */
> -  5, /* reduc_f64_cost  */
> -  5, /* store_elt_extra_cost  */
> -  5, /* vec_to_scalar_cost  */
> -  5, /* scalar_to_vec_cost  */
> -  4, /* align_load_cost  */
> -  4, /* unalign_load_cost  */
> -  4, /* unalign_store_cost  */
> -  4  /* store_cost  */
> -};
> -
> -static const struct cpu_vector_cost thunderx3t110_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  5, /* scalar_fp_stmt_cost  */
> -  4, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  2, /* cond_taken_branch_cost  */
> -  1,  /* cond_not_taken_branch_cost  */
> -  &thunderx3t110_advsimd_vector_cost, /* advsimd  */
> -  nullptr, /* sve  */
> -  nullptr /* issue_info  */
> -};
> -
> -static const advsimd_vec_cost ampere1_advsimd_vector_cost =
> -{
> -  1, /* int_stmt_cost  */
> -  3, /* fp_stmt_cost  */
> -  0, /* ld2_st2_permute_cost  */
> -  0, /* ld3_st3_permute_cost  */
> -  0, /* ld4_st4_permute_cost  */
> -  2, /* permute_cost  */
> -  12, /* reduc_i8_cost  */
> -  9, /* reduc_i16_cost  */
> -  6, /* reduc_i32_cost  */
> -  5, /* reduc_i64_cost  */
> -  9, /* reduc_f16_cost  */
> -  6, /* reduc_f32_cost  */
> -  5, /* reduc_f64_cost  */
> -  8, /* store_elt_extra_cost  */
> -  6, /* vec_to_scalar_cost  */
> -  7, /* scalar_to_vec_cost  */
> -  4, /* align_load_cost  */
> -  4, /* unalign_load_cost  */
> -  1, /* unalign_store_cost  */
> -  1  /* store_cost  */
> -};
> -
> -/* Ampere-1 costs for vector insn classes.  */
> -static const struct cpu_vector_cost ampere1_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  3, /* scalar_fp_stmt_cost  */
> -  4, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  1, /* cond_taken_branch_cost  */
> -  1, /* cond_not_taken_branch_cost  */
> -  &ampere1_advsimd_vector_cost, /* advsimd  */
> -  nullptr, /* sve  */
> -  nullptr  /* issue_info  */
> -};
> -
> -/* Generic costs for branch instructions.  */
> -static const struct cpu_branch_cost generic_branch_cost =
> -{
> -  1,  /* Predictable.  */
> -  3   /* Unpredictable.  */
> -};
> -
> -/* Generic approximation modes.  */
> -static const cpu_approx_modes generic_approx_modes =
> -{
> -  AARCH64_APPROX_NONE,	/* division  */
> -  AARCH64_APPROX_NONE,	/* sqrt  */
> -  AARCH64_APPROX_NONE	/* recip_sqrt  */
> -};
> -
> -/* Approximation modes for Exynos M1.  */
> -static const cpu_approx_modes exynosm1_approx_modes =
> -{
> -  AARCH64_APPROX_NONE,	/* division  */
> -  AARCH64_APPROX_ALL,	/* sqrt  */
> -  AARCH64_APPROX_ALL	/* recip_sqrt  */
> -};
> -
> -/* Approximation modes for X-Gene 1.  */
> -static const cpu_approx_modes xgene1_approx_modes =
> -{
> -  AARCH64_APPROX_NONE,	/* division  */
> -  AARCH64_APPROX_NONE,	/* sqrt  */
> -  AARCH64_APPROX_ALL	/* recip_sqrt  */
> -};
> -
> -/* Generic prefetch settings (which disable prefetch).  */
> -static const cpu_prefetch_tune generic_prefetch_tune =
> -{
> -  0,			/* num_slots  */
> -  -1,			/* l1_cache_size  */
> -  -1,			/* l1_cache_line_size  */
> -  -1,			/* l2_cache_size  */
> -  true,			/* prefetch_dynamic_strides */
> -  -1,			/* minimum_stride */
> -  -1			/* default_opt_level  */
> -};
> -
> -static const cpu_prefetch_tune exynosm1_prefetch_tune =
> -{
> -  0,			/* num_slots  */
> -  -1,			/* l1_cache_size  */
> -  64,			/* l1_cache_line_size  */
> -  -1,			/* l2_cache_size  */
> -  true,			/* prefetch_dynamic_strides */
> -  -1,			/* minimum_stride */
> -  -1			/* default_opt_level  */
> -};
> -
> -static const cpu_prefetch_tune qdf24xx_prefetch_tune =
> -{
> -  4,			/* num_slots  */
> -  32,			/* l1_cache_size  */
> -  64,			/* l1_cache_line_size  */
> -  512,			/* l2_cache_size  */
> -  false,		/* prefetch_dynamic_strides */
> -  2048,			/* minimum_stride */
> -  3			/* default_opt_level  */
> -};
> -
> -static const cpu_prefetch_tune thunderxt88_prefetch_tune =
> -{
> -  8,			/* num_slots  */
> -  32,			/* l1_cache_size  */
> -  128,			/* l1_cache_line_size  */
> -  16*1024,		/* l2_cache_size  */
> -  true,			/* prefetch_dynamic_strides */
> -  -1,			/* minimum_stride */
> -  3			/* default_opt_level  */
> -};
> -
> -static const cpu_prefetch_tune thunderx_prefetch_tune =
> -{
> -  8,			/* num_slots  */
> -  32,			/* l1_cache_size  */
> -  128,			/* l1_cache_line_size  */
> -  -1,			/* l2_cache_size  */
> -  true,			/* prefetch_dynamic_strides */
> -  -1,			/* minimum_stride */
> -  -1			/* default_opt_level  */
> -};
> -
> -static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
> -{
> -  8,			/* num_slots  */
> -  32,			/* l1_cache_size  */
> -  64,			/* l1_cache_line_size  */
> -  256,			/* l2_cache_size  */
> -  true,			/* prefetch_dynamic_strides */
> -  -1,			/* minimum_stride */
> -  -1			/* default_opt_level  */
> -};
> -
> -static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
> -{
> -  8,			/* num_slots  */
> -  32,			/* l1_cache_size  */
> -  64,			/* l1_cache_line_size  */
> -  256,			/* l2_cache_size  */
> -  true,			/* prefetch_dynamic_strides */
> -  -1,			/* minimum_stride */
> -  -1			/* default_opt_level  */
> -};
> -
> -static const cpu_prefetch_tune tsv110_prefetch_tune =
> -{
> -  0,                    /* num_slots  */
> -  64,                   /* l1_cache_size  */
> -  64,                   /* l1_cache_line_size  */
> -  512,                  /* l2_cache_size  */
> -  true,                 /* prefetch_dynamic_strides */
> -  -1,                   /* minimum_stride */
> -  -1                    /* default_opt_level  */
> -};
> -
> -static const cpu_prefetch_tune xgene1_prefetch_tune =
> -{
> -  8,			/* num_slots  */
> -  32,			/* l1_cache_size  */
> -  64,			/* l1_cache_line_size  */
> -  256,			/* l2_cache_size  */
> -  true,                 /* prefetch_dynamic_strides */
> -  -1,                   /* minimum_stride */
> -  -1			/* default_opt_level  */
> -};
> -
> -static const cpu_prefetch_tune a64fx_prefetch_tune =
> -{
> -  8,			/* num_slots  */
> -  64,			/* l1_cache_size  */
> -  256,			/* l1_cache_line_size  */
> -  32768,		/* l2_cache_size  */
> -  true,			/* prefetch_dynamic_strides */
> -  -1,			/* minimum_stride */
> -  -1			/* default_opt_level  */
> -};
> -
> -static const cpu_prefetch_tune ampere1_prefetch_tune =
> -{
> -  0,			/* num_slots  */
> -  64,			/* l1_cache_size  */
> -  64,			/* l1_cache_line_size  */
> -  2048,			/* l2_cache_size  */
> -  true,			/* prefetch_dynamic_strides */
> -  -1,			/* minimum_stride */
> -  -1			/* default_opt_level  */
> -};
> -
> -static const struct tune_params generic_tunings =
> -{
> -  &cortexa57_extra_costs,
> -  &generic_addrcost_table,
> -  &generic_regmove_cost,
> -  &generic_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  2, /* issue_rate  */
> -  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
> -  "16:12",	/* function_align.  */
> -  "4",	/* jump_align.  */
> -  "8",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  1,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
> -     Neoverse V1.  It does not have a noticeable effect on A64FX and should
> -     have at most a very minor effect on SVE2 cores.  */
> -  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS),	/* tune_flags.  */
> -  &generic_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params cortexa35_tunings =
> -{
> -  &cortexa53_extra_costs,
> -  &generic_addrcost_table,
> -  &cortexa53_regmove_cost,
> -  &generic_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  1, /* issue_rate  */
> -  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> -   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
> -  "16",	/* function_align.  */
> -  "4",	/* jump_align.  */
> -  "8",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  1,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> -  &generic_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params cortexa53_tunings =
> -{
> -  &cortexa53_extra_costs,
> -  &generic_addrcost_table,
> -  &cortexa53_regmove_cost,
> -  &generic_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  2, /* issue_rate  */
> -  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> -   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
> -  "16",	/* function_align.  */
> -  "4",	/* jump_align.  */
> -  "8",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  1,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> -  &generic_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params cortexa57_tunings =
> -{
> -  &cortexa57_extra_costs,
> -  &generic_addrcost_table,
> -  &cortexa57_regmove_cost,
> -  &cortexa57_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  3, /* issue_rate  */
> -  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> -   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
> -  "16",	/* function_align.  */
> -  "4",	/* jump_align.  */
> -  "8",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  1,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
> -  &generic_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params cortexa72_tunings =
> -{
> -  &cortexa57_extra_costs,
> -  &generic_addrcost_table,
> -  &cortexa57_regmove_cost,
> -  &cortexa57_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  3, /* issue_rate  */
> -  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> -   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
> -  "16",	/* function_align.  */
> -  "4",	/* jump_align.  */
> -  "8",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  1,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> -  &generic_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params cortexa73_tunings =
> -{
> -  &cortexa57_extra_costs,
> -  &generic_addrcost_table,
> -  &cortexa57_regmove_cost,
> -  &cortexa57_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  2, /* issue_rate.  */
> -  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> -   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
> -  "16",	/* function_align.  */
> -  "4",	/* jump_align.  */
> -  "8",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  1,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> -  &generic_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params exynosm1_tunings =
> -{
> -  &exynosm1_extra_costs,
> -  &exynosm1_addrcost_table,
> -  &exynosm1_regmove_cost,
> -  &exynosm1_vector_cost,
> -  &generic_branch_cost,
> -  &exynosm1_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  3,	/* issue_rate  */
> -  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
> -  "4",	/* function_align.  */
> -  "4",	/* jump_align.  */
> -  "4",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  1,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  48,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
> -  &exynosm1_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params thunderxt88_tunings =
> -{
> -  &thunderx_extra_costs,
> -  &generic_addrcost_table,
> -  &thunderx_regmove_cost,
> -  &thunderx_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 6, /* load_int.  */
> -    6, /* store_int.  */
> -    6, /* load_fp.  */
> -    6, /* store_fp.  */
> -    6, /* load_pred.  */
> -    6 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  2, /* issue_rate  */
> -  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
> -  "8",	/* function_align.  */
> -  "8",	/* jump_align.  */
> -  "8",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  1,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> -  &thunderxt88_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params thunderx_tunings =
> -{
> -  &thunderx_extra_costs,
> -  &generic_addrcost_table,
> -  &thunderx_regmove_cost,
> -  &thunderx_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 6, /* load_int.  */
> -    6, /* store_int.  */
> -    6, /* load_fp.  */
> -    6, /* store_fp.  */
> -    6, /* load_pred.  */
> -    6 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  2, /* issue_rate  */
> -  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
> -  "8",	/* function_align.  */
> -  "8",	/* jump_align.  */
> -  "8",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  1,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
> -  &thunderx_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params tsv110_tunings =
> -{
> -  &tsv110_extra_costs,
> -  &tsv110_addrcost_table,
> -  &tsv110_regmove_cost,
> -  &tsv110_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  4,    /* issue_rate  */
> -  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
> -   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
> -  "16", /* function_align.  */
> -  "4",  /* jump_align.  */
> -  "8",  /* loop_align.  */
> -  2,    /* int_reassoc_width.  */
> -  4,    /* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  1,    /* vec_reassoc_width.  */
> -  2,    /* min_div_recip_mul_sf.  */
> -  2,    /* min_div_recip_mul_df.  */
> -  0,    /* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
> -  &tsv110_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params xgene1_tunings =
> -{
> -  &xgene1_extra_costs,
> -  &xgene1_addrcost_table,
> -  &xgene1_regmove_cost,
> -  &xgene1_vector_cost,
> -  &generic_branch_cost,
> -  &xgene1_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 6, /* load_int.  */
> -    6, /* store_int.  */
> -    6, /* load_fp.  */
> -    6, /* store_fp.  */
> -    6, /* load_pred.  */
> -    6 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  4, /* issue_rate  */
> -  AARCH64_FUSE_NOTHING, /* fusible_ops  */
> -  "16",	/* function_align.  */
> -  "16",	/* jump_align.  */
> -  "16",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  1,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  17,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
> -  &xgene1_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params emag_tunings =
> -{
> -  &xgene1_extra_costs,
> -  &xgene1_addrcost_table,
> -  &xgene1_regmove_cost,
> -  &xgene1_vector_cost,
> -  &generic_branch_cost,
> -  &xgene1_approx_modes,
> -  SVE_NOT_IMPLEMENTED,
> -  { 6, /* load_int.  */
> -    6, /* store_int.  */
> -    6, /* load_fp.  */
> -    6, /* store_fp.  */
> -    6, /* load_pred.  */
> -    6 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  4, /* issue_rate  */
> -  AARCH64_FUSE_NOTHING, /* fusible_ops  */
> -  "16",	/* function_align.  */
> -  "16",	/* jump_align.  */
> -  "16",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  1,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  17,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
> -  &xgene1_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params qdf24xx_tunings =
> -{
> -  &qdf24xx_extra_costs,
> -  &qdf24xx_addrcost_table,
> -  &qdf24xx_regmove_cost,
> -  &qdf24xx_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  4, /* issue_rate  */
> -  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> -   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
> -  "16",	/* function_align.  */
> -  "8",	/* jump_align.  */
> -  "16",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  1,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
> -  &qdf24xx_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
> -   for now.  */
> -static const struct tune_params saphira_tunings =
> -{
> -  &generic_extra_costs,
> -  &generic_addrcost_table,
> -  &generic_regmove_cost,
> -  &generic_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  4, /* issue_rate  */
> -  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> -   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
> -  "16",	/* function_align.  */
> -  "8",	/* jump_align.  */
> -  "16",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  1,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
> -  &generic_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params thunderx2t99_tunings =
> -{
> -  &thunderx2t99_extra_costs,
> -  &thunderx2t99_addrcost_table,
> -  &thunderx2t99_regmove_cost,
> -  &thunderx2t99_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  4, /* issue_rate.  */
> -  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
> -   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
> -  "16",	/* function_align.  */
> -  "8",	/* jump_align.  */
> -  "16",	/* loop_align.  */
> -  3,	/* int_reassoc_width.  */
> -  2,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  2,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> -  &thunderx2t99_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params thunderx3t110_tunings =
> -{
> -  &thunderx3t110_extra_costs,
> -  &thunderx3t110_addrcost_table,
> -  &thunderx3t110_regmove_cost,
> -  &thunderx3t110_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  6, /* issue_rate.  */
> -  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
> -   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
> -  "16",	/* function_align.  */
> -  "8",	/* jump_align.  */
> -  "16",	/* loop_align.  */
> -  3,	/* int_reassoc_width.  */
> -  2,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  2,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> -  &thunderx3t110_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params neoversen1_tunings =
> -{
> -  &cortexa76_extra_costs,
> -  &generic_addrcost_table,
> -  &generic_regmove_cost,
> -  &cortexa57_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    2, /* store_int.  */
> -    5, /* load_fp.  */
> -    2, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  3, /* issue_rate  */
> -  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
> -  "32:16",	/* function_align.  */
> -  "4",		/* jump_align.  */
> -  "32:16",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  2,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
> -  &generic_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params ampere1_tunings =
> -{
> -  &ampere1_extra_costs,
> -  &generic_addrcost_table,
> -  &generic_regmove_cost,
> -  &ampere1_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  4, /* issue_rate  */
> -  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
> -   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
> -   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
> -   AARCH64_FUSE_CMP_BRANCH),
> -  /* fusible_ops  */
> -  "32",		/* function_align.  */
> -  "4",		/* jump_align.  */
> -  "32:16",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  4,	/* fma_reassoc_width.  */
> -  2,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> -  &ampere1_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params ampere1a_tunings =
> -{
> -  &ampere1a_extra_costs,
> -  &generic_addrcost_table,
> -  &generic_regmove_cost,
> -  &ampere1_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_NOT_IMPLEMENTED, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  4, /* issue_rate  */
> -  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
> -   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
> -   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
> -   AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
> -   AARCH64_FUSE_ADDSUB_2REG_CONST1),
> -  /* fusible_ops  */
> -  "32",		/* function_align.  */
> -  "4",		/* jump_align.  */
> -  "32:16",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  2,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> -  &ampere1_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
> -};
> -
> -static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
> -{
> -  2, /* int_stmt_cost  */
> -  2, /* fp_stmt_cost  */
> -  4, /* ld2_st2_permute_cost */
> -  4, /* ld3_st3_permute_cost  */
> -  5, /* ld4_st4_permute_cost  */
> -  3, /* permute_cost  */
> -  4, /* reduc_i8_cost  */
> -  4, /* reduc_i16_cost  */
> -  2, /* reduc_i32_cost  */
> -  2, /* reduc_i64_cost  */
> -  6, /* reduc_f16_cost  */
> -  3, /* reduc_f32_cost  */
> -  2, /* reduc_f64_cost  */
> -  2, /* store_elt_extra_cost  */
> -  /* This value is just inherited from the Cortex-A57 table.  */
> -  8, /* vec_to_scalar_cost  */
> -  /* This depends very much on what the scalar value is and
> -     where it comes from.  E.g. some constants take two dependent
> -     instructions or a load, while others might be moved from a GPR.
> -     4 seems to be a reasonable compromise in practice.  */
> -  4, /* scalar_to_vec_cost  */
> -  4, /* align_load_cost  */
> -  4, /* unalign_load_cost  */
> -  /* Although stores have a latency of 2 and compete for the
> -     vector pipes, in practice it's better not to model that.  */
> -  1, /* unalign_store_cost  */
> -  1  /* store_cost  */
> -};
> -
> -static const sve_vec_cost neoversev1_sve_vector_cost =
> -{
> -  {
> -    2, /* int_stmt_cost  */
> -    2, /* fp_stmt_cost  */
> -    4, /* ld2_st2_permute_cost  */
> -    7, /* ld3_st3_permute_cost  */
> -    8, /* ld4_st4_permute_cost  */
> -    3, /* permute_cost  */
> -    /* Theoretically, a reduction involving 31 scalar ADDs could
> -       complete in ~9 cycles and would have a cost of 31.  [SU]ADDV
> -       completes in 14 cycles, so give it a cost of 31 + 5.  */
> -    36, /* reduc_i8_cost  */
> -    /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7.  */
> -    22, /* reduc_i16_cost  */
> -    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7.  */
> -    14, /* reduc_i32_cost  */
> -    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8.  */
> -    11, /* reduc_i64_cost  */
> -    /* Theoretically, a reduction involving 15 scalar FADDs could
> -       complete in ~9 cycles and would have a cost of 30.  FADDV
> -       completes in 13 cycles, so give it a cost of 30 + 4.  */
> -    34, /* reduc_f16_cost  */
> -    /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5.  */
> -    19, /* reduc_f32_cost  */
> -    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5.  */
> -    11, /* reduc_f64_cost  */
> -    2, /* store_elt_extra_cost  */
> -    /* This value is just inherited from the Cortex-A57 table.  */
> -    8, /* vec_to_scalar_cost  */
> -    /* See the comment above the Advanced SIMD versions.  */
> -    4, /* scalar_to_vec_cost  */
> -    4, /* align_load_cost  */
> -    4, /* unalign_load_cost  */
> -    /* Although stores have a latency of 2 and compete for the
> -       vector pipes, in practice it's better not to model that.  */
> -    1, /* unalign_store_cost  */
> -    1  /* store_cost  */
> -  },
> -  3, /* clast_cost  */
> -  19, /* fadda_f16_cost  */
> -  11, /* fadda_f32_cost  */
> -  8, /* fadda_f64_cost  */
> -  32, /* gather_load_x32_cost  */
> -  16, /* gather_load_x64_cost  */
> -  3 /* scatter_store_elt_cost  */
> -};
> -
> -static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
> -{
> -  3, /* loads_stores_per_cycle  */
> -  2, /* stores_per_cycle  */
> -  4, /* general_ops_per_cycle  */
> -  0, /* fp_simd_load_general_ops  */
> -  1 /* fp_simd_store_general_ops  */
> -};
> -
> -static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
> -{
> -  {
> -    3, /* loads_stores_per_cycle  */
> -    2, /* stores_per_cycle  */
> -    4, /* general_ops_per_cycle  */
> -    0, /* fp_simd_load_general_ops  */
> -    1 /* fp_simd_store_general_ops  */
> -  },
> -  2, /* ld2_st2_general_ops  */
> -  2, /* ld3_st3_general_ops  */
> -  3 /* ld4_st4_general_ops  */
> -};
> -
> -static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
> -{
> -  {
> -    {
> -      2, /* loads_per_cycle  */
> -      2, /* stores_per_cycle  */
> -      2, /* general_ops_per_cycle  */
> -      0, /* fp_simd_load_general_ops  */
> -      1 /* fp_simd_store_general_ops  */
> -    },
> -    2, /* ld2_st2_general_ops  */
> -    2, /* ld3_st3_general_ops  */
> -    3 /* ld4_st4_general_ops  */
> -  },
> -  1, /* pred_ops_per_cycle  */
> -  2, /* while_pred_ops  */
> -  2, /* int_cmp_pred_ops  */
> -  1, /* fp_cmp_pred_ops  */
> -  1, /* gather_scatter_pair_general_ops  */
> -  1 /* gather_scatter_pair_pred_ops  */
> -};
> -
> -static const aarch64_vec_issue_info neoversev1_vec_issue_info =
> -{
> -  &neoversev1_scalar_issue_info,
> -  &neoversev1_advsimd_issue_info,
> -  &neoversev1_sve_issue_info
> -};
> -
> -/* Neoverse V1 costs for vector insn classes.  */
> -static const struct cpu_vector_cost neoversev1_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  2, /* scalar_fp_stmt_cost  */
> -  4, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  1, /* cond_taken_branch_cost  */
> -  1, /* cond_not_taken_branch_cost  */
> -  &neoversev1_advsimd_vector_cost, /* advsimd  */
> -  &neoversev1_sve_vector_cost, /* sve  */
> -  &neoversev1_vec_issue_info /* issue_info  */
> -};
> -
> -static const struct tune_params neoversev1_tunings =
> -{
> -  &cortexa76_extra_costs,
> -  &neoversev1_addrcost_table,
> -  &neoversev1_regmove_cost,
> -  &neoversev1_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_256, /* sve_width  */
> -  { 4, /* load_int.  */
> -    2, /* store_int.  */
> -    6, /* load_fp.  */
> -    2, /* store_fp.  */
> -    6, /* load_pred.  */
> -    1 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  3, /* issue_rate  */
> -  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
> -  "32:16",	/* function_align.  */
> -  "4",		/* jump_align.  */
> -  "32:16",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  4,	/* fma_reassoc_width.  */
> -  2,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
> -   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
> -   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
> -  &generic_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> -
> -static const sve_vec_cost neoverse512tvb_sve_vector_cost =
> -{
> -  {
> -    2, /* int_stmt_cost  */
> -    2, /* fp_stmt_cost  */
> -    4, /* ld2_st2_permute_cost  */
> -    5, /* ld3_st3_permute_cost  */
> -    5, /* ld4_st4_permute_cost  */
> -    3, /* permute_cost  */
> -    /* Theoretically, a reduction involving 15 scalar ADDs could
> -       complete in ~5 cycles and would have a cost of 15.  Assume that
> -       [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6.  */
> -    21, /* reduc_i8_cost  */
> -    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
> -    13, /* reduc_i16_cost  */
> -    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
> -    9, /* reduc_i32_cost  */
> -    /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7.  */
> -    8, /* reduc_i64_cost  */
> -    /* Theoretically, a reduction involving 7 scalar FADDs could
> -       complete in ~6 cycles and would have a cost of 14.  Assume that
> -       FADDV completes in 8 cycles and so give it a cost of 14 + 2.  */
> -    16, /* reduc_f16_cost  */
> -    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
> -    8, /* reduc_f32_cost  */
> -    /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2.  */
> -    4, /* reduc_f64_cost  */
> -    2, /* store_elt_extra_cost  */
> -    /* This value is just inherited from the Cortex-A57 table.  */
> -    8, /* vec_to_scalar_cost  */
> -    /* This depends very much on what the scalar value is and
> -       where it comes from.  E.g. some constants take two dependent
> -       instructions or a load, while others might be moved from a GPR.
> -       4 seems to be a reasonable compromise in practice.  */
> -    4, /* scalar_to_vec_cost  */
> -    4, /* align_load_cost  */
> -    4, /* unalign_load_cost  */
> -    /* Although stores generally have a latency of 2 and compete for the
> -       vector pipes, in practice it's better not to model that.  */
> -    1, /* unalign_store_cost  */
> -    1  /* store_cost  */
> -  },
> -  3, /* clast_cost  */
> -  10, /* fadda_f16_cost  */
> -  6, /* fadda_f32_cost  */
> -  4, /* fadda_f64_cost  */
> -  /* A strided Advanced SIMD x64 load would take two parallel FP loads
> -     (6 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
> -     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
> -     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
> -     (cost 2) to that, to avoid the difference being lost in rounding.
> -
> -     There is no easy comparison between a strided Advanced SIMD x32 load
> -     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
> -     operation more than a 64-bit gather.  */
> -  14, /* gather_load_x32_cost  */
> -  12, /* gather_load_x64_cost  */
> -  3 /* scatter_store_elt_cost  */
> -};
> -
> -static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
> -{
> -  {
> -    {
> -      3, /* loads_per_cycle  */
> -      2, /* stores_per_cycle  */
> -      4, /* general_ops_per_cycle  */
> -      0, /* fp_simd_load_general_ops  */
> -      1 /* fp_simd_store_general_ops  */
> -    },
> -    2, /* ld2_st2_general_ops  */
> -    2, /* ld3_st3_general_ops  */
> -    3 /* ld4_st4_general_ops  */
> -  },
> -  2, /* pred_ops_per_cycle  */
> -  2, /* while_pred_ops  */
> -  2, /* int_cmp_pred_ops  */
> -  1, /* fp_cmp_pred_ops  */
> -  1, /* gather_scatter_pair_general_ops  */
> -  1 /* gather_scatter_pair_pred_ops  */
> -};
> -
> -static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
> -{
> -  &neoversev1_scalar_issue_info,
> -  &neoversev1_advsimd_issue_info,
> -  &neoverse512tvb_sve_issue_info
> -};
> -
> -static const struct cpu_vector_cost neoverse512tvb_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  2, /* scalar_fp_stmt_cost  */
> -  4, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  1, /* cond_taken_branch_cost  */
> -  1, /* cond_not_taken_branch_cost  */
> -  &neoversev1_advsimd_vector_cost, /* advsimd  */
> -  &neoverse512tvb_sve_vector_cost, /* sve  */
> -  &neoverse512tvb_vec_issue_info /* issue_info  */
> -};
> -
> -static const struct tune_params neoverse512tvb_tunings =
> -{
> -  &cortexa76_extra_costs,
> -  &neoversev1_addrcost_table,
> -  &neoversev1_regmove_cost,
> -  &neoverse512tvb_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_128 | SVE_256, /* sve_width  */
> -  { 4, /* load_int.  */
> -    2, /* store_int.  */
> -    6, /* load_fp.  */
> -    2, /* store_fp.  */
> -    6, /* load_pred.  */
> -    1 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  3, /* issue_rate  */
> -  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
> -  "32:16",	/* function_align.  */
> -  "4",		/* jump_align.  */
> -  "32:16",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  4,	/* fma_reassoc_width.  */
> -  2,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
> -   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
> -  &generic_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
> -};
> -
> -static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
> -{
> -  2, /* int_stmt_cost  */
> -  2, /* fp_stmt_cost  */
> -  2, /* ld2_st2_permute_cost */
> -  2, /* ld3_st3_permute_cost  */
> -  3, /* ld4_st4_permute_cost  */
> -  3, /* permute_cost  */
> -  4, /* reduc_i8_cost  */
> -  4, /* reduc_i16_cost  */
> -  2, /* reduc_i32_cost  */
> -  2, /* reduc_i64_cost  */
> -  6, /* reduc_f16_cost  */
> -  4, /* reduc_f32_cost  */
> -  2, /* reduc_f64_cost  */
> -  2, /* store_elt_extra_cost  */
> -  /* This value is just inherited from the Cortex-A57 table.  */
> -  8, /* vec_to_scalar_cost  */
> -  /* This depends very much on what the scalar value is and
> -     where it comes from.  E.g. some constants take two dependent
> -     instructions or a load, while others might be moved from a GPR.
> -     4 seems to be a reasonable compromise in practice.  */
> -  4, /* scalar_to_vec_cost  */
> -  4, /* align_load_cost  */
> -  4, /* unalign_load_cost  */
> -  /* Although stores have a latency of 2 and compete for the
> -     vector pipes, in practice it's better not to model that.  */
> -  1, /* unalign_store_cost  */
> -  1  /* store_cost  */
> -};
> -
> -static const sve_vec_cost neoversen2_sve_vector_cost =
> -{
> -  {
> -    2, /* int_stmt_cost  */
> -    2, /* fp_stmt_cost  */
> -    3, /* ld2_st2_permute_cost  */
> -    4, /* ld3_st3_permute_cost  */
> -    4, /* ld4_st4_permute_cost  */
> -    3, /* permute_cost  */
> -    /* Theoretically, a reduction involving 15 scalar ADDs could
> -       complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
> -       completes in 11 cycles, so give it a cost of 15 + 6.  */
> -    21, /* reduc_i8_cost  */
> -    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
> -    13, /* reduc_i16_cost  */
> -    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
> -    9, /* reduc_i32_cost  */
> -    /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
> -    2, /* reduc_i64_cost  */
> -    /* Theoretically, a reduction involving 7 scalar FADDs could
> -       complete in ~8 cycles and would have a cost of 14.  FADDV
> -       completes in 6 cycles, so give it a cost of 14 - 2.  */
> -    12, /* reduc_f16_cost  */
> -    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0.  */
> -    6, /* reduc_f32_cost  */
> -    /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0.  */
> -    2, /* reduc_f64_cost  */
> -    2, /* store_elt_extra_cost  */
> -    /* This value is just inherited from the Cortex-A57 table.  */
> -    8, /* vec_to_scalar_cost  */
> -    /* See the comment above the Advanced SIMD versions.  */
> -    4, /* scalar_to_vec_cost  */
> -    4, /* align_load_cost  */
> -    4, /* unalign_load_cost  */
> -    /* Although stores have a latency of 2 and compete for the
> -       vector pipes, in practice it's better not to model that.  */
> -    1, /* unalign_store_cost  */
> -    1  /* store_cost  */
> -  },
> -  3, /* clast_cost  */
> -  10, /* fadda_f16_cost  */
> -  6, /* fadda_f32_cost  */
> -  4, /* fadda_f64_cost  */
> -  /* A strided Advanced SIMD x64 load would take two parallel FP loads
> -     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
> -     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
> -     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
> -     (cost 2) to that, to avoid the difference being lost in rounding.
> -
> -     There is no easy comparison between a strided Advanced SIMD x32 load
> -     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
> -     operation more than a 64-bit gather.  */
> -  14, /* gather_load_x32_cost  */
> -  12, /* gather_load_x64_cost  */
> -  3 /* scatter_store_elt_cost  */
> -};
> -
> -static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
> -{
> -  3, /* loads_stores_per_cycle  */
> -  2, /* stores_per_cycle  */
> -  4, /* general_ops_per_cycle  */
> -  0, /* fp_simd_load_general_ops  */
> -  1 /* fp_simd_store_general_ops  */
> -};
> -
> -static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
> -{
> -  {
> -    3, /* loads_stores_per_cycle  */
> -    2, /* stores_per_cycle  */
> -    2, /* general_ops_per_cycle  */
> -    0, /* fp_simd_load_general_ops  */
> -    1 /* fp_simd_store_general_ops  */
> -  },
> -  2, /* ld2_st2_general_ops  */
> -  2, /* ld3_st3_general_ops  */
> -  3 /* ld4_st4_general_ops  */
> -};
> -
> -static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
> -{
> -  {
> -    {
> -      3, /* loads_per_cycle  */
> -      2, /* stores_per_cycle  */
> -      2, /* general_ops_per_cycle  */
> -      0, /* fp_simd_load_general_ops  */
> -      1 /* fp_simd_store_general_ops  */
> -    },
> -    2, /* ld2_st2_general_ops  */
> -    3, /* ld3_st3_general_ops  */
> -    3 /* ld4_st4_general_ops  */
> -  },
> -  2, /* pred_ops_per_cycle  */
> -  2, /* while_pred_ops  */
> -  2, /* int_cmp_pred_ops  */
> -  1, /* fp_cmp_pred_ops  */
> -  1, /* gather_scatter_pair_general_ops  */
> -  1 /* gather_scatter_pair_pred_ops  */
> -};
> -
> -static const aarch64_vec_issue_info neoversen2_vec_issue_info =
> -{
> -  &neoversen2_scalar_issue_info,
> -  &neoversen2_advsimd_issue_info,
> -  &neoversen2_sve_issue_info
> -};
> -
> -/* Neoverse N2 costs for vector insn classes.  */
> -static const struct cpu_vector_cost neoversen2_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  2, /* scalar_fp_stmt_cost  */
> -  4, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  1, /* cond_taken_branch_cost  */
> -  1, /* cond_not_taken_branch_cost  */
> -  &neoversen2_advsimd_vector_cost, /* advsimd  */
> -  &neoversen2_sve_vector_cost, /* sve  */
> -  &neoversen2_vec_issue_info /* issue_info  */
> -};
> -
> -static const struct tune_params neoversen2_tunings =
> -{
> -  &cortexa76_extra_costs,
> -  &neoversen2_addrcost_table,
> -  &neoversen2_regmove_cost,
> -  &neoversen2_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_128, /* sve_width  */
> -  { 4, /* load_int.  */
> -    1, /* store_int.  */
> -    6, /* load_fp.  */
> -    2, /* store_fp.  */
> -    6, /* load_pred.  */
> -    1 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  3, /* issue_rate  */
> -  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
> -  "32:16",	/* function_align.  */
> -  "4",		/* jump_align.  */
> -  "32:16",	/* loop_align.  */
> -  2,	/* int_reassoc_width.  */
> -  4,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  2,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
> -   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
> -   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
> -  &generic_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
> -};
> -
> -static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
> -{
> -  2, /* int_stmt_cost  */
> -  2, /* fp_stmt_cost  */
> -  2, /* ld2_st2_permute_cost */
> -  2, /* ld3_st3_permute_cost  */
> -  3, /* ld4_st4_permute_cost  */
> -  3, /* permute_cost  */
> -  4, /* reduc_i8_cost  */
> -  4, /* reduc_i16_cost  */
> -  2, /* reduc_i32_cost  */
> -  2, /* reduc_i64_cost  */
> -  6, /* reduc_f16_cost  */
> -  3, /* reduc_f32_cost  */
> -  2, /* reduc_f64_cost  */
> -  2, /* store_elt_extra_cost  */
> -  /* This value is just inherited from the Cortex-A57 table.  */
> -  8, /* vec_to_scalar_cost  */
> -  /* This depends very much on what the scalar value is and
> -     where it comes from.  E.g. some constants take two dependent
> -     instructions or a load, while others might be moved from a GPR.
> -     4 seems to be a reasonable compromise in practice.  */
> -  4, /* scalar_to_vec_cost  */
> -  4, /* align_load_cost  */
> -  4, /* unalign_load_cost  */
> -  /* Although stores have a latency of 2 and compete for the
> -     vector pipes, in practice it's better not to model that.  */
> -  1, /* unalign_store_cost  */
> -  1  /* store_cost  */
> -};
> -
> -static const sve_vec_cost neoversev2_sve_vector_cost =
> -{
> -  {
> -    2, /* int_stmt_cost  */
> -    2, /* fp_stmt_cost  */
> -    3, /* ld2_st2_permute_cost  */
> -    3, /* ld3_st3_permute_cost  */
> -    4, /* ld4_st4_permute_cost  */
> -    3, /* permute_cost  */
> -    /* Theoretically, a reduction involving 15 scalar ADDs could
> -       complete in ~3 cycles and would have a cost of 15.  [SU]ADDV
> -       completes in 11 cycles, so give it a cost of 15 + 8.  */
> -    21, /* reduc_i8_cost  */
> -    /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7.  */
> -    14, /* reduc_i16_cost  */
> -    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4.  */
> -    7, /* reduc_i32_cost  */
> -    /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
> -    2, /* reduc_i64_cost  */
> -    /* Theoretically, a reduction involving 7 scalar FADDs could
> -       complete in ~6 cycles and would have a cost of 14.  FADDV
> -       completes in 8 cycles, so give it a cost of 14 + 2.  */
> -    16, /* reduc_f16_cost  */
> -    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
> -    8, /* reduc_f32_cost  */
> -    /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2.  */
> -    4, /* reduc_f64_cost  */
> -    2, /* store_elt_extra_cost  */
> -    /* This value is just inherited from the Cortex-A57 table.  */
> -    8, /* vec_to_scalar_cost  */
> -    /* See the comment above the Advanced SIMD versions.  */
> -    4, /* scalar_to_vec_cost  */
> -    4, /* align_load_cost  */
> -    4, /* unalign_load_cost  */
> -    /* Although stores have a latency of 2 and compete for the
> -       vector pipes, in practice it's better not to model that.  */
> -    1, /* unalign_store_cost  */
> -    1  /* store_cost  */
> -  },
> -  3, /* clast_cost  */
> -  10, /* fadda_f16_cost  */
> -  6, /* fadda_f32_cost  */
> -  4, /* fadda_f64_cost  */
> -  /* A strided Advanced SIMD x64 load would take two parallel FP loads
> -     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
> -     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
> -     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
> -     (cost 2) to that, to avoid the difference being lost in rounding.
> -
> -     There is no easy comparison between a strided Advanced SIMD x32 load
> -     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
> -     operation more than a 64-bit gather.  */
> -  14, /* gather_load_x32_cost  */
> -  12, /* gather_load_x64_cost  */
> -  3 /* scatter_store_elt_cost  */
> -};
> -
> -static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
> -{
> -  3, /* loads_stores_per_cycle  */
> -  2, /* stores_per_cycle  */
> -  6, /* general_ops_per_cycle  */
> -  0, /* fp_simd_load_general_ops  */
> -  1 /* fp_simd_store_general_ops  */
> -};
> -
> -static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
> -{
> -  {
> -    3, /* loads_stores_per_cycle  */
> -    2, /* stores_per_cycle  */
> -    4, /* general_ops_per_cycle  */
> -    0, /* fp_simd_load_general_ops  */
> -    1 /* fp_simd_store_general_ops  */
> -  },
> -  2, /* ld2_st2_general_ops  */
> -  2, /* ld3_st3_general_ops  */
> -  3 /* ld4_st4_general_ops  */
> -};
> -
> -static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
> -{
> -  {
> -    {
> -      3, /* loads_per_cycle  */
> -      2, /* stores_per_cycle  */
> -      4, /* general_ops_per_cycle  */
> -      0, /* fp_simd_load_general_ops  */
> -      1 /* fp_simd_store_general_ops  */
> -    },
> -    2, /* ld2_st2_general_ops  */
> -    3, /* ld3_st3_general_ops  */
> -    3 /* ld4_st4_general_ops  */
> -  },
> -  2, /* pred_ops_per_cycle  */
> -  2, /* while_pred_ops  */
> -  2, /* int_cmp_pred_ops  */
> -  1, /* fp_cmp_pred_ops  */
> -  1, /* gather_scatter_pair_general_ops  */
> -  1 /* gather_scatter_pair_pred_ops  */
> -};
> -
> -static const aarch64_vec_issue_info neoversev2_vec_issue_info =
> -{
> -  &neoversev2_scalar_issue_info,
> -  &neoversev2_advsimd_issue_info,
> -  &neoversev2_sve_issue_info
> -};
> -
> -/* Demeter costs for vector insn classes.  */
> -static const struct cpu_vector_cost neoversev2_vector_cost =
> -{
> -  1, /* scalar_int_stmt_cost  */
> -  2, /* scalar_fp_stmt_cost  */
> -  4, /* scalar_load_cost  */
> -  1, /* scalar_store_cost  */
> -  1, /* cond_taken_branch_cost  */
> -  1, /* cond_not_taken_branch_cost  */
> -  &neoversev2_advsimd_vector_cost, /* advsimd  */
> -  &neoversev2_sve_vector_cost, /* sve  */
> -  &neoversev2_vec_issue_info /* issue_info  */
> -};
> -
> -static const struct tune_params neoversev2_tunings =
> -{
> -  &cortexa76_extra_costs,
> -  &neoversev2_addrcost_table,
> -  &neoversev2_regmove_cost,
> -  &neoversev2_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_128, /* sve_width  */
> -  { 4, /* load_int.  */
> -    2, /* store_int.  */
> -    6, /* load_fp.  */
> -    1, /* store_fp.  */
> -    6, /* load_pred.  */
> -    2 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  5, /* issue_rate  */
> -  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
> -  "32:16",	/* function_align.  */
> -  "4",		/* jump_align.  */
> -  "32:16",	/* loop_align.  */
> -  3,	/* int_reassoc_width.  */
> -  6,	/* fp_reassoc_width.  */
> -  4,	/* fma_reassoc_width.  */
> -  3,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
> -   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
> -   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
> -  &generic_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
> -};
> -
> -static const struct tune_params a64fx_tunings =
> -{
> -  &a64fx_extra_costs,
> -  &a64fx_addrcost_table,
> -  &a64fx_regmove_cost,
> -  &a64fx_vector_cost,
> -  &generic_branch_cost,
> -  &generic_approx_modes,
> -  SVE_512, /* sve_width  */
> -  { 4, /* load_int.  */
> -    4, /* store_int.  */
> -    4, /* load_fp.  */
> -    4, /* store_fp.  */
> -    4, /* load_pred.  */
> -    4 /* store_pred.  */
> -  }, /* memmov_cost.  */
> -  7, /* issue_rate  */
> -  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
> -  "32",	/* function_align.  */
> -  "16",	/* jump_align.  */
> -  "32",	/* loop_align.  */
> -  4,	/* int_reassoc_width.  */
> -  2,	/* fp_reassoc_width.  */
> -  1,	/* fma_reassoc_width.  */
> -  2,	/* vec_reassoc_width.  */
> -  2,	/* min_div_recip_mul_sf.  */
> -  2,	/* min_div_recip_mul_df.  */
> -  0,	/* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> -  &a64fx_prefetch_tune,
> -  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> -  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> -};
> +#include "tuning_models/generic.h"
> +#include "tuning_models/cortexa35.h"
> +#include "tuning_models/cortexa53.h"
> +#include "tuning_models/cortexa57.h"
> +#include "tuning_models/cortexa72.h"
> +#include "tuning_models/cortexa73.h"
> +#include "tuning_models/exynosm1.h"
> +#include "tuning_models/thunderxt88.h"
> +#include "tuning_models/thunderx.h"
> +#include "tuning_models/tsv110.h"
> +#include "tuning_models/xgene1.h"
> +#include "tuning_models/emag.h"
> +#include "tuning_models/qdf24xx.h"
> +#include "tuning_models/saphira.h"
> +#include "tuning_models/thunderx2t99.h"
> +#include "tuning_models/thunderx3t110.h"
> +#include "tuning_models/neoversen1.h"
> +#include "tuning_models/ampere1.h"
> +#include "tuning_models/ampere1a.h"
> +#include "tuning_models/neoversev1.h"
> +#include "tuning_models/neoverse512tvb.h"
> +#include "tuning_models/neoversen2.h"
> +#include "tuning_models/neoversev2.h"
> +#include "tuning_models/a64fx.h"
>   
>   /* Support for fine-grained override of the tuning structures.  */
>   struct aarch64_tuning_override_function
> diff --git a/gcc/config/aarch64/tuning_models/a64fx.h b/gcc/config/aarch64/tuning_models/a64fx.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..7b06c27eba1e4de01738bdfdc077460f9135fb41
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/a64fx.h
> @@ -0,0 +1,169 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_A64FX
> +#define GCC_AARCH64_H_A64FX
> +
> +#include "generic.h"
> +
> +static const struct cpu_addrcost_table a64fx_addrcost_table =
> +{
> +    {
> +      1, /* hi  */
> +      1, /* si  */
> +      1, /* di  */
> +      2, /* ti  */
> +    },
> +  0, /* pre_modify  */
> +  0, /* post_modify  */
> +  0, /* post_modify_ld3_st3  */
> +  0, /* post_modify_ld4_st4  */
> +  2, /* register_offset  */
> +  3, /* register_sextend  */
> +  3, /* register_zextend  */
> +  0, /* imm_offset  */
> +};
> +
> +static const struct cpu_regmove_cost a64fx_regmove_cost =
> +{
> +  1, /* GP2GP  */
> +  /* Avoid the use of slow int<->fp moves for spilling by setting
> +     their cost higher than memmov_cost.  */
> +  5, /* GP2FP  */
> +  7, /* FP2GP  */
> +  2 /* FP2FP  */
> +};
> +
> +static const advsimd_vec_cost a64fx_advsimd_vector_cost =
> +{
> +  2, /* int_stmt_cost  */
> +  5, /* fp_stmt_cost  */
> +  0, /* ld2_st2_permute_cost  */
> +  0, /* ld3_st3_permute_cost  */
> +  0, /* ld4_st4_permute_cost  */
> +  3, /* permute_cost  */
> +  13, /* reduc_i8_cost  */
> +  13, /* reduc_i16_cost  */
> +  13, /* reduc_i32_cost  */
> +  13, /* reduc_i64_cost  */
> +  13, /* reduc_f16_cost  */
> +  13, /* reduc_f32_cost  */
> +  13, /* reduc_f64_cost  */
> +  13, /* store_elt_extra_cost  */
> +  13, /* vec_to_scalar_cost  */
> +  4, /* scalar_to_vec_cost  */
> +  6, /* align_load_cost  */
> +  6, /* unalign_load_cost  */
> +  1, /* unalign_store_cost  */
> +  1  /* store_cost  */
> +};
> +
> +static const sve_vec_cost a64fx_sve_vector_cost =
> +{
> +  {
> +    2, /* int_stmt_cost  */
> +    5, /* fp_stmt_cost  */
> +    0, /* ld2_st2_permute_cost  */
> +    0, /* ld3_st3_permute_cost  */
> +    0, /* ld4_st4_permute_cost  */
> +    3, /* permute_cost  */
> +    13, /* reduc_i8_cost  */
> +    13, /* reduc_i16_cost  */
> +    13, /* reduc_i32_cost  */
> +    13, /* reduc_i64_cost  */
> +    13, /* reduc_f16_cost  */
> +    13, /* reduc_f32_cost  */
> +    13, /* reduc_f64_cost  */
> +    13, /* store_elt_extra_cost  */
> +    13, /* vec_to_scalar_cost  */
> +    4, /* scalar_to_vec_cost  */
> +    6, /* align_load_cost  */
> +    6, /* unalign_load_cost  */
> +    1, /* unalign_store_cost  */
> +    1  /* store_cost  */
> +  },
> +  13, /* clast_cost  */
> +  13, /* fadda_f16_cost  */
> +  13, /* fadda_f32_cost  */
> +  13, /* fadda_f64_cost  */
> +  64, /* gather_load_x32_cost  */
> +  32, /* gather_load_x64_cost  */
> +  1 /* scatter_store_elt_cost  */
> +};
> +
> +static const struct cpu_vector_cost a64fx_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  5, /* scalar_fp_stmt_cost  */
> +  4, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  3, /* cond_taken_branch_cost  */
> +  1, /* cond_not_taken_branch_cost  */
> +  &a64fx_advsimd_vector_cost, /* advsimd  */
> +  &a64fx_sve_vector_cost, /* sve  */
> +  nullptr /* issue_info  */
> +};
> +
> +static const cpu_prefetch_tune a64fx_prefetch_tune =
> +{
> +  8,			/* num_slots  */
> +  64,			/* l1_cache_size  */
> +  256,			/* l1_cache_line_size  */
> +  32768,		/* l2_cache_size  */
> +  true,			/* prefetch_dynamic_strides */
> +  -1,			/* minimum_stride */
> +  -1			/* default_opt_level  */
> +};
> +
> +static const struct tune_params a64fx_tunings =
> +{
> +  &a64fx_extra_costs,
> +  &a64fx_addrcost_table,
> +  &a64fx_regmove_cost,
> +  &a64fx_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_512, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  7, /* issue_rate  */
> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
> +  "32",	/* function_align.  */
> +  "16",	/* jump_align.  */
> +  "32",	/* loop_align.  */
> +  4,	/* int_reassoc_width.  */
> +  2,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  2,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> +  &a64fx_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_A64FX.  */
> diff --git a/gcc/config/aarch64/tuning_models/ampere1.h b/gcc/config/aarch64/tuning_models/ampere1.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..8d2a1c696103259f23cf73df26cef9d4fa05ac73
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/ampere1.h
> @@ -0,0 +1,113 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_AMPERE1
> +#define GCC_AARCH64_H_AMPERE1
> +
> +#include "generic.h"
> +
> +static const advsimd_vec_cost ampere1_advsimd_vector_cost =
> +{
> +  1, /* int_stmt_cost  */
> +  3, /* fp_stmt_cost  */
> +  0, /* ld2_st2_permute_cost  */
> +  0, /* ld3_st3_permute_cost  */
> +  0, /* ld4_st4_permute_cost  */
> +  2, /* permute_cost  */
> +  12, /* reduc_i8_cost  */
> +  9, /* reduc_i16_cost  */
> +  6, /* reduc_i32_cost  */
> +  5, /* reduc_i64_cost  */
> +  9, /* reduc_f16_cost  */
> +  6, /* reduc_f32_cost  */
> +  5, /* reduc_f64_cost  */
> +  8, /* store_elt_extra_cost  */
> +  6, /* vec_to_scalar_cost  */
> +  7, /* scalar_to_vec_cost  */
> +  4, /* align_load_cost  */
> +  4, /* unalign_load_cost  */
> +  1, /* unalign_store_cost  */
> +  1  /* store_cost  */
> +};
> +
> +/* Ampere-1 costs for vector insn classes.  */
> +static const struct cpu_vector_cost ampere1_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  3, /* scalar_fp_stmt_cost  */
> +  4, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  1, /* cond_taken_branch_cost  */
> +  1, /* cond_not_taken_branch_cost  */
> +  &ampere1_advsimd_vector_cost, /* advsimd  */
> +  nullptr, /* sve  */
> +  nullptr  /* issue_info  */
> +};
> +
> +static const cpu_prefetch_tune ampere1_prefetch_tune =
> +{
> +  0,			/* num_slots  */
> +  64,			/* l1_cache_size  */
> +  64,			/* l1_cache_line_size  */
> +  2048,			/* l2_cache_size  */
> +  true,			/* prefetch_dynamic_strides */
> +  -1,			/* minimum_stride */
> +  -1			/* default_opt_level  */
> +};
> +
> +static const struct tune_params ampere1_tunings =
> +{
> +  &ampere1_extra_costs,
> +  &generic_addrcost_table,
> +  &generic_regmove_cost,
> +  &ampere1_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  4, /* issue_rate  */
> +  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
> +   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
> +   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
> +   AARCH64_FUSE_CMP_BRANCH),
> +  /* fusible_ops  */
> +  "32",		/* function_align.  */
> +  "4",		/* jump_align.  */
> +  "32:16",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  4,	/* fma_reassoc_width.  */
> +  2,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> +  &ampere1_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_AMPERE1.  */
> diff --git a/gcc/config/aarch64/tuning_models/ampere1a.h b/gcc/config/aarch64/tuning_models/ampere1a.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..c419ffb3c1a936a01690ad157c6c71dc645273c8
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/ampere1a.h
> @@ -0,0 +1,65 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_AMPERE1A
> +#define GCC_AARCH64_H_AMPERE1A
> +
> +#include "generic.h"
> +
> +static const struct tune_params ampere1a_tunings =
> +{
> +  &ampere1a_extra_costs,
> +  &generic_addrcost_table,
> +  &generic_regmove_cost,
> +  &ampere1_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  4, /* issue_rate  */
> +  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
> +   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
> +   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
> +   AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
> +   AARCH64_FUSE_ADDSUB_2REG_CONST1),
> +  /* fusible_ops  */
> +  "32",		/* function_align.  */
> +  "4",		/* jump_align.  */
> +  "32:16",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  2,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> +  &ampere1_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_AMPERE1A.  */
> diff --git a/gcc/config/aarch64/tuning_models/cortexa35.h b/gcc/config/aarch64/tuning_models/cortexa35.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..5534335348db96cc57fc9eccd7ff79a624cb528a
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/cortexa35.h
> @@ -0,0 +1,62 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_CORTEXA35
> +#define GCC_AARCH64_H_CORTEXA35
> +
> +#include "generic.h"
> +#include "cortexa53.h"
> +
> +static const struct tune_params cortexa35_tunings =
> +{
> +  &cortexa53_extra_costs,
> +  &generic_addrcost_table,
> +  &cortexa53_regmove_cost,
> +  &generic_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  1, /* issue_rate  */
> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> +   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
> +  "16",	/* function_align.  */
> +  "4",	/* jump_align.  */
> +  "8",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  1,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> +  &generic_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_CORTEXA35.  */
> diff --git a/gcc/config/aarch64/tuning_models/cortexa53.h b/gcc/config/aarch64/tuning_models/cortexa53.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..9dfdccc5968e7f062af5c78f153bfe3838263b0a
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/cortexa53.h
> @@ -0,0 +1,71 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_CORTEXA53
> +#define GCC_AARCH64_H_CORTEXA53
> +
> +#include "generic.h"
> +
> +static const struct cpu_regmove_cost cortexa53_regmove_cost =
> +{
> +  1, /* GP2GP  */
> +  /* Avoid the use of slow int<->fp moves for spilling by setting
> +     their cost higher than memmov_cost.  */
> +  5, /* GP2FP  */
> +  5, /* FP2GP  */
> +  2 /* FP2FP  */
> +};
> +
> +static const struct tune_params cortexa53_tunings =
> +{
> +  &cortexa53_extra_costs,
> +  &generic_addrcost_table,
> +  &cortexa53_regmove_cost,
> +  &generic_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  2, /* issue_rate  */
> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> +   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
> +  "16",	/* function_align.  */
> +  "4",	/* jump_align.  */
> +  "8",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  1,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> +  &generic_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_CORTEXA53.  */
> diff --git a/gcc/config/aarch64/tuning_models/cortexa57.h b/gcc/config/aarch64/tuning_models/cortexa57.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..9c4789d57833a5879dda8e2fe454ac5f56cb0601
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/cortexa57.h
> @@ -0,0 +1,109 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_CORTEXA57
> +#define GCC_AARCH64_H_CORTEXA57
> +
> +#include "generic.h"
> +
> +static const struct cpu_regmove_cost cortexa57_regmove_cost =
> +{
> +  1, /* GP2GP  */
> +  /* Avoid the use of slow int<->fp moves for spilling by setting
> +     their cost higher than memmov_cost.  */
> +  5, /* GP2FP  */
> +  5, /* FP2GP  */
> +  2 /* FP2FP  */
> +};
> +
> +static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
> +{
> +  2, /* int_stmt_cost  */
> +  2, /* fp_stmt_cost  */
> +  0, /* ld2_st2_permute_cost  */
> +  0, /* ld3_st3_permute_cost  */
> +  0, /* ld4_st4_permute_cost  */
> +  3, /* permute_cost  */
> +  8, /* reduc_i8_cost  */
> +  8, /* reduc_i16_cost  */
> +  8, /* reduc_i32_cost  */
> +  8, /* reduc_i64_cost  */
> +  8, /* reduc_f16_cost  */
> +  8, /* reduc_f32_cost  */
> +  8, /* reduc_f64_cost  */
> +  8, /* store_elt_extra_cost  */
> +  8, /* vec_to_scalar_cost  */
> +  8, /* scalar_to_vec_cost  */
> +  4, /* align_load_cost  */
> +  4, /* unalign_load_cost  */
> +  1, /* unalign_store_cost  */
> +  1  /* store_cost  */
> +};
> +
> +/* Cortex-A57 costs for vector insn classes.  */
> +static const struct cpu_vector_cost cortexa57_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
> +  4, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  1, /* cond_taken_branch_cost  */
> +  1, /* cond_not_taken_branch_cost  */
> +  &cortexa57_advsimd_vector_cost, /* advsimd  */
> +  nullptr, /* sve  */
> +  nullptr /* issue_info  */
> +};
> +
> +static const struct tune_params cortexa57_tunings =
> +{
> +  &cortexa57_extra_costs,
> +  &generic_addrcost_table,
> +  &cortexa57_regmove_cost,
> +  &cortexa57_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  3, /* issue_rate  */
> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> +   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
> +  "16",	/* function_align.  */
> +  "4",	/* jump_align.  */
> +  "8",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  1,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
> +  &generic_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_CORTEXA57.  */
> diff --git a/gcc/config/aarch64/tuning_models/cortexa72.h b/gcc/config/aarch64/tuning_models/cortexa72.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..968171c9b2e898d7479dbcb462e33fe3905e183d
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/cortexa72.h
> @@ -0,0 +1,61 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_CORTEXA72
> +#define GCC_AARCH64_H_CORTEXA72
> +
> +#include "generic.h"
> +
> +static const struct tune_params cortexa72_tunings =
> +{
> +  &cortexa57_extra_costs,
> +  &generic_addrcost_table,
> +  &cortexa57_regmove_cost,
> +  &cortexa57_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  3, /* issue_rate  */
> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> +   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
> +  "16",	/* function_align.  */
> +  "4",	/* jump_align.  */
> +  "8",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  1,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> +  &generic_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_CORTEXA72.  */
> diff --git a/gcc/config/aarch64/tuning_models/cortexa73.h b/gcc/config/aarch64/tuning_models/cortexa73.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..8d1a504ddac39604dd193ce0f434fd2f5145c129
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/cortexa73.h
> @@ -0,0 +1,62 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_CORTEXA73
> +#define GCC_AARCH64_H_CORTEXA73
> +
> +#include "generic.h"
> +
> +static const struct tune_params cortexa73_tunings =
> +{
> +  &cortexa57_extra_costs,
> +  &generic_addrcost_table,
> +  &cortexa57_regmove_cost,
> +  &cortexa57_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  2, /* issue_rate.  */
> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> +   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
> +  "16",	/* function_align.  */
> +  "4",	/* jump_align.  */
> +  "8",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  1,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> +  &generic_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +
> +#endif /* GCC_AARCH64_H_CORTEXA73.  */
> diff --git a/gcc/config/aarch64/tuning_models/emag.h b/gcc/config/aarch64/tuning_models/emag.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..3f3402c3fc2a94704eeaf9223ecb0ca1c057cace
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/emag.h
> @@ -0,0 +1,60 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_EMAG
> +#define GCC_AARCH64_H_EMAG
> +
> +#include "generic.h"
> +
> +static const struct tune_params emag_tunings =
> +{
> +  &xgene1_extra_costs,
> +  &xgene1_addrcost_table,
> +  &xgene1_regmove_cost,
> +  &xgene1_vector_cost,
> +  &generic_branch_cost,
> +  &xgene1_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 6, /* load_int.  */
> +    6, /* store_int.  */
> +    6, /* load_fp.  */
> +    6, /* store_fp.  */
> +    6, /* load_pred.  */
> +    6 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  4, /* issue_rate  */
> +  AARCH64_FUSE_NOTHING, /* fusible_ops  */
> +  "16",	/* function_align.  */
> +  "16",	/* jump_align.  */
> +  "16",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  1,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  17,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
> +  &xgene1_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_EMAG.  */
> diff --git a/gcc/config/aarch64/tuning_models/exynosm1.h b/gcc/config/aarch64/tuning_models/exynosm1.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..a42ea4df97f3f048c41481c304fd3684a69d743b
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/exynosm1.h
> @@ -0,0 +1,144 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_EXYNOSM1
> +#define GCC_AARCH64_H_EXYNOSM1
> +
> +#include "generic.h"
> +
> +static const struct cpu_addrcost_table exynosm1_addrcost_table =
> +{
> +    {
> +      0, /* hi  */
> +      0, /* si  */
> +      0, /* di  */
> +      2, /* ti  */
> +    },
> +  0, /* pre_modify  */
> +  0, /* post_modify  */
> +  0, /* post_modify_ld3_st3  */
> +  0, /* post_modify_ld4_st4  */
> +  1, /* register_offset  */
> +  1, /* register_sextend  */
> +  2, /* register_zextend  */
> +  0, /* imm_offset  */
> +};
> +
> +static const struct cpu_regmove_cost exynosm1_regmove_cost =
> +{
> +  1, /* GP2GP  */
> +  /* Avoid the use of slow int<->fp moves for spilling by setting
> +     their cost higher than memmov_cost (actually 4 and 9).  */
> +  9, /* GP2FP  */
> +  9, /* FP2GP  */
> +  1 /* FP2FP  */
> +};
> +
> +static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
> +{
> +  3, /* int_stmt_cost  */
> +  3, /* fp_stmt_cost  */
> +  0, /* ld2_st2_permute_cost  */
> +  0, /* ld3_st3_permute_cost  */
> +  0, /* ld4_st4_permute_cost  */
> +  3, /* permute_cost  */
> +  3, /* reduc_i8_cost  */
> +  3, /* reduc_i16_cost  */
> +  3, /* reduc_i32_cost  */
> +  3, /* reduc_i64_cost  */
> +  3, /* reduc_f16_cost  */
> +  3, /* reduc_f32_cost  */
> +  3, /* reduc_f64_cost  */
> +  3, /* store_elt_extra_cost  */
> +  3, /* vec_to_scalar_cost  */
> +  3, /* scalar_to_vec_cost  */
> +  5, /* align_load_cost  */
> +  5, /* unalign_load_cost  */
> +  1, /* unalign_store_cost  */
> +  1  /* store_cost  */
> +};
> +
> +static const struct cpu_vector_cost exynosm1_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
> +  5, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  1, /* cond_taken_branch_cost  */
> +  1, /* cond_not_taken_branch_cost  */
> +  &exynosm1_advsimd_vector_cost, /* advsimd  */
> +  nullptr, /* sve  */
> +  nullptr /* issue_info  */
> +};
> +
> +/* Approximation modes for Exynos M1.  */
> +static const cpu_approx_modes exynosm1_approx_modes =
> +{
> +  AARCH64_APPROX_NONE,	/* division  */
> +  AARCH64_APPROX_ALL,	/* sqrt  */
> +  AARCH64_APPROX_ALL	/* recip_sqrt  */
> +};
> +
> +static const cpu_prefetch_tune exynosm1_prefetch_tune =
> +{
> +  0,			/* num_slots  */
> +  -1,			/* l1_cache_size  */
> +  64,			/* l1_cache_line_size  */
> +  -1,			/* l2_cache_size  */
> +  true,			/* prefetch_dynamic_strides */
> +  -1,			/* minimum_stride */
> +  -1			/* default_opt_level  */
> +};
> +
> +static const struct tune_params exynosm1_tunings =
> +{
> +  &exynosm1_extra_costs,
> +  &exynosm1_addrcost_table,
> +  &exynosm1_regmove_cost,
> +  &exynosm1_vector_cost,
> +  &generic_branch_cost,
> +  &exynosm1_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  3,	/* issue_rate  */
> +  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
> +  "4",	/* function_align.  */
> +  "4",	/* jump_align.  */
> +  "4",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  1,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  48,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
> +  &exynosm1_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_EXYNOSM1.  */
> diff --git a/gcc/config/aarch64/tuning_models/generic.h b/gcc/config/aarch64/tuning_models/generic.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..deb2c1cffe255bddcb5be571b12086442782da60
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/generic.h
> @@ -0,0 +1,190 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +   Contributed by ARM Ltd.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_GENERIC
> +#define GCC_AARCH64_H_GENERIC
> +
> +static const struct cpu_addrcost_table generic_addrcost_table =
> +{
> +    {
> +      1, /* hi  */
> +      0, /* si  */
> +      0, /* di  */
> +      1, /* ti  */
> +    },
> +  0, /* pre_modify  */
> +  0, /* post_modify  */
> +  0, /* post_modify_ld3_st3  */
> +  0, /* post_modify_ld4_st4  */
> +  0, /* register_offset  */
> +  0, /* register_sextend  */
> +  0, /* register_zextend  */
> +  0 /* imm_offset  */
> +};
> +
> +static const struct cpu_regmove_cost generic_regmove_cost =
> +{
> +  1, /* GP2GP  */
> +  /* Avoid the use of slow int<->fp moves for spilling by setting
> +     their cost higher than memmov_cost.  */
> +  5, /* GP2FP  */
> +  5, /* FP2GP  */
> +  2 /* FP2FP  */
> +};
> +
> +/* Generic costs for Advanced SIMD vector operations.   */
> +static const advsimd_vec_cost generic_advsimd_vector_cost =
> +{
> +  1, /* int_stmt_cost  */
> +  1, /* fp_stmt_cost  */
> +  0, /* ld2_st2_permute_cost  */
> +  0, /* ld3_st3_permute_cost  */
> +  0, /* ld4_st4_permute_cost  */
> +  2, /* permute_cost  */
> +  2, /* reduc_i8_cost  */
> +  2, /* reduc_i16_cost  */
> +  2, /* reduc_i32_cost  */
> +  2, /* reduc_i64_cost  */
> +  2, /* reduc_f16_cost  */
> +  2, /* reduc_f32_cost  */
> +  2, /* reduc_f64_cost  */
> +  2, /* store_elt_extra_cost  */
> +  2, /* vec_to_scalar_cost  */
> +  1, /* scalar_to_vec_cost  */
> +  1, /* align_load_cost  */
> +  1, /* unalign_load_cost  */
> +  1, /* unalign_store_cost  */
> +  1  /* store_cost  */
> +};
> +
> +/* Generic costs for SVE vector operations.  */
> +static const sve_vec_cost generic_sve_vector_cost =
> +{
> +  {
> +    1, /* int_stmt_cost  */
> +    1, /* fp_stmt_cost  */
> +    0, /* ld2_st2_permute_cost  */
> +    0, /* ld3_st3_permute_cost  */
> +    0, /* ld4_st4_permute_cost  */
> +    2, /* permute_cost  */
> +    2, /* reduc_i8_cost  */
> +    2, /* reduc_i16_cost  */
> +    2, /* reduc_i32_cost  */
> +    2, /* reduc_i64_cost  */
> +    2, /* reduc_f16_cost  */
> +    2, /* reduc_f32_cost  */
> +    2, /* reduc_f64_cost  */
> +    2, /* store_elt_extra_cost  */
> +    2, /* vec_to_scalar_cost  */
> +    1, /* scalar_to_vec_cost  */
> +    1, /* align_load_cost  */
> +    1, /* unalign_load_cost  */
> +    1, /* unalign_store_cost  */
> +    1  /* store_cost  */
> +  },
> +  2, /* clast_cost  */
> +  2, /* fadda_f16_cost  */
> +  2, /* fadda_f32_cost  */
> +  2, /* fadda_f64_cost  */
> +  4, /* gather_load_x32_cost  */
> +  2, /* gather_load_x64_cost  */
> +  1 /* scatter_store_elt_cost  */
> +};
> +
> +/* Generic costs for vector insn classes.  */
> +static const struct cpu_vector_cost generic_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
> +  1, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  3, /* cond_taken_branch_cost  */
> +  1, /* cond_not_taken_branch_cost  */
> +  &generic_advsimd_vector_cost, /* advsimd  */
> +  &generic_sve_vector_cost, /* sve */
> +  nullptr /* issue_info  */
> +};
> +
> +/* Generic costs for branch instructions.  */
> +static const struct cpu_branch_cost generic_branch_cost =
> +{
> +  1,  /* Predictable.  */
> +  3   /* Unpredictable.  */
> +};
> +
> +/* Generic approximation modes.  */
> +static const cpu_approx_modes generic_approx_modes =
> +{
> +  AARCH64_APPROX_NONE,	/* division  */
> +  AARCH64_APPROX_NONE,	/* sqrt  */
> +  AARCH64_APPROX_NONE	/* recip_sqrt  */
> +};
> +
> +/* Generic prefetch settings (which disable prefetch).  */
> +static const cpu_prefetch_tune generic_prefetch_tune =
> +{
> +  0,			/* num_slots  */
> +  -1,			/* l1_cache_size  */
> +  -1,			/* l1_cache_line_size  */
> +  -1,			/* l2_cache_size  */
> +  true,			/* prefetch_dynamic_strides */
> +  -1,			/* minimum_stride */
> +  -1			/* default_opt_level  */
> +};
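> +
> +/* A note on the sentinel values above (an interpretation, not stated in
> +   this table): num_slots of 0 is what "disables prefetch", while the -1
> +   entries are taken to mean "no core-specific value", leaving the
> +   corresponding --param defaults untouched.  */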
> +
> +static const struct tune_params generic_tunings =
> +{
> +  &cortexa57_extra_costs,
> +  &generic_addrcost_table,
> +  &generic_regmove_cost,
> +  &generic_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  2, /* issue_rate  */
> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
> +  "16:12",	/* function_align.  */
> +  "4",	/* jump_align.  */
> +  "8",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  1,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
> +     Neoverse V1.  It does not have a noticeable effect on A64FX and should
> +     have at most a very minor effect on SVE2 cores.  */
> +  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS),	/* tune_flags.  */
> +  &generic_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_GENERIC.  */
> diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..50d7b23712cc6a8be8f35246657ec5d86d6d4191
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
> @@ -0,0 +1,164 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_NEOVERSE512TVB
> +#define GCC_AARCH64_H_NEOVERSE512TVB
> +
> +#include "generic.h"
> +
> +static const sve_vec_cost neoverse512tvb_sve_vector_cost =
> +{
> +  {
> +    2, /* int_stmt_cost  */
> +    2, /* fp_stmt_cost  */
> +    4, /* ld2_st2_permute_cost  */
> +    5, /* ld3_st3_permute_cost  */
> +    5, /* ld4_st4_permute_cost  */
> +    3, /* permute_cost  */
> +    /* Theoretically, a reduction involving 15 scalar ADDs could
> +       complete in ~5 cycles and would have a cost of 15.  Assume that
> +       [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6.  */
> +    21, /* reduc_i8_cost  */
> +    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
> +    13, /* reduc_i16_cost  */
> +    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
> +    9, /* reduc_i32_cost  */
> +    /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7.  */
> +    8, /* reduc_i64_cost  */
> +    /* Theoretically, a reduction involving 7 scalar FADDs could
> +       complete in ~6 cycles and would have a cost of 14.  Assume that
> +       FADDV completes in 8 cycles and so give it a cost of 14 + 2.  */
> +    16, /* reduc_f16_cost  */
> +    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
> +    8, /* reduc_f32_cost  */
> +    /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2.  */
> +    4, /* reduc_f64_cost  */
> +    2, /* store_elt_extra_cost  */
> +    /* This value is just inherited from the Cortex-A57 table.  */
> +    8, /* vec_to_scalar_cost  */
> +    /* This depends very much on what the scalar value is and
> +       where it comes from.  E.g. some constants take two dependent
> +       instructions or a load, while others might be moved from a GPR.
> +       4 seems to be a reasonable compromise in practice.  */
> +    4, /* scalar_to_vec_cost  */
> +    4, /* align_load_cost  */
> +    4, /* unalign_load_cost  */
> +    /* Although stores generally have a latency of 2 and compete for the
> +       vector pipes, in practice it's better not to model that.  */
> +    1, /* unalign_store_cost  */
> +    1  /* store_cost  */
> +  },
> +  3, /* clast_cost  */
> +  10, /* fadda_f16_cost  */
> +  6, /* fadda_f32_cost  */
> +  4, /* fadda_f64_cost  */
> +  /* A strided Advanced SIMD x64 load would take two parallel FP loads
> +     (6 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
> +     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
> +     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
> +     (cost 2) to that, to avoid the difference being lost in rounding.
> +
> +     There is no easy comparison between a strided Advanced SIMD x32 load
> +     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
> +     operation more than a 64-bit gather.  */
> +  14, /* gather_load_x32_cost  */
> +  12, /* gather_load_x64_cost  */
> +  3 /* scatter_store_elt_cost  */
> +};
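> +
> +/* Worked examples of the costing scheme used above, restated from the
> +   comments in the table: reduc_i8_cost is the 15 scalar ADDs (cost 15)
> +   plus a 6-cycle penalty for the assumed [SU]ADDV latency, i.e.
> +   15 + 6 = 21.  gather_load_x64_cost follows the Advanced SIMD baseline
> +   of 2 scalar loads (2 * 4 = 8) plus a vec_construct (2) plus one full
> +   vector operation (2), i.e. 8 + 2 + 2 = 12, and gather_load_x32_cost is
> +   one vector operation more: 12 + 2 = 14.  */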
> +
> +static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
> +{
> +  {
> +    {
> +      3, /* loads_per_cycle  */
> +      2, /* stores_per_cycle  */
> +      4, /* general_ops_per_cycle  */
> +      0, /* fp_simd_load_general_ops  */
> +      1 /* fp_simd_store_general_ops  */
> +    },
> +    2, /* ld2_st2_general_ops  */
> +    2, /* ld3_st3_general_ops  */
> +    3 /* ld4_st4_general_ops  */
> +  },
> +  2, /* pred_ops_per_cycle  */
> +  2, /* while_pred_ops  */
> +  2, /* int_cmp_pred_ops  */
> +  1, /* fp_cmp_pred_ops  */
> +  1, /* gather_scatter_pair_general_ops  */
> +  1 /* gather_scatter_pair_pred_ops  */
> +};
> +
> +static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
> +{
> +  &neoversev1_scalar_issue_info,
> +  &neoversev1_advsimd_issue_info,
> +  &neoverse512tvb_sve_issue_info
> +};
> +
> +static const struct cpu_vector_cost neoverse512tvb_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  2, /* scalar_fp_stmt_cost  */
> +  4, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  1, /* cond_taken_branch_cost  */
> +  1, /* cond_not_taken_branch_cost  */
> +  &neoversev1_advsimd_vector_cost, /* advsimd  */
> +  &neoverse512tvb_sve_vector_cost, /* sve  */
> +  &neoverse512tvb_vec_issue_info /* issue_info  */
> +};
> +
> +static const struct tune_params neoverse512tvb_tunings =
> +{
> +  &cortexa76_extra_costs,
> +  &neoversev1_addrcost_table,
> +  &neoversev1_regmove_cost,
> +  &neoverse512tvb_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_128 | SVE_256, /* sve_width  */
> +  { 4, /* load_int.  */
> +    2, /* store_int.  */
> +    6, /* load_fp.  */
> +    2, /* store_fp.  */
> +    6, /* load_pred.  */
> +    1 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  3, /* issue_rate  */
> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
> +  "32:16",	/* function_align.  */
> +  "4",		/* jump_align.  */
> +  "32:16",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  4,	/* fma_reassoc_width.  */
> +  2,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> +   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
> +   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
> +  &generic_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_NEOVERSE512TVB.  */
> diff --git a/gcc/config/aarch64/tuning_models/neoversen1.h b/gcc/config/aarch64/tuning_models/neoversen1.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..132166d3d06430b725e4448937332cc159c11cda
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/neoversen1.h
> @@ -0,0 +1,60 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_NEOVERSEN1
> +#define GCC_AARCH64_H_NEOVERSEN1
> +
> +#include "generic.h"
> +
> +static const struct tune_params neoversen1_tunings =
> +{
> +  &cortexa76_extra_costs,
> +  &generic_addrcost_table,
> +  &generic_regmove_cost,
> +  &cortexa57_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    2, /* store_int.  */
> +    5, /* load_fp.  */
> +    2, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  3, /* issue_rate  */
> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
> +  "32:16",	/* function_align.  */
> +  "4",		/* jump_align.  */
> +  "32:16",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  2,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
> +  &generic_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_NEOVERSEN1.  */
> diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..395a6d82b8403e586bf179cade055543cf9b9eb0
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/neoversen2.h
> @@ -0,0 +1,245 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_NEOVERSEN2
> +#define GCC_AARCH64_H_NEOVERSEN2
> +
> +#include "generic.h"
> +
> +static const struct cpu_addrcost_table neoversen2_addrcost_table =
> +{
> +    {
> +      1, /* hi  */
> +      0, /* si  */
> +      0, /* di  */
> +      1, /* ti  */
> +    },
> +  0, /* pre_modify  */
> +  0, /* post_modify  */
> +  2, /* post_modify_ld3_st3  */
> +  2, /* post_modify_ld4_st4  */
> +  0, /* register_offset  */
> +  0, /* register_sextend  */
> +  0, /* register_zextend  */
> +  0 /* imm_offset  */
> +};
> +
> +static const struct cpu_regmove_cost neoversen2_regmove_cost =
> +{
> +  1, /* GP2GP  */
> +  /* Spilling to int<->fp instead of memory is recommended so set
> +     realistic costs compared to memmov_cost.  */
> +  3, /* GP2FP  */
> +  2, /* FP2GP  */
> +  2 /* FP2FP  */
> +};
> +
> +static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
> +{
> +  2, /* int_stmt_cost  */
> +  2, /* fp_stmt_cost  */
> +  2, /* ld2_st2_permute_cost */
> +  2, /* ld3_st3_permute_cost  */
> +  3, /* ld4_st4_permute_cost  */
> +  3, /* permute_cost  */
> +  4, /* reduc_i8_cost  */
> +  4, /* reduc_i16_cost  */
> +  2, /* reduc_i32_cost  */
> +  2, /* reduc_i64_cost  */
> +  6, /* reduc_f16_cost  */
> +  4, /* reduc_f32_cost  */
> +  2, /* reduc_f64_cost  */
> +  2, /* store_elt_extra_cost  */
> +  /* This value is just inherited from the Cortex-A57 table.  */
> +  8, /* vec_to_scalar_cost  */
> +  /* This depends very much on what the scalar value is and
> +     where it comes from.  E.g. some constants take two dependent
> +     instructions or a load, while others might be moved from a GPR.
> +     4 seems to be a reasonable compromise in practice.  */
> +  4, /* scalar_to_vec_cost  */
> +  4, /* align_load_cost  */
> +  4, /* unalign_load_cost  */
> +  /* Although stores have a latency of 2 and compete for the
> +     vector pipes, in practice it's better not to model that.  */
> +  1, /* unalign_store_cost  */
> +  1  /* store_cost  */
> +};
> +
> +static const sve_vec_cost neoversen2_sve_vector_cost =
> +{
> +  {
> +    2, /* int_stmt_cost  */
> +    2, /* fp_stmt_cost  */
> +    3, /* ld2_st2_permute_cost  */
> +    4, /* ld3_st3_permute_cost  */
> +    4, /* ld4_st4_permute_cost  */
> +    3, /* permute_cost  */
> +    /* Theoretically, a reduction involving 15 scalar ADDs could
> +       complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
> +       completes in 11 cycles, so give it a cost of 15 + 6.  */
> +    21, /* reduc_i8_cost  */
> +    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
> +    13, /* reduc_i16_cost  */
> +    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
> +    9, /* reduc_i32_cost  */
> +    /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1.  */
> +    2, /* reduc_i64_cost  */
> +    /* Theoretically, a reduction involving 7 scalar FADDs could
> +       complete in ~8 cycles and would have a cost of 14.  FADDV
> +       completes in 6 cycles, so give it a cost of 14 - 2.  */
> +    12, /* reduc_f16_cost  */
> +    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0.  */
> +    6, /* reduc_f32_cost  */
> +    /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0.  */
> +    2, /* reduc_f64_cost  */
> +    2, /* store_elt_extra_cost  */
> +    /* This value is just inherited from the Cortex-A57 table.  */
> +    8, /* vec_to_scalar_cost  */
> +    /* See the comment above the Advanced SIMD versions.  */
> +    4, /* scalar_to_vec_cost  */
> +    4, /* align_load_cost  */
> +    4, /* unalign_load_cost  */
> +    /* Although stores have a latency of 2 and compete for the
> +       vector pipes, in practice it's better not to model that.  */
> +    1, /* unalign_store_cost  */
> +    1  /* store_cost  */
> +  },
> +  3, /* clast_cost  */
> +  10, /* fadda_f16_cost  */
> +  6, /* fadda_f32_cost  */
> +  4, /* fadda_f64_cost  */
> +  /* A strided Advanced SIMD x64 load would take two parallel FP loads
> +     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
> +     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
> +     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
> +     (cost 2) to that, to avoid the difference being lost in rounding.
> +
> +     There is no easy comparison between a strided Advanced SIMD x32 load
> +     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
> +     operation more than a 64-bit gather.  */
> +  14, /* gather_load_x32_cost  */
> +  12, /* gather_load_x64_cost  */
> +  3 /* scatter_store_elt_cost  */
> +};
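> +
> +/* Worked example for the FP reductions above: for reduc_f16_cost the
> +   comments assume 7 scalar FADDs (cost 14, ~8 cycles) against an FADDV
> +   that completes in 6 cycles, so the vector form is credited 2 cycles
> +   rather than penalised, giving 14 - 2 = 12.  */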
> +
> +static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
> +{
> +  3, /* loads_stores_per_cycle  */
> +  2, /* stores_per_cycle  */
> +  4, /* general_ops_per_cycle  */
> +  0, /* fp_simd_load_general_ops  */
> +  1 /* fp_simd_store_general_ops  */
> +};
> +
> +static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
> +{
> +  {
> +    3, /* loads_stores_per_cycle  */
> +    2, /* stores_per_cycle  */
> +    2, /* general_ops_per_cycle  */
> +    0, /* fp_simd_load_general_ops  */
> +    1 /* fp_simd_store_general_ops  */
> +  },
> +  2, /* ld2_st2_general_ops  */
> +  2, /* ld3_st3_general_ops  */
> +  3 /* ld4_st4_general_ops  */
> +};
> +
> +static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
> +{
> +  {
> +    {
> +      3, /* loads_per_cycle  */
> +      2, /* stores_per_cycle  */
> +      2, /* general_ops_per_cycle  */
> +      0, /* fp_simd_load_general_ops  */
> +      1 /* fp_simd_store_general_ops  */
> +    },
> +    2, /* ld2_st2_general_ops  */
> +    3, /* ld3_st3_general_ops  */
> +    3 /* ld4_st4_general_ops  */
> +  },
> +  2, /* pred_ops_per_cycle  */
> +  2, /* while_pred_ops  */
> +  2, /* int_cmp_pred_ops  */
> +  1, /* fp_cmp_pred_ops  */
> +  1, /* gather_scatter_pair_general_ops  */
> +  1 /* gather_scatter_pair_pred_ops  */
> +};
> +
> +static const aarch64_vec_issue_info neoversen2_vec_issue_info =
> +{
> +  &neoversen2_scalar_issue_info,
> +  &neoversen2_advsimd_issue_info,
> +  &neoversen2_sve_issue_info
> +};
> +
> +/* Neoverse N2 costs for vector insn classes.  */
> +static const struct cpu_vector_cost neoversen2_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  2, /* scalar_fp_stmt_cost  */
> +  4, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  1, /* cond_taken_branch_cost  */
> +  1, /* cond_not_taken_branch_cost  */
> +  &neoversen2_advsimd_vector_cost, /* advsimd  */
> +  &neoversen2_sve_vector_cost, /* sve  */
> +  &neoversen2_vec_issue_info /* issue_info  */
> +};
> +
> +static const struct tune_params neoversen2_tunings =
> +{
> +  &cortexa76_extra_costs,
> +  &neoversen2_addrcost_table,
> +  &neoversen2_regmove_cost,
> +  &neoversen2_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_128, /* sve_width  */
> +  { 4, /* load_int.  */
> +    1, /* store_int.  */
> +    6, /* load_fp.  */
> +    2, /* store_fp.  */
> +    6, /* load_pred.  */
> +    1 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  3, /* issue_rate  */
> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
> +  "32:16",	/* function_align.  */
> +  "4",		/* jump_align.  */
> +  "32:16",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  2,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
> +   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> +   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
> +   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
> +  &generic_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_NEOVERSEN2.  */
> diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..584a5000e06f598dcdd3bcc533dc6dbc642223ca
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/neoversev1.h
> @@ -0,0 +1,237 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_NEOVERSEV1
> +#define GCC_AARCH64_H_NEOVERSEV1
> +
> +#include "generic.h"
> +
> +static const struct cpu_addrcost_table neoversev1_addrcost_table =
> +{
> +    {
> +      1, /* hi  */
> +      0, /* si  */
> +      0, /* di  */
> +      1, /* ti  */
> +    },
> +  0, /* pre_modify  */
> +  0, /* post_modify  */
> +  3, /* post_modify_ld3_st3  */
> +  3, /* post_modify_ld4_st4  */
> +  0, /* register_offset  */
> +  0, /* register_sextend  */
> +  0, /* register_zextend  */
> +  0 /* imm_offset  */
> +};
> +
> +static const struct cpu_regmove_cost neoversev1_regmove_cost =
> +{
> +  1, /* GP2GP  */
> +  /* Spilling to int<->fp instead of memory is recommended so set
> +     realistic costs compared to memmov_cost.  */
> +  3, /* GP2FP  */
> +  2, /* FP2GP  */
> +  2 /* FP2FP  */
> +};
> +
> +static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
> +{
> +  2, /* int_stmt_cost  */
> +  2, /* fp_stmt_cost  */
> +  4, /* ld2_st2_permute_cost */
> +  4, /* ld3_st3_permute_cost  */
> +  5, /* ld4_st4_permute_cost  */
> +  3, /* permute_cost  */
> +  4, /* reduc_i8_cost  */
> +  4, /* reduc_i16_cost  */
> +  2, /* reduc_i32_cost  */
> +  2, /* reduc_i64_cost  */
> +  6, /* reduc_f16_cost  */
> +  3, /* reduc_f32_cost  */
> +  2, /* reduc_f64_cost  */
> +  2, /* store_elt_extra_cost  */
> +  /* This value is just inherited from the Cortex-A57 table.  */
> +  8, /* vec_to_scalar_cost  */
> +  /* This depends very much on what the scalar value is and
> +     where it comes from.  E.g. some constants take two dependent
> +     instructions or a load, while others might be moved from a GPR.
> +     4 seems to be a reasonable compromise in practice.  */
> +  4, /* scalar_to_vec_cost  */
> +  4, /* align_load_cost  */
> +  4, /* unalign_load_cost  */
> +  /* Although stores have a latency of 2 and compete for the
> +     vector pipes, in practice it's better not to model that.  */
> +  1, /* unalign_store_cost  */
> +  1  /* store_cost  */
> +};
> +
> +static const sve_vec_cost neoversev1_sve_vector_cost =
> +{
> +  {
> +    2, /* int_stmt_cost  */
> +    2, /* fp_stmt_cost  */
> +    4, /* ld2_st2_permute_cost  */
> +    7, /* ld3_st3_permute_cost  */
> +    8, /* ld4_st4_permute_cost  */
> +    3, /* permute_cost  */
> +    /* Theoretically, a reduction involving 31 scalar ADDs could
> +       complete in ~9 cycles and would have a cost of 31.  [SU]ADDV
> +       completes in 14 cycles, so give it a cost of 31 + 5.  */
> +    36, /* reduc_i8_cost  */
> +    /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7.  */
> +    22, /* reduc_i16_cost  */
> +    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7.  */
> +    14, /* reduc_i32_cost  */
> +    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8.  */
> +    11, /* reduc_i64_cost  */
> +    /* Theoretically, a reduction involving 15 scalar FADDs could
> +       complete in ~9 cycles and would have a cost of 30.  FADDV
> +       completes in 13 cycles, so give it a cost of 30 + 4.  */
> +    34, /* reduc_f16_cost  */
> +    /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5.  */
> +    19, /* reduc_f32_cost  */
> +    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5.  */
> +    11, /* reduc_f64_cost  */
> +    2, /* store_elt_extra_cost  */
> +    /* This value is just inherited from the Cortex-A57 table.  */
> +    8, /* vec_to_scalar_cost  */
> +    /* See the comment above the Advanced SIMD versions.  */
> +    4, /* scalar_to_vec_cost  */
> +    4, /* align_load_cost  */
> +    4, /* unalign_load_cost  */
> +    /* Although stores have a latency of 2 and compete for the
> +       vector pipes, in practice it's better not to model that.  */
> +    1, /* unalign_store_cost  */
> +    1  /* store_cost  */
> +  },
> +  3, /* clast_cost  */
> +  19, /* fadda_f16_cost  */
> +  11, /* fadda_f32_cost  */
> +  8, /* fadda_f64_cost  */
> +  32, /* gather_load_x32_cost  */
> +  16, /* gather_load_x64_cost  */
> +  3 /* scatter_store_elt_cost  */
> +};
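> +
> +/* A short note on the element counts above: neoversev1_tunings below uses
> +   SVE_256, so a full vector holds 32 bytes and an 8-bit integer reduction
> +   is compared against 31 scalar ADDs (31 + 5 = 36), twice the 15 scalar
> +   ADDs that the other SVE cost tables in this series assume.  */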
> +
> +static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
> +{
> +  3, /* loads_stores_per_cycle  */
> +  2, /* stores_per_cycle  */
> +  4, /* general_ops_per_cycle  */
> +  0, /* fp_simd_load_general_ops  */
> +  1 /* fp_simd_store_general_ops  */
> +};
> +
> +static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
> +{
> +  {
> +    3, /* loads_stores_per_cycle  */
> +    2, /* stores_per_cycle  */
> +    4, /* general_ops_per_cycle  */
> +    0, /* fp_simd_load_general_ops  */
> +    1 /* fp_simd_store_general_ops  */
> +  },
> +  2, /* ld2_st2_general_ops  */
> +  2, /* ld3_st3_general_ops  */
> +  3 /* ld4_st4_general_ops  */
> +};
> +
> +static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
> +{
> +  {
> +    {
> +      2, /* loads_per_cycle  */
> +      2, /* stores_per_cycle  */
> +      2, /* general_ops_per_cycle  */
> +      0, /* fp_simd_load_general_ops  */
> +      1 /* fp_simd_store_general_ops  */
> +    },
> +    2, /* ld2_st2_general_ops  */
> +    2, /* ld3_st3_general_ops  */
> +    3 /* ld4_st4_general_ops  */
> +  },
> +  1, /* pred_ops_per_cycle  */
> +  2, /* while_pred_ops  */
> +  2, /* int_cmp_pred_ops  */
> +  1, /* fp_cmp_pred_ops  */
> +  1, /* gather_scatter_pair_general_ops  */
> +  1 /* gather_scatter_pair_pred_ops  */
> +};
> +
> +static const aarch64_vec_issue_info neoversev1_vec_issue_info =
> +{
> +  &neoversev1_scalar_issue_info,
> +  &neoversev1_advsimd_issue_info,
> +  &neoversev1_sve_issue_info
> +};
> +
> +/* Neoverse V1 costs for vector insn classes.  */
> +static const struct cpu_vector_cost neoversev1_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  2, /* scalar_fp_stmt_cost  */
> +  4, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  1, /* cond_taken_branch_cost  */
> +  1, /* cond_not_taken_branch_cost  */
> +  &neoversev1_advsimd_vector_cost, /* advsimd  */
> +  &neoversev1_sve_vector_cost, /* sve  */
> +  &neoversev1_vec_issue_info /* issue_info  */
> +};
> +
> +static const struct tune_params neoversev1_tunings =
> +{
> +  &cortexa76_extra_costs,
> +  &neoversev1_addrcost_table,
> +  &neoversev1_regmove_cost,
> +  &neoversev1_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_256, /* sve_width  */
> +  { 4, /* load_int.  */
> +    2, /* store_int.  */
> +    6, /* load_fp.  */
> +    2, /* store_fp.  */
> +    6, /* load_pred.  */
> +    1 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  3, /* issue_rate  */
> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
> +  "32:16",	/* function_align.  */
> +  "4",		/* jump_align.  */
> +  "32:16",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  4,	/* fma_reassoc_width.  */
> +  2,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> +   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
> +   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
> +   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
> +  &generic_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +
> +#endif /* GCC_AARCH64_H_NEOVERSEV1.  */
> diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..28d4244ef4c99ecdffb7408e39dc21bc191223de
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/neoversev2.h
> @@ -0,0 +1,245 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_NEOVERSEV2
> +#define GCC_AARCH64_H_NEOVERSEV2
> +
> +#include "generic.h"
> +
> +static const struct cpu_addrcost_table neoversev2_addrcost_table =
> +{
> +    {
> +      1, /* hi  */
> +      0, /* si  */
> +      0, /* di  */
> +      1, /* ti  */
> +    },
> +  0, /* pre_modify  */
> +  0, /* post_modify  */
> +  2, /* post_modify_ld3_st3  */
> +  2, /* post_modify_ld4_st4  */
> +  0, /* register_offset  */
> +  0, /* register_sextend  */
> +  0, /* register_zextend  */
> +  0 /* imm_offset  */
> +};
> +
> +static const struct cpu_regmove_cost neoversev2_regmove_cost =
> +{
> +  1, /* GP2GP  */
> +  /* Spilling to int<->fp instead of memory is recommended so set
> +     realistic costs compared to memmov_cost.  */
> +  3, /* GP2FP  */
> +  2, /* FP2GP  */
> +  2 /* FP2FP  */
> +};
> +
> +static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
> +{
> +  2, /* int_stmt_cost  */
> +  2, /* fp_stmt_cost  */
> +  2, /* ld2_st2_permute_cost */
> +  2, /* ld3_st3_permute_cost  */
> +  3, /* ld4_st4_permute_cost  */
> +  3, /* permute_cost  */
> +  4, /* reduc_i8_cost  */
> +  4, /* reduc_i16_cost  */
> +  2, /* reduc_i32_cost  */
> +  2, /* reduc_i64_cost  */
> +  6, /* reduc_f16_cost  */
> +  3, /* reduc_f32_cost  */
> +  2, /* reduc_f64_cost  */
> +  2, /* store_elt_extra_cost  */
> +  /* This value is just inherited from the Cortex-A57 table.  */
> +  8, /* vec_to_scalar_cost  */
> +  /* This depends very much on what the scalar value is and
> +     where it comes from.  E.g. some constants take two dependent
> +     instructions or a load, while others might be moved from a GPR.
> +     4 seems to be a reasonable compromise in practice.  */
> +  4, /* scalar_to_vec_cost  */
> +  4, /* align_load_cost  */
> +  4, /* unalign_load_cost  */
> +  /* Although stores have a latency of 2 and compete for the
> +     vector pipes, in practice it's better not to model that.  */
> +  1, /* unalign_store_cost  */
> +  1  /* store_cost  */
> +};
> +
> +static const sve_vec_cost neoversev2_sve_vector_cost =
> +{
> +  {
> +    2, /* int_stmt_cost  */
> +    2, /* fp_stmt_cost  */
> +    3, /* ld2_st2_permute_cost  */
> +    3, /* ld3_st3_permute_cost  */
> +    4, /* ld4_st4_permute_cost  */
> +    3, /* permute_cost  */
> +    /* Theoretically, a reduction involving 15 scalar ADDs could
> +       complete in ~3 cycles and would have a cost of 15.  [SU]ADDV
> +       completes in 11 cycles, so give it a cost of 15 + 8.  */
> +    21, /* reduc_i8_cost  */
> +    /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7.  */
> +    14, /* reduc_i16_cost  */
> +    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4.  */
> +    7, /* reduc_i32_cost  */
> +    /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1.  */
> +    2, /* reduc_i64_cost  */
> +    /* Theoretically, a reduction involving 7 scalar FADDs could
> +       complete in ~6 cycles and would have a cost of 14.  FADDV
> +       completes in 8 cycles, so give it a cost of 14 + 2.  */
> +    16, /* reduc_f16_cost  */
> +    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
> +    8, /* reduc_f32_cost  */
> +    /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2.  */
> +    4, /* reduc_f64_cost  */
> +    2, /* store_elt_extra_cost  */
> +    /* This value is just inherited from the Cortex-A57 table.  */
> +    8, /* vec_to_scalar_cost  */
> +    /* See the comment above the Advanced SIMD versions.  */
> +    4, /* scalar_to_vec_cost  */
> +    4, /* align_load_cost  */
> +    4, /* unalign_load_cost  */
> +    /* Although stores have a latency of 2 and compete for the
> +       vector pipes, in practice it's better not to model that.  */
> +    1, /* unalign_store_cost  */
> +    1  /* store_cost  */
> +  },
> +  3, /* clast_cost  */
> +  10, /* fadda_f16_cost  */
> +  6, /* fadda_f32_cost  */
> +  4, /* fadda_f64_cost  */
> +  /* A strided Advanced SIMD x64 load would take two parallel FP loads
> +     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
> +     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
> +     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
> +     (cost 2) to that, to avoid the difference being lost in rounding.
> +
> +     There is no easy comparison between a strided Advanced SIMD x32 load
> +     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
> +     operation more than a 64-bit gather.  */
> +  14, /* gather_load_x32_cost  */
> +  12, /* gather_load_x64_cost  */
> +  3 /* scatter_store_elt_cost  */
> +};
> +
> +static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
> +{
> +  3, /* loads_stores_per_cycle  */
> +  2, /* stores_per_cycle  */
> +  6, /* general_ops_per_cycle  */
> +  0, /* fp_simd_load_general_ops  */
> +  1 /* fp_simd_store_general_ops  */
> +};
> +
> +static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
> +{
> +  {
> +    3, /* loads_stores_per_cycle  */
> +    2, /* stores_per_cycle  */
> +    4, /* general_ops_per_cycle  */
> +    0, /* fp_simd_load_general_ops  */
> +    1 /* fp_simd_store_general_ops  */
> +  },
> +  2, /* ld2_st2_general_ops  */
> +  2, /* ld3_st3_general_ops  */
> +  3 /* ld4_st4_general_ops  */
> +};
> +
> +static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
> +{
> +  {
> +    {
> +      3, /* loads_per_cycle  */
> +      2, /* stores_per_cycle  */
> +      4, /* general_ops_per_cycle  */
> +      0, /* fp_simd_load_general_ops  */
> +      1 /* fp_simd_store_general_ops  */
> +    },
> +    2, /* ld2_st2_general_ops  */
> +    3, /* ld3_st3_general_ops  */
> +    3 /* ld4_st4_general_ops  */
> +  },
> +  2, /* pred_ops_per_cycle  */
> +  2, /* while_pred_ops  */
> +  2, /* int_cmp_pred_ops  */
> +  1, /* fp_cmp_pred_ops  */
> +  1, /* gather_scatter_pair_general_ops  */
> +  1 /* gather_scatter_pair_pred_ops  */
> +};
> +
> +static const aarch64_vec_issue_info neoversev2_vec_issue_info =
> +{
> +  &neoversev2_scalar_issue_info,
> +  &neoversev2_advsimd_issue_info,
> +  &neoversev2_sve_issue_info
> +};
> +
> +/* Demeter costs for vector insn classes.  */
> +static const struct cpu_vector_cost neoversev2_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  2, /* scalar_fp_stmt_cost  */
> +  4, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  1, /* cond_taken_branch_cost  */
> +  1, /* cond_not_taken_branch_cost  */
> +  &neoversev2_advsimd_vector_cost, /* advsimd  */
> +  &neoversev2_sve_vector_cost, /* sve  */
> +  &neoversev2_vec_issue_info /* issue_info  */
> +};
> +
> +static const struct tune_params neoversev2_tunings =
> +{
> +  &cortexa76_extra_costs,
> +  &neoversev2_addrcost_table,
> +  &neoversev2_regmove_cost,
> +  &neoversev2_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_128, /* sve_width  */
> +  { 4, /* load_int.  */
> +    2, /* store_int.  */
> +    6, /* load_fp.  */
> +    1, /* store_fp.  */
> +    6, /* load_pred.  */
> +    2 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  5, /* issue_rate  */
> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
> +  "32:16",	/* function_align.  */
> +  "4",		/* jump_align.  */
> +  "32:16",	/* loop_align.  */
> +  3,	/* int_reassoc_width.  */
> +  6,	/* fp_reassoc_width.  */
> +  4,	/* fma_reassoc_width.  */
> +  3,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
> +   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> +   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
> +   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
> +  &generic_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_NEOVERSEV2.  */
> diff --git a/gcc/config/aarch64/tuning_models/qdf24xx.h b/gcc/config/aarch64/tuning_models/qdf24xx.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..29c9b9f5843acc15450a2492b141c02ee48a3f13
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/qdf24xx.h
> @@ -0,0 +1,137 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_QDF24XX
> +#define GCC_AARCH64_H_QDF24XX
> +
> +#include "generic.h"
> +
> +static const struct cpu_addrcost_table qdf24xx_addrcost_table =
> +{
> +    {
> +      1, /* hi  */
> +      1, /* si  */
> +      1, /* di  */
> +      2, /* ti  */
> +    },
> +  1, /* pre_modify  */
> +  1, /* post_modify  */
> +  1, /* post_modify_ld3_st3  */
> +  1, /* post_modify_ld4_st4  */
> +  3, /* register_offset  */
> +  3, /* register_sextend  */
> +  3, /* register_zextend  */
> +  2, /* imm_offset  */
> +};
> +
> +static const struct cpu_regmove_cost qdf24xx_regmove_cost =
> +{
> +  2, /* GP2GP  */
> +  /* Avoid the use of int<->fp moves for spilling.  */
> +  6, /* GP2FP  */
> +  6, /* FP2GP  */
> +  4 /* FP2FP  */
> +};
> +
> +static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
> +{
> +  1, /* int_stmt_cost  */
> +  3, /* fp_stmt_cost  */
> +  0, /* ld2_st2_permute_cost  */
> +  0, /* ld3_st3_permute_cost  */
> +  0, /* ld4_st4_permute_cost  */
> +  2, /* permute_cost  */
> +  1, /* reduc_i8_cost  */
> +  1, /* reduc_i16_cost  */
> +  1, /* reduc_i32_cost  */
> +  1, /* reduc_i64_cost  */
> +  1, /* reduc_f16_cost  */
> +  1, /* reduc_f32_cost  */
> +  1, /* reduc_f64_cost  */
> +  1, /* store_elt_extra_cost  */
> +  1, /* vec_to_scalar_cost  */
> +  1, /* scalar_to_vec_cost  */
> +  1, /* align_load_cost  */
> +  1, /* unalign_load_cost  */
> +  1, /* unalign_store_cost  */
> +  1  /* store_cost  */
> +};
> +
> +/* QDF24XX costs for vector insn classes.  */
> +static const struct cpu_vector_cost qdf24xx_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
> +  1, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  3, /* cond_taken_branch_cost  */
> +  1, /* cond_not_taken_branch_cost  */
> +  &qdf24xx_advsimd_vector_cost, /* advsimd  */
> +  nullptr, /* sve  */
> +  nullptr /* issue_info  */
> +};
> +
> +static const cpu_prefetch_tune qdf24xx_prefetch_tune =
> +{
> +  4,			/* num_slots  */
> +  32,			/* l1_cache_size  */
> +  64,			/* l1_cache_line_size  */
> +  512,			/* l2_cache_size  */
> +  false,		/* prefetch_dynamic_strides */
> +  2048,			/* minimum_stride */
> +  3			/* default_opt_level  */
> +};
> +
> +static const struct tune_params qdf24xx_tunings =
> +{
> +  &qdf24xx_extra_costs,
> +  &qdf24xx_addrcost_table,
> +  &qdf24xx_regmove_cost,
> +  &qdf24xx_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  4, /* issue_rate  */
> +  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> +   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
> +  "16",	/* function_align.  */
> +  "8",	/* jump_align.  */
> +  "16",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  1,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
> +  &qdf24xx_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_QDF24XX.  */
> diff --git a/gcc/config/aarch64/tuning_models/saphira.h b/gcc/config/aarch64/tuning_models/saphira.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..e584d316bb7c3c2d232cf7623a92100ad261f07d
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/saphira.h
> @@ -0,0 +1,63 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_SAPHIRA
> +#define GCC_AARCH64_H_SAPHIRA
> +
> +#include "generic.h"
> +
> +/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
> +   for now.  */
> +static const struct tune_params saphira_tunings =
> +{
> +  &generic_extra_costs,
> +  &generic_addrcost_table,
> +  &generic_regmove_cost,
> +  &generic_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  4, /* issue_rate  */
> +  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> +   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
> +  "16",	/* function_align.  */
> +  "8",	/* jump_align.  */
> +  "16",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  1,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
> +  &generic_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_SAPHIRA.  */
> diff --git a/gcc/config/aarch64/tuning_models/thunderx.h b/gcc/config/aarch64/tuning_models/thunderx.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..dd4b9d539fc5cf2bd20d84e91d6b72fa7237f99f
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/thunderx.h
> @@ -0,0 +1,117 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_THUNDERX
> +#define GCC_AARCH64_H_THUNDERX
> +
> +#include "generic.h"
> +
> +static const struct cpu_regmove_cost thunderx_regmove_cost =
> +{
> +  2, /* GP2GP  */
> +  2, /* GP2FP  */
> +  6, /* FP2GP  */
> +  4 /* FP2FP  */
> +};
> +
> +static const advsimd_vec_cost thunderx_advsimd_vector_cost =
> +{
> +  4, /* int_stmt_cost  */
> +  1, /* fp_stmt_cost  */
> +  0, /* ld2_st2_permute_cost  */
> +  0, /* ld3_st3_permute_cost  */
> +  0, /* ld4_st4_permute_cost  */
> +  4, /* permute_cost  */
> +  2, /* reduc_i8_cost  */
> +  2, /* reduc_i16_cost  */
> +  2, /* reduc_i32_cost  */
> +  2, /* reduc_i64_cost  */
> +  2, /* reduc_f16_cost  */
> +  2, /* reduc_f32_cost  */
> +  2, /* reduc_f64_cost  */
> +  2, /* store_elt_extra_cost  */
> +  2, /* vec_to_scalar_cost  */
> +  2, /* scalar_to_vec_cost  */
> +  3, /* align_load_cost  */
> +  5, /* unalign_load_cost  */
> +  5, /* unalign_store_cost  */
> +  1  /* store_cost  */
> +};
> +
> +/* ThunderX costs for vector insn classes.  */
> +static const struct cpu_vector_cost thunderx_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
> +  3, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  3, /* cond_taken_branch_cost  */
> +  3, /* cond_not_taken_branch_cost  */
> +  &thunderx_advsimd_vector_cost, /* advsimd  */
> +  nullptr, /* sve  */
> +  nullptr /* issue_info  */
> +};
> +
> +static const cpu_prefetch_tune thunderx_prefetch_tune =
> +{
> +  8,			/* num_slots  */
> +  32,			/* l1_cache_size  */
> +  128,			/* l1_cache_line_size  */
> +  -1,			/* l2_cache_size  */
> +  true,			/* prefetch_dynamic_strides */
> +  -1,			/* minimum_stride */
> +  -1			/* default_opt_level  */
> +};
> +
> +static const struct tune_params thunderx_tunings =
> +{
> +  &thunderx_extra_costs,
> +  &generic_addrcost_table,
> +  &thunderx_regmove_cost,
> +  &thunderx_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 6, /* load_int.  */
> +    6, /* store_int.  */
> +    6, /* load_fp.  */
> +    6, /* store_fp.  */
> +    6, /* load_pred.  */
> +    6 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  2, /* issue_rate  */
> +  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
> +  "8",	/* function_align.  */
> +  "8",	/* jump_align.  */
> +  "8",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  1,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
> +  &thunderx_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_THUNDERX.  */
> diff --git a/gcc/config/aarch64/tuning_models/thunderx2t99.h b/gcc/config/aarch64/tuning_models/thunderx2t99.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..0a376e0bab37b0b5bc1ea23de0e96a9245846fd7
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/thunderx2t99.h
> @@ -0,0 +1,137 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_THUNDERX2T99
> +#define GCC_AARCH64_H_THUNDERX2T99
> +
> +#include "generic.h"
> +
> +static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
> +{
> +    {
> +      1, /* hi  */
> +      1, /* si  */
> +      1, /* di  */
> +      2, /* ti  */
> +    },
> +  0, /* pre_modify  */
> +  0, /* post_modify  */
> +  0, /* post_modify_ld3_st3  */
> +  0, /* post_modify_ld4_st4  */
> +  2, /* register_offset  */
> +  3, /* register_sextend  */
> +  3, /* register_zextend  */
> +  0, /* imm_offset  */
> +};
> +
> +static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
> +{
> +  1, /* GP2GP  */
> +  /* Avoid the use of int<->fp moves for spilling.  */
> +  5, /* GP2FP  */
> +  6, /* FP2GP  */
> +  3, /* FP2FP  */
> +};
> +
> +static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
> +{
> +  4, /* int_stmt_cost  */
> +  5, /* fp_stmt_cost  */
> +  0, /* ld2_st2_permute_cost  */
> +  0, /* ld3_st3_permute_cost  */
> +  0, /* ld4_st4_permute_cost  */
> +  10, /* permute_cost  */
> +  6, /* reduc_i8_cost  */
> +  6, /* reduc_i16_cost  */
> +  6, /* reduc_i32_cost  */
> +  6, /* reduc_i64_cost  */
> +  6, /* reduc_f16_cost  */
> +  6, /* reduc_f32_cost  */
> +  6, /* reduc_f64_cost  */
> +  6, /* store_elt_extra_cost  */
> +  6, /* vec_to_scalar_cost  */
> +  5, /* scalar_to_vec_cost  */
> +  4, /* align_load_cost  */
> +  4, /* unalign_load_cost  */
> +  1, /* unalign_store_cost  */
> +  1  /* store_cost  */
> +};
> +
> +/* Costs for vector insn classes for Vulcan.  */
> +static const struct cpu_vector_cost thunderx2t99_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  6, /* scalar_fp_stmt_cost  */
> +  4, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  2, /* cond_taken_branch_cost  */
> +  1,  /* cond_not_taken_branch_cost  */
> +  &thunderx2t99_advsimd_vector_cost, /* advsimd  */
> +  nullptr, /* sve  */
> +  nullptr /* issue_info  */
> +};
> +
> +static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
> +{
> +  8,			/* num_slots  */
> +  32,			/* l1_cache_size  */
> +  64,			/* l1_cache_line_size  */
> +  256,			/* l2_cache_size  */
> +  true,			/* prefetch_dynamic_strides */
> +  -1,			/* minimum_stride */
> +  -1			/* default_opt_level  */
> +};
> +
> +static const struct tune_params thunderx2t99_tunings =
> +{
> +  &thunderx2t99_extra_costs,
> +  &thunderx2t99_addrcost_table,
> +  &thunderx2t99_regmove_cost,
> +  &thunderx2t99_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  4, /* issue_rate.  */
> +  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
> +   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
> +  "16",	/* function_align.  */
> +  "8",	/* jump_align.  */
> +  "16",	/* loop_align.  */
> +  3,	/* int_reassoc_width.  */
> +  2,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  2,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> +  &thunderx2t99_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_THUNDERX2T99.  */
> diff --git a/gcc/config/aarch64/tuning_models/thunderx3t110.h b/gcc/config/aarch64/tuning_models/thunderx3t110.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..65203b4af132e12e4994013fbab228bd3873b756
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/thunderx3t110.h
> @@ -0,0 +1,136 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_THUNDERX3T110
> +#define GCC_AARCH64_H_THUNDERX3T110
> +
> +#include "generic.h"
> +
> +static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
> +{
> +    {
> +      1, /* hi  */
> +      1, /* si  */
> +      1, /* di  */
> +      2, /* ti  */
> +    },
> +  0, /* pre_modify  */
> +  0, /* post_modify  */
> +  0, /* post_modify_ld3_st3  */
> +  0, /* post_modify_ld4_st4  */
> +  2, /* register_offset  */
> +  3, /* register_sextend  */
> +  3, /* register_zextend  */
> +  0, /* imm_offset  */
> +};
> +
> +static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
> +{
> +  1, /* GP2GP  */
> +  /* Avoid the use of int<->fp moves for spilling.  */
> +  4, /* GP2FP  */
> +  5, /* FP2GP  */
> +  4  /* FP2FP  */
> +};
> +
> +static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
> +{
> +  5, /* int_stmt_cost  */
> +  5, /* fp_stmt_cost  */
> +  0, /* ld2_st2_permute_cost  */
> +  0, /* ld3_st3_permute_cost  */
> +  0, /* ld4_st4_permute_cost  */
> +  10, /* permute_cost  */
> +  5, /* reduc_i8_cost  */
> +  5, /* reduc_i16_cost  */
> +  5, /* reduc_i32_cost  */
> +  5, /* reduc_i64_cost  */
> +  5, /* reduc_f16_cost  */
> +  5, /* reduc_f32_cost  */
> +  5, /* reduc_f64_cost  */
> +  5, /* store_elt_extra_cost  */
> +  5, /* vec_to_scalar_cost  */
> +  5, /* scalar_to_vec_cost  */
> +  4, /* align_load_cost  */
> +  4, /* unalign_load_cost  */
> +  4, /* unalign_store_cost  */
> +  4  /* store_cost  */
> +};
> +
> +static const struct cpu_vector_cost thunderx3t110_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  5, /* scalar_fp_stmt_cost  */
> +  4, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  2, /* cond_taken_branch_cost  */
> +  1,  /* cond_not_taken_branch_cost  */
> +  &thunderx3t110_advsimd_vector_cost, /* advsimd  */
> +  nullptr, /* sve  */
> +  nullptr /* issue_info  */
> +};
> +
> +static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
> +{
> +  8,			/* num_slots  */
> +  32,			/* l1_cache_size  */
> +  64,			/* l1_cache_line_size  */
> +  256,			/* l2_cache_size  */
> +  true,			/* prefetch_dynamic_strides */
> +  -1,			/* minimum_stride */
> +  -1			/* default_opt_level  */
> +};
> +
> +static const struct tune_params thunderx3t110_tunings =
> +{
> +  &thunderx3t110_extra_costs,
> +  &thunderx3t110_addrcost_table,
> +  &thunderx3t110_regmove_cost,
> +  &thunderx3t110_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  6, /* issue_rate.  */
> +  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
> +   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
> +  "16",	/* function_align.  */
> +  "8",	/* jump_align.  */
> +  "16",	/* loop_align.  */
> +  3,	/* int_reassoc_width.  */
> +  2,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  2,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> +  &thunderx3t110_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_THUNDERX3T110.  */
> diff --git a/gcc/config/aarch64/tuning_models/thunderxt88.h b/gcc/config/aarch64/tuning_models/thunderxt88.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..dcc74d31484ee6b99d37920dbfe7b1d59377d074
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/thunderxt88.h
> @@ -0,0 +1,72 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_THUNDERXT88
> +#define GCC_AARCH64_H_THUNDERXT88
> +
> +#include "generic.h"
> +#include "thunderx.h"
> +
> +static const cpu_prefetch_tune thunderxt88_prefetch_tune =
> +{
> +  8,			/* num_slots  */
> +  32,			/* l1_cache_size  */
> +  128,			/* l1_cache_line_size  */
> +  16*1024,		/* l2_cache_size  */
> +  true,			/* prefetch_dynamic_strides */
> +  -1,			/* minimum_stride */
> +  3			/* default_opt_level  */
> +};
> +
> +static const struct tune_params thunderxt88_tunings =
> +{
> +  &thunderx_extra_costs,
> +  &generic_addrcost_table,
> +  &thunderx_regmove_cost,
> +  &thunderx_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 6, /* load_int.  */
> +    6, /* store_int.  */
> +    6, /* load_fp.  */
> +    6, /* store_fp.  */
> +    6, /* load_pred.  */
> +    6 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  2, /* issue_rate  */
> +  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
> +  "8",	/* function_align.  */
> +  "8",	/* jump_align.  */
> +  "8",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  1,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  0,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
> +  &thunderxt88_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_THUNDERXT88.  */
> diff --git a/gcc/config/aarch64/tuning_models/tsv110.h b/gcc/config/aarch64/tuning_models/tsv110.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..42aeafce652fff34e3277194993dd4aa1f0383a1
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/tsv110.h
> @@ -0,0 +1,137 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_TSV110
> +#define GCC_AARCH64_H_TSV110
> +
> +#include "generic.h"
> +
> +static const struct cpu_addrcost_table tsv110_addrcost_table =
> +{
> +    {
> +      1, /* hi  */
> +      0, /* si  */
> +      0, /* di  */
> +      1, /* ti  */
> +    },
> +  0, /* pre_modify  */
> +  0, /* post_modify  */
> +  0, /* post_modify_ld3_st3  */
> +  0, /* post_modify_ld4_st4  */
> +  0, /* register_offset  */
> +  1, /* register_sextend  */
> +  1, /* register_zextend  */
> +  0, /* imm_offset  */
> +};
> +
> +static const struct cpu_regmove_cost tsv110_regmove_cost =
> +{
> +  1, /* GP2GP  */
> +  /* Avoid the use of slow int<->fp moves for spilling by setting
> +     their cost higher than memmov_cost.  */
> +  2, /* GP2FP  */
> +  3, /* FP2GP  */
> +  2  /* FP2FP  */
> +};
> +
> +static const advsimd_vec_cost tsv110_advsimd_vector_cost =
> +{
> +  2, /* int_stmt_cost  */
> +  2, /* fp_stmt_cost  */
> +  0, /* ld2_st2_permute_cost  */
> +  0, /* ld3_st3_permute_cost  */
> +  0, /* ld4_st4_permute_cost  */
> +  2, /* permute_cost  */
> +  3, /* reduc_i8_cost  */
> +  3, /* reduc_i16_cost  */
> +  3, /* reduc_i32_cost  */
> +  3, /* reduc_i64_cost  */
> +  3, /* reduc_f16_cost  */
> +  3, /* reduc_f32_cost  */
> +  3, /* reduc_f64_cost  */
> +  3, /* store_elt_extra_cost  */
> +  3, /* vec_to_scalar_cost  */
> +  2, /* scalar_to_vec_cost  */
> +  5, /* align_load_cost  */
> +  5, /* unalign_load_cost  */
> +  1, /* unalign_store_cost  */
> +  1  /* store_cost  */
> +};
> +
> +static const struct cpu_vector_cost tsv110_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
> +  5, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  1, /* cond_taken_branch_cost  */
> +  1, /* cond_not_taken_branch_cost  */
> +  &tsv110_advsimd_vector_cost, /* advsimd  */
> +  nullptr, /* sve  */
> +  nullptr /* issue_info  */
> +};
> +
> +static const cpu_prefetch_tune tsv110_prefetch_tune =
> +{
> +  0,                    /* num_slots  */
> +  64,                   /* l1_cache_size  */
> +  64,                   /* l1_cache_line_size  */
> +  512,                  /* l2_cache_size  */
> +  true,                 /* prefetch_dynamic_strides */
> +  -1,                   /* minimum_stride */
> +  -1                    /* default_opt_level  */
> +};
> +
> +static const struct tune_params tsv110_tunings =
> +{
> +  &tsv110_extra_costs,
> +  &tsv110_addrcost_table,
> +  &tsv110_regmove_cost,
> +  &tsv110_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  4,    /* issue_rate  */
> +  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
> +   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
> +  "16", /* function_align.  */
> +  "4",  /* jump_align.  */
> +  "8",  /* loop_align.  */
> +  2,    /* int_reassoc_width.  */
> +  4,    /* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  1,    /* vec_reassoc_width.  */
> +  2,    /* min_div_recip_mul_sf.  */
> +  2,    /* min_div_recip_mul_df.  */
> +  0,    /* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
> +  &tsv110_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_TSV110.  */
> diff --git a/gcc/config/aarch64/tuning_models/xgene1.h b/gcc/config/aarch64/tuning_models/xgene1.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..53a3eb0ddeb80a9735cc988e242a70e87dc90655
> --- /dev/null
> +++ b/gcc/config/aarch64/tuning_models/xgene1.h
> @@ -0,0 +1,145 @@
> +/* Tuning model description for AArch64 architecture.
> +   Copyright (C) 2009-2023 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   You should have received a copy of the GNU General Public License
> +   along with GCC; see the file COPYING3.  If not see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_AARCH64_H_XGENE1
> +#define GCC_AARCH64_H_XGENE1
> +
> +#include "generic.h"
> +
> +static const struct cpu_addrcost_table xgene1_addrcost_table =
> +{
> +    {
> +      1, /* hi  */
> +      0, /* si  */
> +      0, /* di  */
> +      1, /* ti  */
> +    },
> +  1, /* pre_modify  */
> +  1, /* post_modify  */
> +  1, /* post_modify_ld3_st3  */
> +  1, /* post_modify_ld4_st4  */
> +  0, /* register_offset  */
> +  1, /* register_sextend  */
> +  1, /* register_zextend  */
> +  0, /* imm_offset  */
> +};
> +
> +static const struct cpu_regmove_cost xgene1_regmove_cost =
> +{
> +  1, /* GP2GP  */
> +  /* Avoid the use of slow int<->fp moves for spilling by setting
> +     their cost higher than memmov_cost.  */
> +  8, /* GP2FP  */
> +  8, /* FP2GP  */
> +  2 /* FP2FP  */
> +};
> +
> +static const advsimd_vec_cost xgene1_advsimd_vector_cost =
> +{
> +  2, /* int_stmt_cost  */
> +  2, /* fp_stmt_cost  */
> +  0, /* ld2_st2_permute_cost  */
> +  0, /* ld3_st3_permute_cost  */
> +  0, /* ld4_st4_permute_cost  */
> +  2, /* permute_cost  */
> +  4, /* reduc_i8_cost  */
> +  4, /* reduc_i16_cost  */
> +  4, /* reduc_i32_cost  */
> +  4, /* reduc_i64_cost  */
> +  4, /* reduc_f16_cost  */
> +  4, /* reduc_f32_cost  */
> +  4, /* reduc_f64_cost  */
> +  4, /* store_elt_extra_cost  */
> +  4, /* vec_to_scalar_cost  */
> +  4, /* scalar_to_vec_cost  */
> +  10, /* align_load_cost  */
> +  10, /* unalign_load_cost  */
> +  2, /* unalign_store_cost  */
> +  2  /* store_cost  */
> +};
> +
> +/* Generic costs for vector insn classes.  */
> +static const struct cpu_vector_cost xgene1_vector_cost =
> +{
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
> +  5, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  2, /* cond_taken_branch_cost  */
> +  1, /* cond_not_taken_branch_cost  */
> +  &xgene1_advsimd_vector_cost, /* advsimd  */
> +  nullptr, /* sve  */
> +  nullptr /* issue_info  */
> +};
> +
> +/* Approximation modes for X-Gene 1.  */
> +static const cpu_approx_modes xgene1_approx_modes =
> +{
> +  AARCH64_APPROX_NONE,	/* division  */
> +  AARCH64_APPROX_NONE,	/* sqrt  */
> +  AARCH64_APPROX_ALL	/* recip_sqrt  */
> +};
> +
> +static const cpu_prefetch_tune xgene1_prefetch_tune =
> +{
> +  8,			/* num_slots  */
> +  32,			/* l1_cache_size  */
> +  64,			/* l1_cache_line_size  */
> +  256,			/* l2_cache_size  */
> +  true,                 /* prefetch_dynamic_strides */
> +  -1,                   /* minimum_stride */
> +  -1			/* default_opt_level  */
> +};
> +
> +static const struct tune_params xgene1_tunings =
> +{
> +  &xgene1_extra_costs,
> +  &xgene1_addrcost_table,
> +  &xgene1_regmove_cost,
> +  &xgene1_vector_cost,
> +  &generic_branch_cost,
> +  &xgene1_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 6, /* load_int.  */
> +    6, /* store_int.  */
> +    6, /* load_fp.  */
> +    6, /* store_fp.  */
> +    6, /* load_pred.  */
> +    6 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  4, /* issue_rate  */
> +  AARCH64_FUSE_NOTHING, /* fusible_ops  */
> +  "16",	/* function_align.  */
> +  "16",	/* jump_align.  */
> +  "16",	/* loop_align.  */
> +  2,	/* int_reassoc_width.  */
> +  4,	/* fp_reassoc_width.  */
> +  1,	/* fma_reassoc_width.  */
> +  1,	/* vec_reassoc_width.  */
> +  2,	/* min_div_recip_mul_sf.  */
> +  2,	/* min_div_recip_mul_df.  */
> +  17,	/* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
> +  &xgene1_prefetch_tune,
> +  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> +  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> +};
> +
> +#endif /* GCC_AARCH64_H_XGENE1.  */
> 

Patch

--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -354,2405 +354,30 @@  static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 };
 
 /* Tuning parameters.  */
-
-static const struct cpu_addrcost_table generic_addrcost_table =
-{
-    {
-      1, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      1, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* post_modify_ld3_st3  */
-  0, /* post_modify_ld4_st4  */
-  0, /* register_offset  */
-  0, /* register_sextend  */
-  0, /* register_zextend  */
-  0 /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table exynosm1_addrcost_table =
-{
-    {
-      0, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      2, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* post_modify_ld3_st3  */
-  0, /* post_modify_ld4_st4  */
-  1, /* register_offset  */
-  1, /* register_sextend  */
-  2, /* register_zextend  */
-  0, /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table xgene1_addrcost_table =
-{
-    {
-      1, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      1, /* ti  */
-    },
-  1, /* pre_modify  */
-  1, /* post_modify  */
-  1, /* post_modify_ld3_st3  */
-  1, /* post_modify_ld4_st4  */
-  0, /* register_offset  */
-  1, /* register_sextend  */
-  1, /* register_zextend  */
-  0, /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
-{
-    {
-      1, /* hi  */
-      1, /* si  */
-      1, /* di  */
-      2, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* post_modify_ld3_st3  */
-  0, /* post_modify_ld4_st4  */
-  2, /* register_offset  */
-  3, /* register_sextend  */
-  3, /* register_zextend  */
-  0, /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
-{
-    {
-      1, /* hi  */
-      1, /* si  */
-      1, /* di  */
-      2, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* post_modify_ld3_st3  */
-  0, /* post_modify_ld4_st4  */
-  2, /* register_offset  */
-  3, /* register_sextend  */
-  3, /* register_zextend  */
-  0, /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table tsv110_addrcost_table =
-{
-    {
-      1, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      1, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* post_modify_ld3_st3  */
-  0, /* post_modify_ld4_st4  */
-  0, /* register_offset  */
-  1, /* register_sextend  */
-  1, /* register_zextend  */
-  0, /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table qdf24xx_addrcost_table =
-{
-    {
-      1, /* hi  */
-      1, /* si  */
-      1, /* di  */
-      2, /* ti  */
-    },
-  1, /* pre_modify  */
-  1, /* post_modify  */
-  1, /* post_modify_ld3_st3  */
-  1, /* post_modify_ld4_st4  */
-  3, /* register_offset  */
-  3, /* register_sextend  */
-  3, /* register_zextend  */
-  2, /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table a64fx_addrcost_table =
-{
-    {
-      1, /* hi  */
-      1, /* si  */
-      1, /* di  */
-      2, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* post_modify_ld3_st3  */
-  0, /* post_modify_ld4_st4  */
-  2, /* register_offset  */
-  3, /* register_sextend  */
-  3, /* register_zextend  */
-  0, /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table neoversev1_addrcost_table =
-{
-    {
-      1, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      1, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  3, /* post_modify_ld3_st3  */
-  3, /* post_modify_ld4_st4  */
-  0, /* register_offset  */
-  0, /* register_sextend  */
-  0, /* register_zextend  */
-  0 /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table neoversen2_addrcost_table =
-{
-    {
-      1, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      1, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  2, /* post_modify_ld3_st3  */
-  2, /* post_modify_ld4_st4  */
-  0, /* register_offset  */
-  0, /* register_sextend  */
-  0, /* register_zextend  */
-  0 /* imm_offset  */
-};
-
-static const struct cpu_addrcost_table neoversev2_addrcost_table =
-{
-    {
-      1, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      1, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  2, /* post_modify_ld3_st3  */
-  2, /* post_modify_ld4_st4  */
-  0, /* register_offset  */
-  0, /* register_sextend  */
-  0, /* register_zextend  */
-  0 /* imm_offset  */
-};
-
-static const struct cpu_regmove_cost generic_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost.  */
-  5, /* GP2FP  */
-  5, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost cortexa57_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost.  */
-  5, /* GP2FP  */
-  5, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost cortexa53_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost.  */
-  5, /* GP2FP  */
-  5, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost exynosm1_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost (actual, 4 and 9).  */
-  9, /* GP2FP  */
-  9, /* FP2GP  */
-  1 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost thunderx_regmove_cost =
-{
-  2, /* GP2GP  */
-  2, /* GP2FP  */
-  6, /* FP2GP  */
-  4 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost xgene1_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost.  */
-  8, /* GP2FP  */
-  8, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost qdf24xx_regmove_cost =
-{
-  2, /* GP2GP  */
-  /* Avoid the use of int<->fp moves for spilling.  */
-  6, /* GP2FP  */
-  6, /* FP2GP  */
-  4 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of int<->fp moves for spilling.  */
-  5, /* GP2FP  */
-  6, /* FP2GP  */
-  3, /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of int<->fp moves for spilling.  */
-  4, /* GP2FP  */
-  5, /* FP2GP  */
-  4  /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost tsv110_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost.  */
-  2, /* GP2FP  */
-  3, /* FP2GP  */
-  2  /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost a64fx_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Avoid the use of slow int<->fp moves for spilling by setting
-     their cost higher than memmov_cost.  */
-  5, /* GP2FP  */
-  7, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost neoversen2_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Spilling to int<->fp instead of memory is recommended so set
-     realistic costs compared to memmov_cost.  */
-  3, /* GP2FP  */
-  2, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost neoversev1_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Spilling to int<->fp instead of memory is recommended so set
-     realistic costs compared to memmov_cost.  */
-  3, /* GP2FP  */
-  2, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-static const struct cpu_regmove_cost neoversev2_regmove_cost =
-{
-  1, /* GP2GP  */
-  /* Spilling to int<->fp instead of memory is recommended so set
-     realistic costs compared to memmov_cost.  */
-  3, /* GP2FP  */
-  2, /* FP2GP  */
-  2 /* FP2FP  */
-};
-
-/* Generic costs for Advanced SIMD vector operations.   */
-static const advsimd_vec_cost generic_advsimd_vector_cost =
-{
-  1, /* int_stmt_cost  */
-  1, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  2, /* permute_cost  */
-  2, /* reduc_i8_cost  */
-  2, /* reduc_i16_cost  */
-  2, /* reduc_i32_cost  */
-  2, /* reduc_i64_cost  */
-  2, /* reduc_f16_cost  */
-  2, /* reduc_f32_cost  */
-  2, /* reduc_f64_cost  */
-  2, /* store_elt_extra_cost  */
-  2, /* vec_to_scalar_cost  */
-  1, /* scalar_to_vec_cost  */
-  1, /* align_load_cost  */
-  1, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-/* Generic costs for SVE vector operations.  */
-static const sve_vec_cost generic_sve_vector_cost =
-{
-  {
-    1, /* int_stmt_cost  */
-    1, /* fp_stmt_cost  */
-    0, /* ld2_st2_permute_cost  */
-    0, /* ld3_st3_permute_cost  */
-    0, /* ld4_st4_permute_cost  */
-    2, /* permute_cost  */
-    2, /* reduc_i8_cost  */
-    2, /* reduc_i16_cost  */
-    2, /* reduc_i32_cost  */
-    2, /* reduc_i64_cost  */
-    2, /* reduc_f16_cost  */
-    2, /* reduc_f32_cost  */
-    2, /* reduc_f64_cost  */
-    2, /* store_elt_extra_cost  */
-    2, /* vec_to_scalar_cost  */
-    1, /* scalar_to_vec_cost  */
-    1, /* align_load_cost  */
-    1, /* unalign_load_cost  */
-    1, /* unalign_store_cost  */
-    1  /* store_cost  */
-  },
-  2, /* clast_cost  */
-  2, /* fadda_f16_cost  */
-  2, /* fadda_f32_cost  */
-  2, /* fadda_f64_cost  */
-  4, /* gather_load_x32_cost  */
-  2, /* gather_load_x64_cost  */
-  1 /* scatter_store_elt_cost  */
-};
-
-/* Generic costs for vector insn classes.  */
-static const struct cpu_vector_cost generic_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  1, /* scalar_fp_stmt_cost  */
-  1, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  3, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &generic_advsimd_vector_cost, /* advsimd  */
-  &generic_sve_vector_cost, /* sve */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost a64fx_advsimd_vector_cost =
-{
-  2, /* int_stmt_cost  */
-  5, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
-  13, /* reduc_i8_cost  */
-  13, /* reduc_i16_cost  */
-  13, /* reduc_i32_cost  */
-  13, /* reduc_i64_cost  */
-  13, /* reduc_f16_cost  */
-  13, /* reduc_f32_cost  */
-  13, /* reduc_f64_cost  */
-  13, /* store_elt_extra_cost  */
-  13, /* vec_to_scalar_cost  */
-  4, /* scalar_to_vec_cost  */
-  6, /* align_load_cost  */
-  6, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-static const sve_vec_cost a64fx_sve_vector_cost =
-{
-  {
-    2, /* int_stmt_cost  */
-    5, /* fp_stmt_cost  */
-    0, /* ld2_st2_permute_cost  */
-    0, /* ld3_st3_permute_cost  */
-    0, /* ld4_st4_permute_cost  */
-    3, /* permute_cost  */
-    13, /* reduc_i8_cost  */
-    13, /* reduc_i16_cost  */
-    13, /* reduc_i32_cost  */
-    13, /* reduc_i64_cost  */
-    13, /* reduc_f16_cost  */
-    13, /* reduc_f32_cost  */
-    13, /* reduc_f64_cost  */
-    13, /* store_elt_extra_cost  */
-    13, /* vec_to_scalar_cost  */
-    4, /* scalar_to_vec_cost  */
-    6, /* align_load_cost  */
-    6, /* unalign_load_cost  */
-    1, /* unalign_store_cost  */
-    1  /* store_cost  */
-  },
-  13, /* clast_cost  */
-  13, /* fadda_f16_cost  */
-  13, /* fadda_f32_cost  */
-  13, /* fadda_f64_cost  */
-  64, /* gather_load_x32_cost  */
-  32, /* gather_load_x64_cost  */
-  1 /* scatter_store_elt_cost  */
-};
-
-static const struct cpu_vector_cost a64fx_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  5, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  3, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &a64fx_advsimd_vector_cost, /* advsimd  */
-  &a64fx_sve_vector_cost, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
-{
-  1, /* int_stmt_cost  */
-  3, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  2, /* permute_cost  */
-  1, /* reduc_i8_cost  */
-  1, /* reduc_i16_cost  */
-  1, /* reduc_i32_cost  */
-  1, /* reduc_i64_cost  */
-  1, /* reduc_f16_cost  */
-  1, /* reduc_f32_cost  */
-  1, /* reduc_f64_cost  */
-  1, /* store_elt_extra_cost  */
-  1, /* vec_to_scalar_cost  */
-  1, /* scalar_to_vec_cost  */
-  1, /* align_load_cost  */
-  1, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-/* QDF24XX costs for vector insn classes.  */
-static const struct cpu_vector_cost qdf24xx_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  1, /* scalar_fp_stmt_cost  */
-  1, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  3, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &qdf24xx_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-
-static const advsimd_vec_cost thunderx_advsimd_vector_cost =
-{
-  4, /* int_stmt_cost  */
-  1, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  4, /* permute_cost  */
-  2, /* reduc_i8_cost  */
-  2, /* reduc_i16_cost  */
-  2, /* reduc_i32_cost  */
-  2, /* reduc_i64_cost  */
-  2, /* reduc_f16_cost  */
-  2, /* reduc_f32_cost  */
-  2, /* reduc_f64_cost  */
-  2, /* store_elt_extra_cost  */
-  2, /* vec_to_scalar_cost  */
-  2, /* scalar_to_vec_cost  */
-  3, /* align_load_cost  */
-  5, /* unalign_load_cost  */
-  5, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-/* ThunderX costs for vector insn classes.  */
-static const struct cpu_vector_cost thunderx_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  1, /* scalar_fp_stmt_cost  */
-  3, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  3, /* cond_taken_branch_cost  */
-  3, /* cond_not_taken_branch_cost  */
-  &thunderx_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost tsv110_advsimd_vector_cost =
-{
-  2, /* int_stmt_cost  */
-  2, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  2, /* permute_cost  */
-  3, /* reduc_i8_cost  */
-  3, /* reduc_i16_cost  */
-  3, /* reduc_i32_cost  */
-  3, /* reduc_i64_cost  */
-  3, /* reduc_f16_cost  */
-  3, /* reduc_f32_cost  */
-  3, /* reduc_f64_cost  */
-  3, /* store_elt_extra_cost  */
-  3, /* vec_to_scalar_cost  */
-  2, /* scalar_to_vec_cost  */
-  5, /* align_load_cost  */
-  5, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-static const struct cpu_vector_cost tsv110_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  1, /* scalar_fp_stmt_cost  */
-  5, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &tsv110_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
-{
-  2, /* int_stmt_cost  */
-  2, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
-  8, /* reduc_i8_cost  */
-  8, /* reduc_i16_cost  */
-  8, /* reduc_i32_cost  */
-  8, /* reduc_i64_cost  */
-  8, /* reduc_f16_cost  */
-  8, /* reduc_f32_cost  */
-  8, /* reduc_f64_cost  */
-  8, /* store_elt_extra_cost  */
-  8, /* vec_to_scalar_cost  */
-  8, /* scalar_to_vec_cost  */
-  4, /* align_load_cost  */
-  4, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-/* Cortex-A57 costs for vector insn classes.  */
-static const struct cpu_vector_cost cortexa57_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  1, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &cortexa57_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
-{
-  3, /* int_stmt_cost  */
-  3, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
-  3, /* reduc_i8_cost  */
-  3, /* reduc_i16_cost  */
-  3, /* reduc_i32_cost  */
-  3, /* reduc_i64_cost  */
-  3, /* reduc_f16_cost  */
-  3, /* reduc_f32_cost  */
-  3, /* reduc_f64_cost  */
-  3, /* store_elt_extra_cost  */
-  3, /* vec_to_scalar_cost  */
-  3, /* scalar_to_vec_cost  */
-  5, /* align_load_cost  */
-  5, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-static const struct cpu_vector_cost exynosm1_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  1, /* scalar_fp_stmt_cost  */
-  5, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &exynosm1_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost xgene1_advsimd_vector_cost =
-{
-  2, /* int_stmt_cost  */
-  2, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  2, /* permute_cost  */
-  4, /* reduc_i8_cost  */
-  4, /* reduc_i16_cost  */
-  4, /* reduc_i32_cost  */
-  4, /* reduc_i64_cost  */
-  4, /* reduc_f16_cost  */
-  4, /* reduc_f32_cost  */
-  4, /* reduc_f64_cost  */
-  4, /* store_elt_extra_cost  */
-  4, /* vec_to_scalar_cost  */
-  4, /* scalar_to_vec_cost  */
-  10, /* align_load_cost  */
-  10, /* unalign_load_cost  */
-  2, /* unalign_store_cost  */
-  2  /* store_cost  */
-};
-
-/* Generic costs for vector insn classes.  */
-static const struct cpu_vector_cost xgene1_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  1, /* scalar_fp_stmt_cost  */
-  5, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  2, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &xgene1_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
-{
-  4, /* int_stmt_cost  */
-  5, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  10, /* permute_cost  */
-  6, /* reduc_i8_cost  */
-  6, /* reduc_i16_cost  */
-  6, /* reduc_i32_cost  */
-  6, /* reduc_i64_cost  */
-  6, /* reduc_f16_cost  */
-  6, /* reduc_f32_cost  */
-  6, /* reduc_f64_cost  */
-  6, /* store_elt_extra_cost  */
-  6, /* vec_to_scalar_cost  */
-  5, /* scalar_to_vec_cost  */
-  4, /* align_load_cost  */
-  4, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-/* Costs for vector insn classes for Vulcan.  */
-static const struct cpu_vector_cost thunderx2t99_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  6, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  2, /* cond_taken_branch_cost  */
-  1,  /* cond_not_taken_branch_cost  */
-  &thunderx2t99_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
-{
-  5, /* int_stmt_cost  */
-  5, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  10, /* permute_cost  */
-  5, /* reduc_i8_cost  */
-  5, /* reduc_i16_cost  */
-  5, /* reduc_i32_cost  */
-  5, /* reduc_i64_cost  */
-  5, /* reduc_f16_cost  */
-  5, /* reduc_f32_cost  */
-  5, /* reduc_f64_cost  */
-  5, /* store_elt_extra_cost  */
-  5, /* vec_to_scalar_cost  */
-  5, /* scalar_to_vec_cost  */
-  4, /* align_load_cost  */
-  4, /* unalign_load_cost  */
-  4, /* unalign_store_cost  */
-  4  /* store_cost  */
-};
-
-static const struct cpu_vector_cost thunderx3t110_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  5, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  2, /* cond_taken_branch_cost  */
-  1,  /* cond_not_taken_branch_cost  */
-  &thunderx3t110_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr /* issue_info  */
-};
-
-static const advsimd_vec_cost ampere1_advsimd_vector_cost =
-{
-  1, /* int_stmt_cost  */
-  3, /* fp_stmt_cost  */
-  0, /* ld2_st2_permute_cost  */
-  0, /* ld3_st3_permute_cost  */
-  0, /* ld4_st4_permute_cost  */
-  2, /* permute_cost  */
-  12, /* reduc_i8_cost  */
-  9, /* reduc_i16_cost  */
-  6, /* reduc_i32_cost  */
-  5, /* reduc_i64_cost  */
-  9, /* reduc_f16_cost  */
-  6, /* reduc_f32_cost  */
-  5, /* reduc_f64_cost  */
-  8, /* store_elt_extra_cost  */
-  6, /* vec_to_scalar_cost  */
-  7, /* scalar_to_vec_cost  */
-  4, /* align_load_cost  */
-  4, /* unalign_load_cost  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-/* Ampere-1 costs for vector insn classes.  */
-static const struct cpu_vector_cost ampere1_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  3, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &ampere1_advsimd_vector_cost, /* advsimd  */
-  nullptr, /* sve  */
-  nullptr  /* issue_info  */
-};
-
-/* Generic costs for branch instructions.  */
-static const struct cpu_branch_cost generic_branch_cost =
-{
-  1,  /* Predictable.  */
-  3   /* Unpredictable.  */
-};
-
-/* Generic approximation modes.  */
-static const cpu_approx_modes generic_approx_modes =
-{
-  AARCH64_APPROX_NONE,	/* division  */
-  AARCH64_APPROX_NONE,	/* sqrt  */
-  AARCH64_APPROX_NONE	/* recip_sqrt  */
-};
-
-/* Approximation modes for Exynos M1.  */
-static const cpu_approx_modes exynosm1_approx_modes =
-{
-  AARCH64_APPROX_NONE,	/* division  */
-  AARCH64_APPROX_ALL,	/* sqrt  */
-  AARCH64_APPROX_ALL	/* recip_sqrt  */
-};
-
-/* Approximation modes for X-Gene 1.  */
-static const cpu_approx_modes xgene1_approx_modes =
-{
-  AARCH64_APPROX_NONE,	/* division  */
-  AARCH64_APPROX_NONE,	/* sqrt  */
-  AARCH64_APPROX_ALL	/* recip_sqrt  */
-};
-
-/* Generic prefetch settings (which disable prefetch).  */
-static const cpu_prefetch_tune generic_prefetch_tune =
-{
-  0,			/* num_slots  */
-  -1,			/* l1_cache_size  */
-  -1,			/* l1_cache_line_size  */
-  -1,			/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune exynosm1_prefetch_tune =
-{
-  0,			/* num_slots  */
-  -1,			/* l1_cache_size  */
-  64,			/* l1_cache_line_size  */
-  -1,			/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune qdf24xx_prefetch_tune =
-{
-  4,			/* num_slots  */
-  32,			/* l1_cache_size  */
-  64,			/* l1_cache_line_size  */
-  512,			/* l2_cache_size  */
-  false,		/* prefetch_dynamic_strides */
-  2048,			/* minimum_stride */
-  3			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune thunderxt88_prefetch_tune =
-{
-  8,			/* num_slots  */
-  32,			/* l1_cache_size  */
-  128,			/* l1_cache_line_size  */
-  16*1024,		/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  3			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune thunderx_prefetch_tune =
-{
-  8,			/* num_slots  */
-  32,			/* l1_cache_size  */
-  128,			/* l1_cache_line_size  */
-  -1,			/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
-{
-  8,			/* num_slots  */
-  32,			/* l1_cache_size  */
-  64,			/* l1_cache_line_size  */
-  256,			/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
-{
-  8,			/* num_slots  */
-  32,			/* l1_cache_size  */
-  64,			/* l1_cache_line_size  */
-  256,			/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune tsv110_prefetch_tune =
-{
-  0,                    /* num_slots  */
-  64,                   /* l1_cache_size  */
-  64,                   /* l1_cache_line_size  */
-  512,                  /* l2_cache_size  */
-  true,                 /* prefetch_dynamic_strides */
-  -1,                   /* minimum_stride */
-  -1                    /* default_opt_level  */
-};
-
-static const cpu_prefetch_tune xgene1_prefetch_tune =
-{
-  8,			/* num_slots  */
-  32,			/* l1_cache_size  */
-  64,			/* l1_cache_line_size  */
-  256,			/* l2_cache_size  */
-  true,                 /* prefetch_dynamic_strides */
-  -1,                   /* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune a64fx_prefetch_tune =
-{
-  8,			/* num_slots  */
-  64,			/* l1_cache_size  */
-  256,			/* l1_cache_line_size  */
-  32768,		/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const cpu_prefetch_tune ampere1_prefetch_tune =
-{
-  0,			/* num_slots  */
-  64,			/* l1_cache_size  */
-  64,			/* l1_cache_line_size  */
-  2048,			/* l2_cache_size  */
-  true,			/* prefetch_dynamic_strides */
-  -1,			/* minimum_stride */
-  -1			/* default_opt_level  */
-};
-
-static const struct tune_params generic_tunings =
-{
-  &cortexa57_extra_costs,
-  &generic_addrcost_table,
-  &generic_regmove_cost,
-  &generic_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  2, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "16:12",	/* function_align.  */
-  "4",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
-     Neoverse V1.  It does not have a noticeable effect on A64FX and should
-     have at most a very minor effect on SVE2 cores.  */
-  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params cortexa35_tunings =
-{
-  &cortexa53_extra_costs,
-  &generic_addrcost_table,
-  &cortexa53_regmove_cost,
-  &generic_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  1, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
-  "16",	/* function_align.  */
-  "4",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params cortexa53_tunings =
-{
-  &cortexa53_extra_costs,
-  &generic_addrcost_table,
-  &cortexa53_regmove_cost,
-  &generic_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  2, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
-  "16",	/* function_align.  */
-  "4",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params cortexa57_tunings =
-{
-  &cortexa57_extra_costs,
-  &generic_addrcost_table,
-  &cortexa57_regmove_cost,
-  &cortexa57_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  3, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
-  "16",	/* function_align.  */
-  "4",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params cortexa72_tunings =
-{
-  &cortexa57_extra_costs,
-  &generic_addrcost_table,
-  &cortexa57_regmove_cost,
-  &cortexa57_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  3, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
-  "16",	/* function_align.  */
-  "4",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params cortexa73_tunings =
-{
-  &cortexa57_extra_costs,
-  &generic_addrcost_table,
-  &cortexa57_regmove_cost,
-  &cortexa57_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  2, /* issue_rate.  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
-  "16",	/* function_align.  */
-  "4",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params exynosm1_tunings =
-{
-  &exynosm1_extra_costs,
-  &exynosm1_addrcost_table,
-  &exynosm1_regmove_cost,
-  &exynosm1_vector_cost,
-  &generic_branch_cost,
-  &exynosm1_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  3,	/* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
-  "4",	/* function_align.  */
-  "4",	/* jump_align.  */
-  "4",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  48,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
-  &exynosm1_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params thunderxt88_tunings =
-{
-  &thunderx_extra_costs,
-  &generic_addrcost_table,
-  &thunderx_regmove_cost,
-  &thunderx_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 6, /* load_int.  */
-    6, /* store_int.  */
-    6, /* load_fp.  */
-    6, /* store_fp.  */
-    6, /* load_pred.  */
-    6 /* store_pred.  */
-  }, /* memmov_cost.  */
-  2, /* issue_rate  */
-  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
-  "8",	/* function_align.  */
-  "8",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &thunderxt88_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
-};
-
-static const struct tune_params thunderx_tunings =
-{
-  &thunderx_extra_costs,
-  &generic_addrcost_table,
-  &thunderx_regmove_cost,
-  &thunderx_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 6, /* load_int.  */
-    6, /* store_int.  */
-    6, /* load_fp.  */
-    6, /* store_fp.  */
-    6, /* load_pred.  */
-    6 /* store_pred.  */
-  }, /* memmov_cost.  */
-  2, /* issue_rate  */
-  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
-  "8",	/* function_align.  */
-  "8",	/* jump_align.  */
-  "8",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
-  &thunderx_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
-};
-
-static const struct tune_params tsv110_tunings =
-{
-  &tsv110_extra_costs,
-  &tsv110_addrcost_table,
-  &tsv110_regmove_cost,
-  &tsv110_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4,    /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
-   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
-  "16", /* function_align.  */
-  "4",  /* jump_align.  */
-  "8",  /* loop_align.  */
-  2,    /* int_reassoc_width.  */
-  4,    /* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,    /* vec_reassoc_width.  */
-  2,    /* min_div_recip_mul_sf.  */
-  2,    /* min_div_recip_mul_df.  */
-  0,    /* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
-  &tsv110_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params xgene1_tunings =
-{
-  &xgene1_extra_costs,
-  &xgene1_addrcost_table,
-  &xgene1_regmove_cost,
-  &xgene1_vector_cost,
-  &generic_branch_cost,
-  &xgene1_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 6, /* load_int.  */
-    6, /* store_int.  */
-    6, /* load_fp.  */
-    6, /* store_fp.  */
-    6, /* load_pred.  */
-    6 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4, /* issue_rate  */
-  AARCH64_FUSE_NOTHING, /* fusible_ops  */
-  "16",	/* function_align.  */
-  "16",	/* jump_align.  */
-  "16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  17,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
-  &xgene1_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params emag_tunings =
-{
-  &xgene1_extra_costs,
-  &xgene1_addrcost_table,
-  &xgene1_regmove_cost,
-  &xgene1_vector_cost,
-  &generic_branch_cost,
-  &xgene1_approx_modes,
-  SVE_NOT_IMPLEMENTED,
-  { 6, /* load_int.  */
-    6, /* store_int.  */
-    6, /* load_fp.  */
-    6, /* store_fp.  */
-    6, /* load_pred.  */
-    6 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4, /* issue_rate  */
-  AARCH64_FUSE_NOTHING, /* fusible_ops  */
-  "16",	/* function_align.  */
-  "16",	/* jump_align.  */
-  "16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  17,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
-  &xgene1_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params qdf24xx_tunings =
-{
-  &qdf24xx_extra_costs,
-  &qdf24xx_addrcost_table,
-  &qdf24xx_regmove_cost,
-  &qdf24xx_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4, /* issue_rate  */
-  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
-  "16",	/* function_align.  */
-  "8",	/* jump_align.  */
-  "16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
-  &qdf24xx_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
-   for now.  */
-static const struct tune_params saphira_tunings =
-{
-  &generic_extra_costs,
-  &generic_addrcost_table,
-  &generic_regmove_cost,
-  &generic_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4, /* issue_rate  */
-  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
-   | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
-  "16",	/* function_align.  */
-  "8",	/* jump_align.  */
-  "16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  1,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params thunderx2t99_tunings =
-{
-  &thunderx2t99_extra_costs,
-  &thunderx2t99_addrcost_table,
-  &thunderx2t99_regmove_cost,
-  &thunderx2t99_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4, /* issue_rate.  */
-  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
-   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
-  "16",	/* function_align.  */
-  "8",	/* jump_align.  */
-  "16",	/* loop_align.  */
-  3,	/* int_reassoc_width.  */
-  2,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &thunderx2t99_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params thunderx3t110_tunings =
-{
-  &thunderx3t110_extra_costs,
-  &thunderx3t110_addrcost_table,
-  &thunderx3t110_regmove_cost,
-  &thunderx3t110_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  6, /* issue_rate.  */
-  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
-   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
-  "16",	/* function_align.  */
-  "8",	/* jump_align.  */
-  "16",	/* loop_align.  */
-  3,	/* int_reassoc_width.  */
-  2,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &thunderx3t110_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params neoversen1_tunings =
-{
-  &cortexa76_extra_costs,
-  &generic_addrcost_table,
-  &generic_regmove_cost,
-  &cortexa57_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    2, /* store_int.  */
-    5, /* load_fp.  */
-    2, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  3, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32:16",	/* function_align.  */
-  "4",		/* jump_align.  */
-  "32:16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const struct tune_params ampere1_tunings =
-{
-  &ampere1_extra_costs,
-  &generic_addrcost_table,
-  &generic_regmove_cost,
-  &ampere1_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4, /* issue_rate  */
-  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
-   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
-   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
-   AARCH64_FUSE_CMP_BRANCH),
-  /* fusible_ops  */
-  "32",		/* function_align.  */
-  "4",		/* jump_align.  */
-  "32:16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  4,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &ampere1_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
-};
-
-static const struct tune_params ampere1a_tunings =
-{
-  &ampere1a_extra_costs,
-  &generic_addrcost_table,
-  &generic_regmove_cost,
-  &ampere1_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_NOT_IMPLEMENTED, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  4, /* issue_rate  */
-  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
-   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
-   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
-   AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
-   AARCH64_FUSE_ADDSUB_2REG_CONST1),
-  /* fusible_ops  */
-  "32",		/* function_align.  */
-  "4",		/* jump_align.  */
-  "32:16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &ampere1_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
-};
-
-static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
-{
-  2, /* int_stmt_cost  */
-  2, /* fp_stmt_cost  */
-  4, /* ld2_st2_permute_cost */
-  4, /* ld3_st3_permute_cost  */
-  5, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
-  4, /* reduc_i8_cost  */
-  4, /* reduc_i16_cost  */
-  2, /* reduc_i32_cost  */
-  2, /* reduc_i64_cost  */
-  6, /* reduc_f16_cost  */
-  3, /* reduc_f32_cost  */
-  2, /* reduc_f64_cost  */
-  2, /* store_elt_extra_cost  */
-  /* This value is just inherited from the Cortex-A57 table.  */
-  8, /* vec_to_scalar_cost  */
-  /* This depends very much on what the scalar value is and
-     where it comes from.  E.g. some constants take two dependent
-     instructions or a load, while others might be moved from a GPR.
-     4 seems to be a reasonable compromise in practice.  */
-  4, /* scalar_to_vec_cost  */
-  4, /* align_load_cost  */
-  4, /* unalign_load_cost  */
-  /* Although stores have a latency of 2 and compete for the
-     vector pipes, in practice it's better not to model that.  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-static const sve_vec_cost neoversev1_sve_vector_cost =
-{
-  {
-    2, /* int_stmt_cost  */
-    2, /* fp_stmt_cost  */
-    4, /* ld2_st2_permute_cost  */
-    7, /* ld3_st3_permute_cost  */
-    8, /* ld4_st4_permute_cost  */
-    3, /* permute_cost  */
-    /* Theoretically, a reduction involving 31 scalar ADDs could
-       complete in ~9 cycles and would have a cost of 31.  [SU]ADDV
-       completes in 14 cycles, so give it a cost of 31 + 5.  */
-    36, /* reduc_i8_cost  */
-    /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7.  */
-    22, /* reduc_i16_cost  */
-    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7.  */
-    14, /* reduc_i32_cost  */
-    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8.  */
-    11, /* reduc_i64_cost  */
-    /* Theoretically, a reduction involving 15 scalar FADDs could
-       complete in ~9 cycles and would have a cost of 30.  FADDV
-       completes in 13 cycles, so give it a cost of 30 + 4.  */
-    34, /* reduc_f16_cost  */
-    /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5.  */
-    19, /* reduc_f32_cost  */
-    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5.  */
-    11, /* reduc_f64_cost  */
-    2, /* store_elt_extra_cost  */
-    /* This value is just inherited from the Cortex-A57 table.  */
-    8, /* vec_to_scalar_cost  */
-    /* See the comment above the Advanced SIMD versions.  */
-    4, /* scalar_to_vec_cost  */
-    4, /* align_load_cost  */
-    4, /* unalign_load_cost  */
-    /* Although stores have a latency of 2 and compete for the
-       vector pipes, in practice it's better not to model that.  */
-    1, /* unalign_store_cost  */
-    1  /* store_cost  */
-  },
-  3, /* clast_cost  */
-  19, /* fadda_f16_cost  */
-  11, /* fadda_f32_cost  */
-  8, /* fadda_f64_cost  */
-  32, /* gather_load_x32_cost  */
-  16, /* gather_load_x64_cost  */
-  3 /* scatter_store_elt_cost  */
-};
-
-static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
-{
-  3, /* loads_stores_per_cycle  */
-  2, /* stores_per_cycle  */
-  4, /* general_ops_per_cycle  */
-  0, /* fp_simd_load_general_ops  */
-  1 /* fp_simd_store_general_ops  */
-};
-
-static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
-{
-  {
-    3, /* loads_stores_per_cycle  */
-    2, /* stores_per_cycle  */
-    4, /* general_ops_per_cycle  */
-    0, /* fp_simd_load_general_ops  */
-    1 /* fp_simd_store_general_ops  */
-  },
-  2, /* ld2_st2_general_ops  */
-  2, /* ld3_st3_general_ops  */
-  3 /* ld4_st4_general_ops  */
-};
-
-static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
-{
-  {
-    {
-      2, /* loads_per_cycle  */
-      2, /* stores_per_cycle  */
-      2, /* general_ops_per_cycle  */
-      0, /* fp_simd_load_general_ops  */
-      1 /* fp_simd_store_general_ops  */
-    },
-    2, /* ld2_st2_general_ops  */
-    2, /* ld3_st3_general_ops  */
-    3 /* ld4_st4_general_ops  */
-  },
-  1, /* pred_ops_per_cycle  */
-  2, /* while_pred_ops  */
-  2, /* int_cmp_pred_ops  */
-  1, /* fp_cmp_pred_ops  */
-  1, /* gather_scatter_pair_general_ops  */
-  1 /* gather_scatter_pair_pred_ops  */
-};
-
-static const aarch64_vec_issue_info neoversev1_vec_issue_info =
-{
-  &neoversev1_scalar_issue_info,
-  &neoversev1_advsimd_issue_info,
-  &neoversev1_sve_issue_info
-};
-
-/* Neoverse V1 costs for vector insn classes.  */
-static const struct cpu_vector_cost neoversev1_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  2, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &neoversev1_advsimd_vector_cost, /* advsimd  */
-  &neoversev1_sve_vector_cost, /* sve  */
-  &neoversev1_vec_issue_info /* issue_info  */
-};
-
-static const struct tune_params neoversev1_tunings =
-{
-  &cortexa76_extra_costs,
-  &neoversev1_addrcost_table,
-  &neoversev1_regmove_cost,
-  &neoversev1_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_256, /* sve_width  */
-  { 4, /* load_int.  */
-    2, /* store_int.  */
-    6, /* load_fp.  */
-    2, /* store_fp.  */
-    6, /* load_pred.  */
-    1 /* store_pred.  */
-  }, /* memmov_cost.  */
-  3, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32:16",	/* function_align.  */
-  "4",		/* jump_align.  */
-  "32:16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  4,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
-   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
-
-static const sve_vec_cost neoverse512tvb_sve_vector_cost =
-{
-  {
-    2, /* int_stmt_cost  */
-    2, /* fp_stmt_cost  */
-    4, /* ld2_st2_permute_cost  */
-    5, /* ld3_st3_permute_cost  */
-    5, /* ld4_st4_permute_cost  */
-    3, /* permute_cost  */
-    /* Theoretically, a reduction involving 15 scalar ADDs could
-       complete in ~5 cycles and would have a cost of 15.  Assume that
-       [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6.  */
-    21, /* reduc_i8_cost  */
-    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
-    13, /* reduc_i16_cost  */
-    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
-    9, /* reduc_i32_cost  */
-    /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7.  */
-    8, /* reduc_i64_cost  */
-    /* Theoretically, a reduction involving 7 scalar FADDs could
-       complete in ~6 cycles and would have a cost of 14.  Assume that
-       FADDV completes in 8 cycles and so give it a cost of 14 + 2.  */
-    16, /* reduc_f16_cost  */
-    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
-    8, /* reduc_f32_cost  */
-    /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2.  */
-    4, /* reduc_f64_cost  */
-    2, /* store_elt_extra_cost  */
-    /* This value is just inherited from the Cortex-A57 table.  */
-    8, /* vec_to_scalar_cost  */
-    /* This depends very much on what the scalar value is and
-       where it comes from.  E.g. some constants take two dependent
-       instructions or a load, while others might be moved from a GPR.
-       4 seems to be a reasonable compromise in practice.  */
-    4, /* scalar_to_vec_cost  */
-    4, /* align_load_cost  */
-    4, /* unalign_load_cost  */
-    /* Although stores generally have a latency of 2 and compete for the
-       vector pipes, in practice it's better not to model that.  */
-    1, /* unalign_store_cost  */
-    1  /* store_cost  */
-  },
-  3, /* clast_cost  */
-  10, /* fadda_f16_cost  */
-  6, /* fadda_f32_cost  */
-  4, /* fadda_f64_cost  */
-  /* A strided Advanced SIMD x64 load would take two parallel FP loads
-     (6 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
-     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
-     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
-     (cost 2) to that, to avoid the difference being lost in rounding.
-
-     There is no easy comparison between a strided Advanced SIMD x32 load
-     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
-     operation more than a 64-bit gather.  */
-  14, /* gather_load_x32_cost  */
-  12, /* gather_load_x64_cost  */
-  3 /* scatter_store_elt_cost  */
-};
-
-static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
-{
-  {
-    {
-      3, /* loads_per_cycle  */
-      2, /* stores_per_cycle  */
-      4, /* general_ops_per_cycle  */
-      0, /* fp_simd_load_general_ops  */
-      1 /* fp_simd_store_general_ops  */
-    },
-    2, /* ld2_st2_general_ops  */
-    2, /* ld3_st3_general_ops  */
-    3 /* ld4_st4_general_ops  */
-  },
-  2, /* pred_ops_per_cycle  */
-  2, /* while_pred_ops  */
-  2, /* int_cmp_pred_ops  */
-  1, /* fp_cmp_pred_ops  */
-  1, /* gather_scatter_pair_general_ops  */
-  1 /* gather_scatter_pair_pred_ops  */
-};
-
-static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
-{
-  &neoversev1_scalar_issue_info,
-  &neoversev1_advsimd_issue_info,
-  &neoverse512tvb_sve_issue_info
-};
-
-static const struct cpu_vector_cost neoverse512tvb_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  2, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &neoversev1_advsimd_vector_cost, /* advsimd  */
-  &neoverse512tvb_sve_vector_cost, /* sve  */
-  &neoverse512tvb_vec_issue_info /* issue_info  */
-};
-
-static const struct tune_params neoverse512tvb_tunings =
-{
-  &cortexa76_extra_costs,
-  &neoversev1_addrcost_table,
-  &neoversev1_regmove_cost,
-  &neoverse512tvb_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_128 | SVE_256, /* sve_width  */
-  { 4, /* load_int.  */
-    2, /* store_int.  */
-    6, /* load_fp.  */
-    2, /* store_fp.  */
-    6, /* load_pred.  */
-    1 /* store_pred.  */
-  }, /* memmov_cost.  */
-  3, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32:16",	/* function_align.  */
-  "4",		/* jump_align.  */
-  "32:16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  4,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
-   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
-};
-
-static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
-{
-  2, /* int_stmt_cost  */
-  2, /* fp_stmt_cost  */
-  2, /* ld2_st2_permute_cost */
-  2, /* ld3_st3_permute_cost  */
-  3, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
-  4, /* reduc_i8_cost  */
-  4, /* reduc_i16_cost  */
-  2, /* reduc_i32_cost  */
-  2, /* reduc_i64_cost  */
-  6, /* reduc_f16_cost  */
-  4, /* reduc_f32_cost  */
-  2, /* reduc_f64_cost  */
-  2, /* store_elt_extra_cost  */
-  /* This value is just inherited from the Cortex-A57 table.  */
-  8, /* vec_to_scalar_cost  */
-  /* This depends very much on what the scalar value is and
-     where it comes from.  E.g. some constants take two dependent
-     instructions or a load, while others might be moved from a GPR.
-     4 seems to be a reasonable compromise in practice.  */
-  4, /* scalar_to_vec_cost  */
-  4, /* align_load_cost  */
-  4, /* unalign_load_cost  */
-  /* Although stores have a latency of 2 and compete for the
-     vector pipes, in practice it's better not to model that.  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-static const sve_vec_cost neoversen2_sve_vector_cost =
-{
-  {
-    2, /* int_stmt_cost  */
-    2, /* fp_stmt_cost  */
-    3, /* ld2_st2_permute_cost  */
-    4, /* ld3_st3_permute_cost  */
-    4, /* ld4_st4_permute_cost  */
-    3, /* permute_cost  */
-    /* Theoretically, a reduction involving 15 scalar ADDs could
-       complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
-       completes in 11 cycles, so give it a cost of 15 + 6.  */
-    21, /* reduc_i8_cost  */
-    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
-    13, /* reduc_i16_cost  */
-    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
-    9, /* reduc_i32_cost  */
-    /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
-    2, /* reduc_i64_cost  */
-    /* Theoretically, a reduction involving 7 scalar FADDs could
-       complete in ~8 cycles and would have a cost of 14.  FADDV
-       completes in 6 cycles, so give it a cost of 14 - 2.  */
-    12, /* reduc_f16_cost  */
-    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0.  */
-    6, /* reduc_f32_cost  */
-    /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0.  */
-    2, /* reduc_f64_cost  */
-    2, /* store_elt_extra_cost  */
-    /* This value is just inherited from the Cortex-A57 table.  */
-    8, /* vec_to_scalar_cost  */
-    /* See the comment above the Advanced SIMD versions.  */
-    4, /* scalar_to_vec_cost  */
-    4, /* align_load_cost  */
-    4, /* unalign_load_cost  */
-    /* Although stores have a latency of 2 and compete for the
-       vector pipes, in practice it's better not to model that.  */
-    1, /* unalign_store_cost  */
-    1  /* store_cost  */
-  },
-  3, /* clast_cost  */
-  10, /* fadda_f16_cost  */
-  6, /* fadda_f32_cost  */
-  4, /* fadda_f64_cost  */
-  /* A strided Advanced SIMD x64 load would take two parallel FP loads
-     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
-     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
-     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
-     (cost 2) to that, to avoid the difference being lost in rounding.
-
-     There is no easy comparison between a strided Advanced SIMD x32 load
-     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
-     operation more than a 64-bit gather.  */
-  14, /* gather_load_x32_cost  */
-  12, /* gather_load_x64_cost  */
-  3 /* scatter_store_elt_cost  */
-};
-
-static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
-{
-  3, /* loads_stores_per_cycle  */
-  2, /* stores_per_cycle  */
-  4, /* general_ops_per_cycle  */
-  0, /* fp_simd_load_general_ops  */
-  1 /* fp_simd_store_general_ops  */
-};
-
-static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
-{
-  {
-    3, /* loads_stores_per_cycle  */
-    2, /* stores_per_cycle  */
-    2, /* general_ops_per_cycle  */
-    0, /* fp_simd_load_general_ops  */
-    1 /* fp_simd_store_general_ops  */
-  },
-  2, /* ld2_st2_general_ops  */
-  2, /* ld3_st3_general_ops  */
-  3 /* ld4_st4_general_ops  */
-};
-
-static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
-{
-  {
-    {
-      3, /* loads_per_cycle  */
-      2, /* stores_per_cycle  */
-      2, /* general_ops_per_cycle  */
-      0, /* fp_simd_load_general_ops  */
-      1 /* fp_simd_store_general_ops  */
-    },
-    2, /* ld2_st2_general_ops  */
-    3, /* ld3_st3_general_ops  */
-    3 /* ld4_st4_general_ops  */
-  },
-  2, /* pred_ops_per_cycle  */
-  2, /* while_pred_ops  */
-  2, /* int_cmp_pred_ops  */
-  1, /* fp_cmp_pred_ops  */
-  1, /* gather_scatter_pair_general_ops  */
-  1 /* gather_scatter_pair_pred_ops  */
-};
-
-static const aarch64_vec_issue_info neoversen2_vec_issue_info =
-{
-  &neoversen2_scalar_issue_info,
-  &neoversen2_advsimd_issue_info,
-  &neoversen2_sve_issue_info
-};
-
-/* Neoverse N2 costs for vector insn classes.  */
-static const struct cpu_vector_cost neoversen2_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  2, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &neoversen2_advsimd_vector_cost, /* advsimd  */
-  &neoversen2_sve_vector_cost, /* sve  */
-  &neoversen2_vec_issue_info /* issue_info  */
-};
-
-static const struct tune_params neoversen2_tunings =
-{
-  &cortexa76_extra_costs,
-  &neoversen2_addrcost_table,
-  &neoversen2_regmove_cost,
-  &neoversen2_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_128, /* sve_width  */
-  { 4, /* load_int.  */
-    1, /* store_int.  */
-    6, /* load_fp.  */
-    2, /* store_fp.  */
-    6, /* load_pred.  */
-    1 /* store_pred.  */
-  }, /* memmov_cost.  */
-  3, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32:16",	/* function_align.  */
-  "4",		/* jump_align.  */
-  "32:16",	/* loop_align.  */
-  2,	/* int_reassoc_width.  */
-  4,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
-   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
-   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
-};
-
-static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
-{
-  2, /* int_stmt_cost  */
-  2, /* fp_stmt_cost  */
-  2, /* ld2_st2_permute_cost */
-  2, /* ld3_st3_permute_cost  */
-  3, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
-  4, /* reduc_i8_cost  */
-  4, /* reduc_i16_cost  */
-  2, /* reduc_i32_cost  */
-  2, /* reduc_i64_cost  */
-  6, /* reduc_f16_cost  */
-  3, /* reduc_f32_cost  */
-  2, /* reduc_f64_cost  */
-  2, /* store_elt_extra_cost  */
-  /* This value is just inherited from the Cortex-A57 table.  */
-  8, /* vec_to_scalar_cost  */
-  /* This depends very much on what the scalar value is and
-     where it comes from.  E.g. some constants take two dependent
-     instructions or a load, while others might be moved from a GPR.
-     4 seems to be a reasonable compromise in practice.  */
-  4, /* scalar_to_vec_cost  */
-  4, /* align_load_cost  */
-  4, /* unalign_load_cost  */
-  /* Although stores have a latency of 2 and compete for the
-     vector pipes, in practice it's better not to model that.  */
-  1, /* unalign_store_cost  */
-  1  /* store_cost  */
-};
-
-static const sve_vec_cost neoversev2_sve_vector_cost =
-{
-  {
-    2, /* int_stmt_cost  */
-    2, /* fp_stmt_cost  */
-    3, /* ld2_st2_permute_cost  */
-    3, /* ld3_st3_permute_cost  */
-    4, /* ld4_st4_permute_cost  */
-    3, /* permute_cost  */
-    /* Theoretically, a reduction involving 15 scalar ADDs could
-       complete in ~3 cycles and would have a cost of 15.  [SU]ADDV
-       completes in 11 cycles, so give it a cost of 15 + 8.  */
-    21, /* reduc_i8_cost  */
-    /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7.  */
-    14, /* reduc_i16_cost  */
-    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4.  */
-    7, /* reduc_i32_cost  */
-    /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
-    2, /* reduc_i64_cost  */
-    /* Theoretically, a reduction involving 7 scalar FADDs could
-       complete in ~6 cycles and would have a cost of 14.  FADDV
-       completes in 8 cycles, so give it a cost of 14 + 2.  */
-    16, /* reduc_f16_cost  */
-    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
-    8, /* reduc_f32_cost  */
-    /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2.  */
-    4, /* reduc_f64_cost  */
-    2, /* store_elt_extra_cost  */
-    /* This value is just inherited from the Cortex-A57 table.  */
-    8, /* vec_to_scalar_cost  */
-    /* See the comment above the Advanced SIMD versions.  */
-    4, /* scalar_to_vec_cost  */
-    4, /* align_load_cost  */
-    4, /* unalign_load_cost  */
-    /* Although stores have a latency of 2 and compete for the
-       vector pipes, in practice it's better not to model that.  */
-    1, /* unalign_store_cost  */
-    1  /* store_cost  */
-  },
-  3, /* clast_cost  */
-  10, /* fadda_f16_cost  */
-  6, /* fadda_f32_cost  */
-  4, /* fadda_f64_cost  */
-  /* A strided Advanced SIMD x64 load would take two parallel FP loads
-     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
-     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
-     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
-     (cost 2) to that, to avoid the difference being lost in rounding.
-
-     There is no easy comparison between a strided Advanced SIMD x32 load
-     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
-     operation more than a 64-bit gather.  */
-  14, /* gather_load_x32_cost  */
-  12, /* gather_load_x64_cost  */
-  3 /* scatter_store_elt_cost  */
-};
-
-static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
-{
-  3, /* loads_stores_per_cycle  */
-  2, /* stores_per_cycle  */
-  6, /* general_ops_per_cycle  */
-  0, /* fp_simd_load_general_ops  */
-  1 /* fp_simd_store_general_ops  */
-};
-
-static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
-{
-  {
-    3, /* loads_stores_per_cycle  */
-    2, /* stores_per_cycle  */
-    4, /* general_ops_per_cycle  */
-    0, /* fp_simd_load_general_ops  */
-    1 /* fp_simd_store_general_ops  */
-  },
-  2, /* ld2_st2_general_ops  */
-  2, /* ld3_st3_general_ops  */
-  3 /* ld4_st4_general_ops  */
-};
-
-static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
-{
-  {
-    {
-      3, /* loads_per_cycle  */
-      2, /* stores_per_cycle  */
-      4, /* general_ops_per_cycle  */
-      0, /* fp_simd_load_general_ops  */
-      1 /* fp_simd_store_general_ops  */
-    },
-    2, /* ld2_st2_general_ops  */
-    3, /* ld3_st3_general_ops  */
-    3 /* ld4_st4_general_ops  */
-  },
-  2, /* pred_ops_per_cycle  */
-  2, /* while_pred_ops  */
-  2, /* int_cmp_pred_ops  */
-  1, /* fp_cmp_pred_ops  */
-  1, /* gather_scatter_pair_general_ops  */
-  1 /* gather_scatter_pair_pred_ops  */
-};
-
-static const aarch64_vec_issue_info neoversev2_vec_issue_info =
-{
-  &neoversev2_scalar_issue_info,
-  &neoversev2_advsimd_issue_info,
-  &neoversev2_sve_issue_info
-};
-
-/* Demeter costs for vector insn classes.  */
-static const struct cpu_vector_cost neoversev2_vector_cost =
-{
-  1, /* scalar_int_stmt_cost  */
-  2, /* scalar_fp_stmt_cost  */
-  4, /* scalar_load_cost  */
-  1, /* scalar_store_cost  */
-  1, /* cond_taken_branch_cost  */
-  1, /* cond_not_taken_branch_cost  */
-  &neoversev2_advsimd_vector_cost, /* advsimd  */
-  &neoversev2_sve_vector_cost, /* sve  */
-  &neoversev2_vec_issue_info /* issue_info  */
-};
-
-static const struct tune_params neoversev2_tunings =
-{
-  &cortexa76_extra_costs,
-  &neoversev2_addrcost_table,
-  &neoversev2_regmove_cost,
-  &neoversev2_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_128, /* sve_width  */
-  { 4, /* load_int.  */
-    2, /* store_int.  */
-    6, /* load_fp.  */
-    1, /* store_fp.  */
-    6, /* load_pred.  */
-    2 /* store_pred.  */
-  }, /* memmov_cost.  */
-  5, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32:16",	/* function_align.  */
-  "4",		/* jump_align.  */
-  "32:16",	/* loop_align.  */
-  3,	/* int_reassoc_width.  */
-  6,	/* fp_reassoc_width.  */
-  4,	/* fma_reassoc_width.  */
-  3,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
-   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
-   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
-  &generic_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
-};
-
-static const struct tune_params a64fx_tunings =
-{
-  &a64fx_extra_costs,
-  &a64fx_addrcost_table,
-  &a64fx_regmove_cost,
-  &a64fx_vector_cost,
-  &generic_branch_cost,
-  &generic_approx_modes,
-  SVE_512, /* sve_width  */
-  { 4, /* load_int.  */
-    4, /* store_int.  */
-    4, /* load_fp.  */
-    4, /* store_fp.  */
-    4, /* load_pred.  */
-    4 /* store_pred.  */
-  }, /* memmov_cost.  */
-  7, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32",	/* function_align.  */
-  "16",	/* jump_align.  */
-  "32",	/* loop_align.  */
-  4,	/* int_reassoc_width.  */
-  2,	/* fp_reassoc_width.  */
-  1,	/* fma_reassoc_width.  */
-  2,	/* vec_reassoc_width.  */
-  2,	/* min_div_recip_mul_sf.  */
-  2,	/* min_div_recip_mul_df.  */
-  0,	/* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
-  &a64fx_prefetch_tune,
-  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
-  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
-};
+#include "tuning_models/generic.h"
+#include "tuning_models/cortexa35.h"
+#include "tuning_models/cortexa53.h"
+#include "tuning_models/cortexa57.h"
+#include "tuning_models/cortexa72.h"
+#include "tuning_models/cortexa73.h"
+#include "tuning_models/exynosm1.h"
+#include "tuning_models/thunderxt88.h"
+#include "tuning_models/thunderx.h"
+#include "tuning_models/tsv110.h"
+#include "tuning_models/xgene1.h"
+#include "tuning_models/emag.h"
+#include "tuning_models/qdf24xx.h"
+#include "tuning_models/saphira.h"
+#include "tuning_models/thunderx2t99.h"
+#include "tuning_models/thunderx3t110.h"
+#include "tuning_models/neoversen1.h"
+#include "tuning_models/ampere1.h"
+#include "tuning_models/ampere1a.h"
+#include "tuning_models/neoversev1.h"
+#include "tuning_models/neoverse512tvb.h"
+#include "tuning_models/neoversen2.h"
+#include "tuning_models/neoversev2.h"
+#include "tuning_models/a64fx.h"
 
 /* Support for fine-grained override of the tuning structures.  */
 struct aarch64_tuning_override_function
diff --git a/gcc/config/aarch64/tuning_models/a64fx.h b/gcc/config/aarch64/tuning_models/a64fx.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b06c27eba1e4de01738bdfdc077460f9135fb41
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/a64fx.h
@@ -0,0 +1,169 @@
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_A64FX
+#define GCC_AARCH64_H_A64FX
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table a64fx_addrcost_table =
+{
+    {
+      1, /* hi  */
+      1, /* si  */
+      1, /* di  */
+      2, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* post_modify_ld3_st3  */
+  0, /* post_modify_ld4_st4  */
+  2, /* register_offset  */
+  3, /* register_sextend  */
+  3, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost a64fx_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  5, /* GP2FP  */
+  7, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost a64fx_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  5, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  13, /* reduc_i8_cost  */
+  13, /* reduc_i16_cost  */
+  13, /* reduc_i32_cost  */
+  13, /* reduc_i64_cost  */
+  13, /* reduc_f16_cost  */
+  13, /* reduc_f32_cost  */
+  13, /* reduc_f64_cost  */
+  13, /* store_elt_extra_cost  */
+  13, /* vec_to_scalar_cost  */
+  4, /* scalar_to_vec_cost  */
+  6, /* align_load_cost  */
+  6, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const sve_vec_cost a64fx_sve_vector_cost =
+{
+  {
+    2, /* int_stmt_cost  */
+    5, /* fp_stmt_cost  */
+    0, /* ld2_st2_permute_cost  */
+    0, /* ld3_st3_permute_cost  */
+    0, /* ld4_st4_permute_cost  */
+    3, /* permute_cost  */
+    13, /* reduc_i8_cost  */
+    13, /* reduc_i16_cost  */
+    13, /* reduc_i32_cost  */
+    13, /* reduc_i64_cost  */
+    13, /* reduc_f16_cost  */
+    13, /* reduc_f32_cost  */
+    13, /* reduc_f64_cost  */
+    13, /* store_elt_extra_cost  */
+    13, /* vec_to_scalar_cost  */
+    4, /* scalar_to_vec_cost  */
+    6, /* align_load_cost  */
+    6, /* unalign_load_cost  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  13, /* clast_cost  */
+  13, /* fadda_f16_cost  */
+  13, /* fadda_f32_cost  */
+  13, /* fadda_f64_cost  */
+  64, /* gather_load_x32_cost  */
+  32, /* gather_load_x64_cost  */
+  1 /* scatter_store_elt_cost  */
+};
+
+static const struct cpu_vector_cost a64fx_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  5, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  3, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &a64fx_advsimd_vector_cost, /* advsimd  */
+  &a64fx_sve_vector_cost, /* sve  */
+  nullptr /* issue_info  */
+};
+
+static const cpu_prefetch_tune a64fx_prefetch_tune =
+{
+  8,			/* num_slots  */
+  64,			/* l1_cache_size  */
+  256,			/* l1_cache_line_size  */
+  32768,		/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params a64fx_tunings =
+{
+  &a64fx_extra_costs,
+  &a64fx_addrcost_table,
+  &a64fx_regmove_cost,
+  &a64fx_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_512, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  7, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "32",	/* function_align.  */
+  "16",	/* jump_align.  */
+  "32",	/* loop_align.  */
+  4,	/* int_reassoc_width.  */
+  2,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &a64fx_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_A64FX.  */
diff --git a/gcc/config/aarch64/tuning_models/ampere1.h b/gcc/config/aarch64/tuning_models/ampere1.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d2a1c696103259f23cf73df26cef9d4fa05ac73
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/ampere1.h
@@ -0,0 +1,113 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_AMPERE1
+#define GCC_AARCH64_H_AMPERE1
+
+#include "generic.h"
+
+static const advsimd_vec_cost ampere1_advsimd_vector_cost =
+{
+  1, /* int_stmt_cost  */
+  3, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  2, /* permute_cost  */
+  12, /* reduc_i8_cost  */
+  9, /* reduc_i16_cost  */
+  6, /* reduc_i32_cost  */
+  5, /* reduc_i64_cost  */
+  9, /* reduc_f16_cost  */
+  6, /* reduc_f32_cost  */
+  5, /* reduc_f64_cost  */
+  8, /* store_elt_extra_cost  */
+  6, /* vec_to_scalar_cost  */
+  7, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+/* Ampere-1 costs for vector insn classes.  */
+static const struct cpu_vector_cost ampere1_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  3, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &ampere1_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr  /* issue_info  */
+};
+
+static const cpu_prefetch_tune ampere1_prefetch_tune =
+{
+  0,			/* num_slots  */
+  64,			/* l1_cache_size  */
+  64,			/* l1_cache_line_size  */
+  2048,			/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params ampere1_tunings =
+{
+  &ampere1_extra_costs,
+  &generic_addrcost_table,
+  &generic_regmove_cost,
+  &ampere1_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
+   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
+   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
+   AARCH64_FUSE_CMP_BRANCH),
+  /* fusible_ops  */
+  "32",		/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  4,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &ampere1_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_AMPERE1.  */
diff --git a/gcc/config/aarch64/tuning_models/ampere1a.h b/gcc/config/aarch64/tuning_models/ampere1a.h
new file mode 100644
index 0000000000000000000000000000000000000000..c419ffb3c1a936a01690ad157c6c71dc645273c8
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/ampere1a.h
@@ -0,0 +1,65 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_AMPERE1A
+#define GCC_AARCH64_H_AMPERE1A
+
+#include "generic.h"
+
+static const struct tune_params ampere1a_tunings =
+{
+  &ampere1a_extra_costs,
+  &generic_addrcost_table,
+  &generic_regmove_cost,
+  &ampere1_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
+   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
+   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
+   AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
+   AARCH64_FUSE_ADDSUB_2REG_CONST1),
+  /* fusible_ops  */
+  "32",		/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &ampere1_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_AMPERE1A.  */
diff --git a/gcc/config/aarch64/tuning_models/cortexa35.h b/gcc/config/aarch64/tuning_models/cortexa35.h
new file mode 100644
index 0000000000000000000000000000000000000000..5534335348db96cc57fc9eccd7ff79a624cb528a
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa35.h
@@ -0,0 +1,62 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_CORTEXA35
+#define GCC_AARCH64_H_CORTEXA35
+
+#include "generic.h"
+#include "cortexa53.h"
+
+static const struct tune_params cortexa35_tunings =
+{
+  &cortexa53_extra_costs,
+  &generic_addrcost_table,
+  &cortexa53_regmove_cost,
+  &generic_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  1, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "4",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_CORTEXA35.  */
diff --git a/gcc/config/aarch64/tuning_models/cortexa53.h b/gcc/config/aarch64/tuning_models/cortexa53.h
new file mode 100644
index 0000000000000000000000000000000000000000..9dfdccc5968e7f062af5c78f153bfe3838263b0a
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa53.h
@@ -0,0 +1,71 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_CORTEXA53
+#define GCC_AARCH64_H_CORTEXA53
+
+#include "generic.h"
+
+static const struct cpu_regmove_cost cortexa53_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  5, /* GP2FP  */
+  5, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const struct tune_params cortexa53_tunings =
+{
+  &cortexa53_extra_costs,
+  &generic_addrcost_table,
+  &cortexa53_regmove_cost,
+  &generic_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  2, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "4",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_CORTEXA53.  */
diff --git a/gcc/config/aarch64/tuning_models/cortexa57.h b/gcc/config/aarch64/tuning_models/cortexa57.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c4789d57833a5879dda8e2fe454ac5f56cb0601
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa57.h
@@ -0,0 +1,109 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_CORTEXA57
+#define GCC_AARCH64_H_CORTEXA57
+
+#include "generic.h"
+
+static const struct cpu_regmove_cost cortexa57_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  5, /* GP2FP  */
+  5, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  8, /* reduc_i8_cost  */
+  8, /* reduc_i16_cost  */
+  8, /* reduc_i32_cost  */
+  8, /* reduc_i64_cost  */
+  8, /* reduc_f16_cost  */
+  8, /* reduc_f32_cost  */
+  8, /* reduc_f64_cost  */
+  8, /* store_elt_extra_cost  */
+  8, /* vec_to_scalar_cost  */
+  8, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+/* Cortex-A57 costs for vector insn classes.  */
+static const struct cpu_vector_cost cortexa57_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &cortexa57_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+static const struct tune_params cortexa57_tunings =
+{
+  &cortexa57_extra_costs,
+  &generic_addrcost_table,
+  &cortexa57_regmove_cost,
+  &cortexa57_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "4",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_CORTEXA57.  */
diff --git a/gcc/config/aarch64/tuning_models/cortexa72.h b/gcc/config/aarch64/tuning_models/cortexa72.h
new file mode 100644
index 0000000000000000000000000000000000000000..968171c9b2e898d7479dbcb462e33fe3905e183d
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa72.h
@@ -0,0 +1,61 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_CORTEXA72
+#define GCC_AARCH64_H_CORTEXA72
+
+#include "generic.h"
+
+static const struct tune_params cortexa72_tunings =
+{
+  &cortexa57_extra_costs,
+  &generic_addrcost_table,
+  &cortexa57_regmove_cost,
+  &cortexa57_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "4",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_CORTEXA72.  */
diff --git a/gcc/config/aarch64/tuning_models/cortexa73.h b/gcc/config/aarch64/tuning_models/cortexa73.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d1a504ddac39604dd193ce0f434fd2f5145c129
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/cortexa73.h
@@ -0,0 +1,62 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_CORTEXA73
+#define GCC_AARCH64_H_CORTEXA73
+
+#include "generic.h"
+
+static const struct tune_params cortexa73_tunings =
+{
+  &cortexa57_extra_costs,
+  &generic_addrcost_table,
+  &cortexa57_regmove_cost,
+  &cortexa57_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  2, /* issue_rate.  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "4",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+
+#endif /* GCC_AARCH64_H_CORTEXA73.  */
diff --git a/gcc/config/aarch64/tuning_models/emag.h b/gcc/config/aarch64/tuning_models/emag.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f3402c3fc2a94704eeaf9223ecb0ca1c057cace
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/emag.h
@@ -0,0 +1,60 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_EMAG
+#define GCC_AARCH64_H_EMAG
+
+#include "generic.h"
+
+static const struct tune_params emag_tunings =
+{
+  &xgene1_extra_costs,
+  &xgene1_addrcost_table,
+  &xgene1_regmove_cost,
+  &xgene1_vector_cost,
+  &generic_branch_cost,
+  &xgene1_approx_modes,
+  SVE_NOT_IMPLEMENTED,
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  AARCH64_FUSE_NOTHING, /* fusible_ops  */
+  "16",	/* function_align.  */
+  "16",	/* jump_align.  */
+  "16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  17,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
+  &xgene1_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_EMAG.  */
diff --git a/gcc/config/aarch64/tuning_models/exynosm1.h b/gcc/config/aarch64/tuning_models/exynosm1.h
new file mode 100644
index 0000000000000000000000000000000000000000..a42ea4df97f3f048c41481c304fd3684a69d743b
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/exynosm1.h
@@ -0,0 +1,144 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_EXYNOSM1
+#define GCC_AARCH64_H_EXYNOSM1
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table exynosm1_addrcost_table =
+{
+    {
+      0, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      2, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* post_modify_ld3_st3  */
+  0, /* post_modify_ld4_st4  */
+  1, /* register_offset  */
+  1, /* register_sextend  */
+  2, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost exynosm1_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost (the actual costs are 4 and 9).  */
+  9, /* GP2FP  */
+  9, /* FP2GP  */
+  1 /* FP2FP  */
+};
+
+static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
+{
+  3, /* int_stmt_cost  */
+  3, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  3, /* reduc_i8_cost  */
+  3, /* reduc_i16_cost  */
+  3, /* reduc_i32_cost  */
+  3, /* reduc_i64_cost  */
+  3, /* reduc_f16_cost  */
+  3, /* reduc_f32_cost  */
+  3, /* reduc_f64_cost  */
+  3, /* store_elt_extra_cost  */
+  3, /* vec_to_scalar_cost  */
+  3, /* scalar_to_vec_cost  */
+  5, /* align_load_cost  */
+  5, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const struct cpu_vector_cost exynosm1_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  5, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &exynosm1_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+/* Approximation modes for Exynos M1.  */
+static const cpu_approx_modes exynosm1_approx_modes =
+{
+  AARCH64_APPROX_NONE,	/* division  */
+  AARCH64_APPROX_ALL,	/* sqrt  */
+  AARCH64_APPROX_ALL	/* recip_sqrt  */
+};
+
+static const cpu_prefetch_tune exynosm1_prefetch_tune =
+{
+  0,			/* num_slots  */
+  -1,			/* l1_cache_size  */
+  64,			/* l1_cache_line_size  */
+  -1,			/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params exynosm1_tunings =
+{
+  &exynosm1_extra_costs,
+  &exynosm1_addrcost_table,
+  &exynosm1_regmove_cost,
+  &exynosm1_vector_cost,
+  &generic_branch_cost,
+  &exynosm1_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3,	/* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
+  "4",	/* function_align.  */
+  "4",	/* jump_align.  */
+  "4",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  48,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
+  &exynosm1_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_EXYNOSM1.  */
diff --git a/gcc/config/aarch64/tuning_models/generic.h b/gcc/config/aarch64/tuning_models/generic.h
new file mode 100644
index 0000000000000000000000000000000000000000..deb2c1cffe255bddcb5be571b12086442782da60
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/generic.h
@@ -0,0 +1,190 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+   Contributed by ARM Ltd.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_GENERIC
+#define GCC_AARCH64_H_GENERIC
+
+static const struct cpu_addrcost_table generic_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* post_modify_ld3_st3  */
+  0, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost generic_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  5, /* GP2FP  */
+  5, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+/* Generic costs for Advanced SIMD vector operations.   */
+static const advsimd_vec_cost generic_advsimd_vector_cost =
+{
+  1, /* int_stmt_cost  */
+  1, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  2, /* permute_cost  */
+  2, /* reduc_i8_cost  */
+  2, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  2, /* reduc_f16_cost  */
+  2, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
+  2, /* store_elt_extra_cost  */
+  2, /* vec_to_scalar_cost  */
+  1, /* scalar_to_vec_cost  */
+  1, /* align_load_cost  */
+  1, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+/* Generic costs for SVE vector operations.  */
+static const sve_vec_cost generic_sve_vector_cost =
+{
+  {
+    1, /* int_stmt_cost  */
+    1, /* fp_stmt_cost  */
+    0, /* ld2_st2_permute_cost  */
+    0, /* ld3_st3_permute_cost  */
+    0, /* ld4_st4_permute_cost  */
+    2, /* permute_cost  */
+    2, /* reduc_i8_cost  */
+    2, /* reduc_i16_cost  */
+    2, /* reduc_i32_cost  */
+    2, /* reduc_i64_cost  */
+    2, /* reduc_f16_cost  */
+    2, /* reduc_f32_cost  */
+    2, /* reduc_f64_cost  */
+    2, /* store_elt_extra_cost  */
+    2, /* vec_to_scalar_cost  */
+    1, /* scalar_to_vec_cost  */
+    1, /* align_load_cost  */
+    1, /* unalign_load_cost  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  2, /* clast_cost  */
+  2, /* fadda_f16_cost  */
+  2, /* fadda_f32_cost  */
+  2, /* fadda_f64_cost  */
+  4, /* gather_load_x32_cost  */
+  2, /* gather_load_x64_cost  */
+  1 /* scatter_store_elt_cost  */
+};
+
+/* Generic costs for vector insn classes.  */
+static const struct cpu_vector_cost generic_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  1, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  3, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &generic_advsimd_vector_cost, /* advsimd  */
+  &generic_sve_vector_cost, /* sve */
+  nullptr /* issue_info  */
+};
+
+/* Generic costs for branch instructions.  */
+static const struct cpu_branch_cost generic_branch_cost =
+{
+  1,  /* Predictable.  */
+  3   /* Unpredictable.  */
+};
+
+/* Generic approximation modes.  */
+static const cpu_approx_modes generic_approx_modes =
+{
+  AARCH64_APPROX_NONE,	/* division  */
+  AARCH64_APPROX_NONE,	/* sqrt  */
+  AARCH64_APPROX_NONE	/* recip_sqrt  */
+};
+
+/* Generic prefetch settings (which disable prefetch).  */
+static const cpu_prefetch_tune generic_prefetch_tune =
+{
+  0,			/* num_slots  */
+  -1,			/* l1_cache_size  */
+  -1,			/* l1_cache_line_size  */
+  -1,			/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  -1			/* default_opt_level  */
+};
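+
+/* The -1 entries above are best read as "no CPU-specific value": the
+   option-override code in aarch64.cc only feeds a field into the
+   corresponding --param (l1-cache-size, l1-cache-line-size, and so on)
+   when the tuning table supplies a meaningful value, so this table leaves
+   software prefetching at the compiler defaults.  */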
+
+static const struct tune_params generic_tunings =
+{
+  &cortexa57_extra_costs,
+  &generic_addrcost_table,
+  &generic_regmove_cost,
+  &generic_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  2, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "16:12",	/* function_align.  */
+  "4",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
+     Neoverse V1.  It does not have a noticeable effect on A64FX and should
+     have at most a very minor effect on SVE2 cores.  */
+  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_GENERIC.  */
diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
new file mode 100644
index 0000000000000000000000000000000000000000..50d7b23712cc6a8be8f35246657ec5d86d6d4191
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
@@ -0,0 +1,164 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_NEOVERSE512TVB
+#define GCC_AARCH64_H_NEOVERSE512TVB
+
+#include "generic.h"
+
+static const sve_vec_cost neoverse512tvb_sve_vector_cost =
+{
+  {
+    2, /* int_stmt_cost  */
+    2, /* fp_stmt_cost  */
+    4, /* ld2_st2_permute_cost  */
+    5, /* ld3_st3_permute_cost  */
+    5, /* ld4_st4_permute_cost  */
+    3, /* permute_cost  */
+    /* Theoretically, a reduction involving 15 scalar ADDs could
+       complete in ~5 cycles and would have a cost of 15.  Assume that
+       [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6.  */
+    21, /* reduc_i8_cost  */
+    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
+    13, /* reduc_i16_cost  */
+    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
+    9, /* reduc_i32_cost  */
+    /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7.  */
+    8, /* reduc_i64_cost  */
+    /* Theoretically, a reduction involving 7 scalar FADDs could
+       complete in ~6 cycles and would have a cost of 14.  Assume that
+       FADDV completes in 8 cycles and so give it a cost of 14 + 2.  */
+    16, /* reduc_f16_cost  */
+    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
+    8, /* reduc_f32_cost  */
+    /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2.  */
+    4, /* reduc_f64_cost  */
+    2, /* store_elt_extra_cost  */
+    /* This value is just inherited from the Cortex-A57 table.  */
+    8, /* vec_to_scalar_cost  */
+    /* This depends very much on what the scalar value is and
+       where it comes from.  E.g. some constants take two dependent
+       instructions or a load, while others might be moved from a GPR.
+       4 seems to be a reasonable compromise in practice.  */
+    4, /* scalar_to_vec_cost  */
+    4, /* align_load_cost  */
+    4, /* unalign_load_cost  */
+    /* Although stores generally have a latency of 2 and compete for the
+       vector pipes, in practice it's better not to model that.  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  3, /* clast_cost  */
+  10, /* fadda_f16_cost  */
+  6, /* fadda_f32_cost  */
+  4, /* fadda_f64_cost  */
+  /* A strided Advanced SIMD x64 load would take two parallel FP loads
+     (6 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
+     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
+     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
+     (cost 2) to that, to avoid the difference being lost in rounding.
+
+     There is no easy comparison between a strided Advanced SIMD x32 load
+     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
+     operation more than a 64-bit gather.  */
+  14, /* gather_load_x32_cost  */
+  12, /* gather_load_x64_cost  */
+  3 /* scatter_store_elt_cost  */
+};
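+
+/* For reference, the gather numbers above follow directly from the comment
+   in the table: the Advanced SIMD x64 baseline is costed as two scalar
+   loads (2 * 4 = 8) plus a vec_construct (2) plus one full vector
+   operation (2), giving gather_load_x64_cost = 12, and the 32-bit gather
+   is costed as one vector operation more, giving
+   gather_load_x32_cost = 14.  */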
+
+static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
+{
+  {
+    {
+      3, /* loads_per_cycle  */
+      2, /* stores_per_cycle  */
+      4, /* general_ops_per_cycle  */
+      0, /* fp_simd_load_general_ops  */
+      1 /* fp_simd_store_general_ops  */
+    },
+    2, /* ld2_st2_general_ops  */
+    2, /* ld3_st3_general_ops  */
+    3 /* ld4_st4_general_ops  */
+  },
+  2, /* pred_ops_per_cycle  */
+  2, /* while_pred_ops  */
+  2, /* int_cmp_pred_ops  */
+  1, /* fp_cmp_pred_ops  */
+  1, /* gather_scatter_pair_general_ops  */
+  1 /* gather_scatter_pair_pred_ops  */
+};
+
+static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
+{
+  &neoversev1_scalar_issue_info,
+  &neoversev1_advsimd_issue_info,
+  &neoverse512tvb_sve_issue_info
+};
+
+static const struct cpu_vector_cost neoverse512tvb_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  2, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &neoversev1_advsimd_vector_cost, /* advsimd  */
+  &neoverse512tvb_sve_vector_cost, /* sve  */
+  &neoverse512tvb_vec_issue_info /* issue_info  */
+};
+
+static const struct tune_params neoverse512tvb_tunings =
+{
+  &cortexa76_extra_costs,
+  &neoversev1_addrcost_table,
+  &neoversev1_regmove_cost,
+  &neoverse512tvb_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_128 | SVE_256, /* sve_width  */
+  { 4, /* load_int.  */
+    2, /* store_int.  */
+    6, /* load_fp.  */
+    2, /* store_fp.  */
+    6, /* load_pred.  */
+    1 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "32:16",	/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  4,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_NEOVERSE512TVB.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversen1.h b/gcc/config/aarch64/tuning_models/neoversen1.h
new file mode 100644
index 0000000000000000000000000000000000000000..132166d3d06430b725e4448937332cc159c11cda
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoversen1.h
@@ -0,0 +1,60 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_NEOVERSEN1
+#define GCC_AARCH64_H_NEOVERSEN1
+
+#include "generic.h"
+
+static const struct tune_params neoversen1_tunings =
+{
+  &cortexa76_extra_costs,
+  &generic_addrcost_table,
+  &generic_regmove_cost,
+  &cortexa57_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    2, /* store_int.  */
+    5, /* load_fp.  */
+    2, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "32:16",	/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_NEOVERSEN1.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h
new file mode 100644
index 0000000000000000000000000000000000000000..395a6d82b8403e586bf179cade055543cf9b9eb0
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoversen2.h
@@ -0,0 +1,245 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_NEOVERSEN2
+#define GCC_AARCH64_H_NEOVERSEN2
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table neoversen2_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  2, /* post_modify_ld3_st3  */
+  2, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost neoversen2_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Spilling to int<->fp instead of memory is recommended so set
+     realistic costs compared to memmov_cost.  */
+  3, /* GP2FP  */
+  2, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  2, /* ld2_st2_permute_cost */
+  2, /* ld3_st3_permute_cost  */
+  3, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  4, /* reduc_i8_cost  */
+  4, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  6, /* reduc_f16_cost  */
+  4, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
+  2, /* store_elt_extra_cost  */
+  /* This value is just inherited from the Cortex-A57 table.  */
+  8, /* vec_to_scalar_cost  */
+  /* This depends very much on what the scalar value is and
+     where it comes from.  E.g. some constants take two dependent
+     instructions or a load, while others might be moved from a GPR.
+     4 seems to be a reasonable compromise in practice.  */
+  4, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  /* Although stores have a latency of 2 and compete for the
+     vector pipes, in practice it's better not to model that.  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const sve_vec_cost neoversen2_sve_vector_cost =
+{
+  {
+    2, /* int_stmt_cost  */
+    2, /* fp_stmt_cost  */
+    3, /* ld2_st2_permute_cost  */
+    4, /* ld3_st3_permute_cost  */
+    4, /* ld4_st4_permute_cost  */
+    3, /* permute_cost  */
+    /* Theoretically, a reduction involving 15 scalar ADDs could
+       complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
+       completes in 11 cycles, so give it a cost of 15 + 6.  */
+    21, /* reduc_i8_cost  */
+    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
+    13, /* reduc_i16_cost  */
+    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
+    9, /* reduc_i32_cost  */
+    /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1.  */
+    2, /* reduc_i64_cost  */
+    /* Theoretically, a reduction involving 7 scalar FADDs could
+       complete in ~8 cycles and would have a cost of 14.  FADDV
+       completes in 6 cycles, so give it a cost of 14 - 2.  */
+    12, /* reduc_f16_cost  */
+    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0.  */
+    6, /* reduc_f32_cost  */
+    /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0.  */
+    2, /* reduc_f64_cost  */
+    2, /* store_elt_extra_cost  */
+    /* This value is just inherited from the Cortex-A57 table.  */
+    8, /* vec_to_scalar_cost  */
+    /* See the comment above the Advanced SIMD versions.  */
+    4, /* scalar_to_vec_cost  */
+    4, /* align_load_cost  */
+    4, /* unalign_load_cost  */
+    /* Although stores have a latency of 2 and compete for the
+       vector pipes, in practice it's better not to model that.  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  3, /* clast_cost  */
+  10, /* fadda_f16_cost  */
+  6, /* fadda_f32_cost  */
+  4, /* fadda_f64_cost  */
+  /* A strided Advanced SIMD x64 load would take two parallel FP loads
+     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
+     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
+     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
+     (cost 2) to that, to avoid the difference being lost in rounding.
+
+     There is no easy comparison between a strided Advanced SIMD x32 load
+     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
+     operation more than a 64-bit gather.  */
+  14, /* gather_load_x32_cost  */
+  12, /* gather_load_x64_cost  */
+  3 /* scatter_store_elt_cost  */
+};
+
+static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
+{
+  3, /* loads_stores_per_cycle  */
+  2, /* stores_per_cycle  */
+  4, /* general_ops_per_cycle  */
+  0, /* fp_simd_load_general_ops  */
+  1 /* fp_simd_store_general_ops  */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
+{
+  {
+    3, /* loads_stores_per_cycle  */
+    2, /* stores_per_cycle  */
+    2, /* general_ops_per_cycle  */
+    0, /* fp_simd_load_general_ops  */
+    1 /* fp_simd_store_general_ops  */
+  },
+  2, /* ld2_st2_general_ops  */
+  2, /* ld3_st3_general_ops  */
+  3 /* ld4_st4_general_ops  */
+};
+
+static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
+{
+  {
+    {
+      3, /* loads_per_cycle  */
+      2, /* stores_per_cycle  */
+      2, /* general_ops_per_cycle  */
+      0, /* fp_simd_load_general_ops  */
+      1 /* fp_simd_store_general_ops  */
+    },
+    2, /* ld2_st2_general_ops  */
+    3, /* ld3_st3_general_ops  */
+    3 /* ld4_st4_general_ops  */
+  },
+  2, /* pred_ops_per_cycle  */
+  2, /* while_pred_ops  */
+  2, /* int_cmp_pred_ops  */
+  1, /* fp_cmp_pred_ops  */
+  1, /* gather_scatter_pair_general_ops  */
+  1 /* gather_scatter_pair_pred_ops  */
+};
+
+static const aarch64_vec_issue_info neoversen2_vec_issue_info =
+{
+  &neoversen2_scalar_issue_info,
+  &neoversen2_advsimd_issue_info,
+  &neoversen2_sve_issue_info
+};
+
+/* Neoverse N2 costs for vector insn classes.  */
+static const struct cpu_vector_cost neoversen2_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  2, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &neoversen2_advsimd_vector_cost, /* advsimd  */
+  &neoversen2_sve_vector_cost, /* sve  */
+  &neoversen2_vec_issue_info /* issue_info  */
+};
+
+static const struct tune_params neoversen2_tunings =
+{
+  &cortexa76_extra_costs,
+  &neoversen2_addrcost_table,
+  &neoversen2_regmove_cost,
+  &neoversen2_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_128, /* sve_width  */
+  { 4, /* load_int.  */
+    1, /* store_int.  */
+    6, /* load_fp.  */
+    2, /* store_fp.  */
+    6, /* load_pred.  */
+    1 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "32:16",	/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
+   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_NEOVERSEN2.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
new file mode 100644
index 0000000000000000000000000000000000000000..584a5000e06f598dcdd3bcc533dc6dbc642223ca
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoversev1.h
@@ -0,0 +1,237 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_NEOVERSEV1
+#define GCC_AARCH64_H_NEOVERSEV1
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table neoversev1_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  3, /* post_modify_ld3_st3  */
+  3, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost neoversev1_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Spilling to int<->fp instead of memory is recommended so set
+     realistic costs compared to memmov_cost.  */
+  3, /* GP2FP  */
+  2, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  4, /* ld2_st2_permute_cost */
+  4, /* ld3_st3_permute_cost  */
+  5, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  4, /* reduc_i8_cost  */
+  4, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  6, /* reduc_f16_cost  */
+  3, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
+  2, /* store_elt_extra_cost  */
+  /* This value is just inherited from the Cortex-A57 table.  */
+  8, /* vec_to_scalar_cost  */
+  /* This depends very much on what the scalar value is and
+     where it comes from.  E.g. some constants take two dependent
+     instructions or a load, while others might be moved from a GPR.
+     4 seems to be a reasonable compromise in practice.  */
+  4, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  /* Although stores have a latency of 2 and compete for the
+     vector pipes, in practice it's better not to model that.  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const sve_vec_cost neoversev1_sve_vector_cost =
+{
+  {
+    2, /* int_stmt_cost  */
+    2, /* fp_stmt_cost  */
+    4, /* ld2_st2_permute_cost  */
+    7, /* ld3_st3_permute_cost  */
+    8, /* ld4_st4_permute_cost  */
+    3, /* permute_cost  */
+    /* Theoretically, a reduction involving 31 scalar ADDs could
+       complete in ~9 cycles and would have a cost of 31.  [SU]ADDV
+       completes in 14 cycles, so give it a cost of 31 + 5.  */
+    36, /* reduc_i8_cost  */
+    /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7.  */
+    22, /* reduc_i16_cost  */
+    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7.  */
+    14, /* reduc_i32_cost  */
+    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8.  */
+    11, /* reduc_i64_cost  */
+    /* Theoretically, a reduction involving 15 scalar FADDs could
+       complete in ~9 cycles and would have a cost of 30.  FADDV
+       completes in 13 cycles, so give it a cost of 30 + 4.  */
+    34, /* reduc_f16_cost  */
+    /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5.  */
+    19, /* reduc_f32_cost  */
+    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5.  */
+    11, /* reduc_f64_cost  */
+    2, /* store_elt_extra_cost  */
+    /* This value is just inherited from the Cortex-A57 table.  */
+    8, /* vec_to_scalar_cost  */
+    /* See the comment above the Advanced SIMD versions.  */
+    4, /* scalar_to_vec_cost  */
+    4, /* align_load_cost  */
+    4, /* unalign_load_cost  */
+    /* Although stores have a latency of 2 and compete for the
+       vector pipes, in practice it's better not to model that.  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  3, /* clast_cost  */
+  19, /* fadda_f16_cost  */
+  11, /* fadda_f32_cost  */
+  8, /* fadda_f64_cost  */
+  32, /* gather_load_x32_cost  */
+  16, /* gather_load_x64_cost  */
+  3 /* scatter_store_elt_cost  */
+};
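+
+/* A minimal sketch of the arithmetic behind the reduction costs above,
+   restated as one formula (the latencies are the ones quoted in the
+   per-entry comments and the scalar statement costs are the ones given
+   in neoversev1_vector_cost below):
+
+     reduc_cost = n_scalar_ops * scalar_stmt_cost
+                  + (vector_reduction_latency - scalar_chain_latency)
+
+   e.g. reduc_i8_cost for a 256-bit vector: 31 ADDs * 1 + (14 - 9) = 36,
+   and reduc_f16_cost: 15 FADDs * 2 + (13 - 9) = 34.  */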
+
+static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
+{
+  3, /* loads_stores_per_cycle  */
+  2, /* stores_per_cycle  */
+  4, /* general_ops_per_cycle  */
+  0, /* fp_simd_load_general_ops  */
+  1 /* fp_simd_store_general_ops  */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
+{
+  {
+    3, /* loads_stores_per_cycle  */
+    2, /* stores_per_cycle  */
+    4, /* general_ops_per_cycle  */
+    0, /* fp_simd_load_general_ops  */
+    1 /* fp_simd_store_general_ops  */
+  },
+  2, /* ld2_st2_general_ops  */
+  2, /* ld3_st3_general_ops  */
+  3 /* ld4_st4_general_ops  */
+};
+
+static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
+{
+  {
+    {
+      2, /* loads_per_cycle  */
+      2, /* stores_per_cycle  */
+      2, /* general_ops_per_cycle  */
+      0, /* fp_simd_load_general_ops  */
+      1 /* fp_simd_store_general_ops  */
+    },
+    2, /* ld2_st2_general_ops  */
+    2, /* ld3_st3_general_ops  */
+    3 /* ld4_st4_general_ops  */
+  },
+  1, /* pred_ops_per_cycle  */
+  2, /* while_pred_ops  */
+  2, /* int_cmp_pred_ops  */
+  1, /* fp_cmp_pred_ops  */
+  1, /* gather_scatter_pair_general_ops  */
+  1 /* gather_scatter_pair_pred_ops  */
+};
+
+static const aarch64_vec_issue_info neoversev1_vec_issue_info =
+{
+  &neoversev1_scalar_issue_info,
+  &neoversev1_advsimd_issue_info,
+  &neoversev1_sve_issue_info
+};
+
+/* Neoverse V1 costs for vector insn classes.  */
+static const struct cpu_vector_cost neoversev1_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  2, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &neoversev1_advsimd_vector_cost, /* advsimd  */
+  &neoversev1_sve_vector_cost, /* sve  */
+  &neoversev1_vec_issue_info /* issue_info  */
+};
+
+static const struct tune_params neoversev1_tunings =
+{
+  &cortexa76_extra_costs,
+  &neoversev1_addrcost_table,
+  &neoversev1_regmove_cost,
+  &neoversev1_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_256, /* sve_width  */
+  { 4, /* load_int.  */
+    2, /* store_int.  */
+    6, /* load_fp.  */
+    2, /* store_fp.  */
+    6, /* load_pred.  */
+    1 /* store_pred.  */
+  }, /* memmov_cost.  */
+  3, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "32:16",	/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  4,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
+   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
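+
+/* Note on the alignment strings above: they use GCC's -falign-* "n[:m]"
+   syntax, so "32:16" requests a 32-byte boundary with the second number
+   bounding how many padding bytes may be inserted to reach it, while a
+   bare "4" is a plain 4-byte alignment.  */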
+
+
+#endif /* GCC_AARCH64_H_NEOVERSEV1.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
new file mode 100644
index 0000000000000000000000000000000000000000..28d4244ef4c99ecdffb7408e39dc21bc191223de
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -0,0 +1,245 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_NEOVERSEV2
+#define GCC_AARCH64_H_NEOVERSEV2
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table neoversev2_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  2, /* post_modify_ld3_st3  */
+  2, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost neoversev2_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Spilling via int<->fp register moves instead of to memory is
+     recommended, so set realistic costs compared to memmov_cost.  */
+  3, /* GP2FP  */
+  2, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  2, /* ld2_st2_permute_cost */
+  2, /* ld3_st3_permute_cost  */
+  3, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  4, /* reduc_i8_cost  */
+  4, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  6, /* reduc_f16_cost  */
+  3, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
+  2, /* store_elt_extra_cost  */
+  /* This value is just inherited from the Cortex-A57 table.  */
+  8, /* vec_to_scalar_cost  */
+  /* This depends very much on what the scalar value is and
+     where it comes from.  E.g. some constants take two dependent
+     instructions or a load, while others might be moved from a GPR.
+     4 seems to be a reasonable compromise in practice.  */
+  4, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  /* Although stores have a latency of 2 and compete for the
+     vector pipes, in practice it's better not to model that.  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const sve_vec_cost neoversev2_sve_vector_cost =
+{
+  {
+    2, /* int_stmt_cost  */
+    2, /* fp_stmt_cost  */
+    3, /* ld2_st2_permute_cost  */
+    3, /* ld3_st3_permute_cost  */
+    4, /* ld4_st4_permute_cost  */
+    3, /* permute_cost  */
+    /* Theoretically, a reduction involving 15 scalar ADDs could
+       complete in ~3 cycles and would have a cost of 15.  [SU]ADDV
+       completes in 11 cycles, so give it a cost of 15 + 8.  */
+    21, /* reduc_i8_cost  */
+    /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7.  */
+    14, /* reduc_i16_cost  */
+    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4.  */
+    7, /* reduc_i32_cost  */
+    /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
+    2, /* reduc_i64_cost  */
+    /* Theoretically, a reduction involving 7 scalar FADDs could
+       complete in ~6 cycles and would have a cost of 14.  FADDV
+       completes in 8 cycles, so give it a cost of 14 + 2.  */
+    16, /* reduc_f16_cost  */
+    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
+    8, /* reduc_f32_cost  */
+    /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2.  */
+    4, /* reduc_f64_cost  */
+    2, /* store_elt_extra_cost  */
+    /* This value is just inherited from the Cortex-A57 table.  */
+    8, /* vec_to_scalar_cost  */
+    /* See the comment above the Advanced SIMD versions.  */
+    4, /* scalar_to_vec_cost  */
+    4, /* align_load_cost  */
+    4, /* unalign_load_cost  */
+    /* Although stores have a latency of 2 and compete for the
+       vector pipes, in practice it's better not to model that.  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  3, /* clast_cost  */
+  10, /* fadda_f16_cost  */
+  6, /* fadda_f32_cost  */
+  4, /* fadda_f64_cost  */
+  /* A strided Advanced SIMD x64 load would take two parallel FP loads
+     (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
+     is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
+     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
+     (cost 2) to that, to avoid the difference being lost in rounding.
+
+     There is no easy comparison between a strided Advanced SIMD x32 load
+     and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
+     operation more than a 64-bit gather.  */
+  14, /* gather_load_x32_cost  */
+  12, /* gather_load_x64_cost  */
+  3 /* scatter_store_elt_cost  */
+};
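+
+/* Worked arithmetic for the gather costs above, assuming the scalar load
+   cost of 4 given in neoversev2_vector_cost below: two scalar loads
+   (2 * 4 = 8) plus a vec_construct (2) plus one full vector operation (2)
+   give the gather_load_x64_cost of 12, and one further vector operation
+   gives the gather_load_x32_cost of 14.  */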
+
+static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
+{
+  3, /* loads_stores_per_cycle  */
+  2, /* stores_per_cycle  */
+  6, /* general_ops_per_cycle  */
+  0, /* fp_simd_load_general_ops  */
+  1 /* fp_simd_store_general_ops  */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
+{
+  {
+    3, /* loads_stores_per_cycle  */
+    2, /* stores_per_cycle  */
+    4, /* general_ops_per_cycle  */
+    0, /* fp_simd_load_general_ops  */
+    1 /* fp_simd_store_general_ops  */
+  },
+  2, /* ld2_st2_general_ops  */
+  2, /* ld3_st3_general_ops  */
+  3 /* ld4_st4_general_ops  */
+};
+
+static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
+{
+  {
+    {
+      3, /* loads_per_cycle  */
+      2, /* stores_per_cycle  */
+      4, /* general_ops_per_cycle  */
+      0, /* fp_simd_load_general_ops  */
+      1 /* fp_simd_store_general_ops  */
+    },
+    2, /* ld2_st2_general_ops  */
+    3, /* ld3_st3_general_ops  */
+    3 /* ld4_st4_general_ops  */
+  },
+  2, /* pred_ops_per_cycle  */
+  2, /* while_pred_ops  */
+  2, /* int_cmp_pred_ops  */
+  1, /* fp_cmp_pred_ops  */
+  1, /* gather_scatter_pair_general_ops  */
+  1 /* gather_scatter_pair_pred_ops  */
+};
+
+static const aarch64_vec_issue_info neoversev2_vec_issue_info =
+{
+  &neoversev2_scalar_issue_info,
+  &neoversev2_advsimd_issue_info,
+  &neoversev2_sve_issue_info
+};
+
+/* Neoverse V2 (code-named Demeter) costs for vector insn classes.  */
+static const struct cpu_vector_cost neoversev2_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  2, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &neoversev2_advsimd_vector_cost, /* advsimd  */
+  &neoversev2_sve_vector_cost, /* sve  */
+  &neoversev2_vec_issue_info /* issue_info  */
+};
+
+static const struct tune_params neoversev2_tunings =
+{
+  &cortexa76_extra_costs,
+  &neoversev2_addrcost_table,
+  &neoversev2_regmove_cost,
+  &neoversev2_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_128, /* sve_width  */
+  { 4, /* load_int.  */
+    2, /* store_int.  */
+    6, /* load_fp.  */
+    1, /* store_fp.  */
+    6, /* load_pred.  */
+    2 /* store_pred.  */
+  }, /* memmov_cost.  */
+  5, /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  "32:16",	/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  3,	/* int_reassoc_width.  */
+  6,	/* fp_reassoc_width.  */
+  4,	/* fma_reassoc_width.  */
+  3,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
+   | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_NEOVERSEV2.  */
diff --git a/gcc/config/aarch64/tuning_models/qdf24xx.h b/gcc/config/aarch64/tuning_models/qdf24xx.h
new file mode 100644
index 0000000000000000000000000000000000000000..29c9b9f5843acc15450a2492b141c02ee48a3f13
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/qdf24xx.h
@@ -0,0 +1,137 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_QDF24XX
+#define GCC_AARCH64_H_QDF24XX
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table qdf24xx_addrcost_table =
+{
+    {
+      1, /* hi  */
+      1, /* si  */
+      1, /* di  */
+      2, /* ti  */
+    },
+  1, /* pre_modify  */
+  1, /* post_modify  */
+  1, /* post_modify_ld3_st3  */
+  1, /* post_modify_ld4_st4  */
+  3, /* register_offset  */
+  3, /* register_sextend  */
+  3, /* register_zextend  */
+  2, /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost qdf24xx_regmove_cost =
+{
+  2, /* GP2GP  */
+  /* Avoid the use of int<->fp moves for spilling.  */
+  6, /* GP2FP  */
+  6, /* FP2GP  */
+  4 /* FP2FP  */
+};
+
+static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
+{
+  1, /* int_stmt_cost  */
+  3, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  2, /* permute_cost  */
+  1, /* reduc_i8_cost  */
+  1, /* reduc_i16_cost  */
+  1, /* reduc_i32_cost  */
+  1, /* reduc_i64_cost  */
+  1, /* reduc_f16_cost  */
+  1, /* reduc_f32_cost  */
+  1, /* reduc_f64_cost  */
+  1, /* store_elt_extra_cost  */
+  1, /* vec_to_scalar_cost  */
+  1, /* scalar_to_vec_cost  */
+  1, /* align_load_cost  */
+  1, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+/* QDF24XX costs for vector insn classes.  */
+static const struct cpu_vector_cost qdf24xx_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  1, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  3, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &qdf24xx_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+static const cpu_prefetch_tune qdf24xx_prefetch_tune =
+{
+  4,			/* num_slots  */
+  32,			/* l1_cache_size  */
+  64,			/* l1_cache_line_size  */
+  512,			/* l2_cache_size  */
+  false,		/* prefetch_dynamic_strides */
+  2048,			/* minimum_stride */
+  3			/* default_opt_level  */
+};
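+
+/* The prefetch_dynamic_strides and minimum_stride fields above feed the
+   --param prefetch-dynamic-strides and --param prefetch-minimum-stride
+   defaults, so with dynamic strides disabled and minimum_stride at 2048,
+   software prefetches are only issued for strides that are known at
+   compile time and are at least 2 KiB.  */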
+
+static const struct tune_params qdf24xx_tunings =
+{
+  &qdf24xx_extra_costs,
+  &qdf24xx_addrcost_table,
+  &qdf24xx_regmove_cost,
+  &qdf24xx_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "8",	/* jump_align.  */
+  "16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
+  &qdf24xx_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_QDF24XX.  */
diff --git a/gcc/config/aarch64/tuning_models/saphira.h b/gcc/config/aarch64/tuning_models/saphira.h
new file mode 100644
index 0000000000000000000000000000000000000000..e584d316bb7c3c2d232cf7623a92100ad261f07d
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/saphira.h
@@ -0,0 +1,63 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_SAPHIRA
+#define GCC_AARCH64_H_SAPHIRA
+
+#include "generic.h"
+
+/* Tuning structure for the Qualcomm Saphira core.  Currently defaults to
+   the generic cost tables below.  */
+static const struct tune_params saphira_tunings =
+{
+  &generic_extra_costs,
+  &generic_addrcost_table,
+  &generic_regmove_cost,
+  &generic_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "8",	/* jump_align.  */
+  "16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
+  &generic_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_SAPHIRA.  */
diff --git a/gcc/config/aarch64/tuning_models/thunderx.h b/gcc/config/aarch64/tuning_models/thunderx.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd4b9d539fc5cf2bd20d84e91d6b72fa7237f99f
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/thunderx.h
@@ -0,0 +1,117 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_THUNDERX
+#define GCC_AARCH64_H_THUNDERX
+
+#include "generic.h"
+
+static const struct cpu_regmove_cost thunderx_regmove_cost =
+{
+  2, /* GP2GP  */
+  2, /* GP2FP  */
+  6, /* FP2GP  */
+  4 /* FP2FP  */
+};
+
+static const advsimd_vec_cost thunderx_advsimd_vector_cost =
+{
+  4, /* int_stmt_cost  */
+  1, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  4, /* permute_cost  */
+  2, /* reduc_i8_cost  */
+  2, /* reduc_i16_cost  */
+  2, /* reduc_i32_cost  */
+  2, /* reduc_i64_cost  */
+  2, /* reduc_f16_cost  */
+  2, /* reduc_f32_cost  */
+  2, /* reduc_f64_cost  */
+  2, /* store_elt_extra_cost  */
+  2, /* vec_to_scalar_cost  */
+  2, /* scalar_to_vec_cost  */
+  3, /* align_load_cost  */
+  5, /* unalign_load_cost  */
+  5, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+/* ThunderX costs for vector insn classes.  */
+static const struct cpu_vector_cost thunderx_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  3, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  3, /* cond_taken_branch_cost  */
+  3, /* cond_not_taken_branch_cost  */
+  &thunderx_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+static const cpu_prefetch_tune thunderx_prefetch_tune =
+{
+  8,			/* num_slots  */
+  32,			/* l1_cache_size  */
+  128,			/* l1_cache_line_size  */
+  -1,			/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params thunderx_tunings =
+{
+  &thunderx_extra_costs,
+  &generic_addrcost_table,
+  &thunderx_regmove_cost,
+  &thunderx_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
+  2, /* issue_rate  */
+  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
+  "8",	/* function_align.  */
+  "8",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
+  &thunderx_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_THUNDERX.  */
diff --git a/gcc/config/aarch64/tuning_models/thunderx2t99.h b/gcc/config/aarch64/tuning_models/thunderx2t99.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a376e0bab37b0b5bc1ea23de0e96a9245846fd7
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/thunderx2t99.h
@@ -0,0 +1,137 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_THUNDERX2T99
+#define GCC_AARCH64_H_THUNDERX2T99
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
+{
+    {
+      1, /* hi  */
+      1, /* si  */
+      1, /* di  */
+      2, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* post_modify_ld3_st3  */
+  0, /* post_modify_ld4_st4  */
+  2, /* register_offset  */
+  3, /* register_sextend  */
+  3, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of int<->fp moves for spilling.  */
+  5, /* GP2FP  */
+  6, /* FP2GP  */
+  3, /* FP2FP  */
+};
+
+static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
+{
+  4, /* int_stmt_cost  */
+  5, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  10, /* permute_cost  */
+  6, /* reduc_i8_cost  */
+  6, /* reduc_i16_cost  */
+  6, /* reduc_i32_cost  */
+  6, /* reduc_i64_cost  */
+  6, /* reduc_f16_cost  */
+  6, /* reduc_f32_cost  */
+  6, /* reduc_f64_cost  */
+  6, /* store_elt_extra_cost  */
+  6, /* vec_to_scalar_cost  */
+  5, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+/* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan).  */
+static const struct cpu_vector_cost thunderx2t99_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  6, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  2, /* cond_taken_branch_cost  */
+  1,  /* cond_not_taken_branch_cost  */
+  &thunderx2t99_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
+{
+  8,			/* num_slots  */
+  32,			/* l1_cache_size  */
+  64,			/* l1_cache_line_size  */
+  256,			/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params thunderx2t99_tunings =
+{
+  &thunderx2t99_extra_costs,
+  &thunderx2t99_addrcost_table,
+  &thunderx2t99_regmove_cost,
+  &thunderx2t99_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate.  */
+  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
+   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "8",	/* jump_align.  */
+  "16",	/* loop_align.  */
+  3,	/* int_reassoc_width.  */
+  2,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &thunderx2t99_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_THUNDERX2T99.  */
diff --git a/gcc/config/aarch64/tuning_models/thunderx3t110.h b/gcc/config/aarch64/tuning_models/thunderx3t110.h
new file mode 100644
index 0000000000000000000000000000000000000000..65203b4af132e12e4994013fbab228bd3873b756
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/thunderx3t110.h
@@ -0,0 +1,136 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_THUNDERX3T110
+#define GCC_AARCH64_H_THUNDERX3T110
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
+{
+    {
+      1, /* hi  */
+      1, /* si  */
+      1, /* di  */
+      2, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* post_modify_ld3_st3  */
+  0, /* post_modify_ld4_st4  */
+  2, /* register_offset  */
+  3, /* register_sextend  */
+  3, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of int<->fp moves for spilling.  */
+  4, /* GP2FP  */
+  5, /* FP2GP  */
+  4  /* FP2FP  */
+};
+
+static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
+{
+  5, /* int_stmt_cost  */
+  5, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  10, /* permute_cost  */
+  5, /* reduc_i8_cost  */
+  5, /* reduc_i16_cost  */
+  5, /* reduc_i32_cost  */
+  5, /* reduc_i64_cost  */
+  5, /* reduc_f16_cost  */
+  5, /* reduc_f32_cost  */
+  5, /* reduc_f64_cost  */
+  5, /* store_elt_extra_cost  */
+  5, /* vec_to_scalar_cost  */
+  5, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  4, /* unalign_store_cost  */
+  4  /* store_cost  */
+};
+
+static const struct cpu_vector_cost thunderx3t110_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  5, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  2, /* cond_taken_branch_cost  */
+  1,  /* cond_not_taken_branch_cost  */
+  &thunderx3t110_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
+{
+  8,			/* num_slots  */
+  32,			/* l1_cache_size  */
+  64,			/* l1_cache_line_size  */
+  256,			/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params thunderx3t110_tunings =
+{
+  &thunderx3t110_extra_costs,
+  &thunderx3t110_addrcost_table,
+  &thunderx3t110_regmove_cost,
+  &thunderx3t110_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  6, /* issue_rate.  */
+  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
+   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "8",	/* jump_align.  */
+  "16",	/* loop_align.  */
+  3,	/* int_reassoc_width.  */
+  2,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &thunderx3t110_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_THUNDERX3T110.  */
diff --git a/gcc/config/aarch64/tuning_models/thunderxt88.h b/gcc/config/aarch64/tuning_models/thunderxt88.h
new file mode 100644
index 0000000000000000000000000000000000000000..dcc74d31484ee6b99d37920dbfe7b1d59377d074
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/thunderxt88.h
@@ -0,0 +1,72 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_THUNDERXT88
+#define GCC_AARCH64_H_THUNDERXT88
+
+#include "generic.h"
+#include "thunderx.h"
+
+static const cpu_prefetch_tune thunderxt88_prefetch_tune =
+{
+  8,			/* num_slots  */
+  32,			/* l1_cache_size  */
+  128,			/* l1_cache_line_size  */
+  16*1024,		/* l2_cache_size  */
+  true,			/* prefetch_dynamic_strides */
+  -1,			/* minimum_stride */
+  3			/* default_opt_level  */
+};
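+
+/* The cpu_prefetch_tune cache sizes are in kilobytes (they set the
+   --param l1-cache-size / l2-cache-size defaults), so 16*1024 above
+   corresponds to ThunderX's 16 MB shared L2.  */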
+
+static const struct tune_params thunderxt88_tunings =
+{
+  &thunderx_extra_costs,
+  &generic_addrcost_table,
+  &thunderx_regmove_cost,
+  &thunderx_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
+  2, /* issue_rate  */
+  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
+  "8",	/* function_align.  */
+  "8",	/* jump_align.  */
+  "8",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &thunderxt88_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_THUNDERXT88.  */
diff --git a/gcc/config/aarch64/tuning_models/tsv110.h b/gcc/config/aarch64/tuning_models/tsv110.h
new file mode 100644
index 0000000000000000000000000000000000000000..42aeafce652fff34e3277194993dd4aa1f0383a1
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/tsv110.h
@@ -0,0 +1,137 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_TSV110
+#define GCC_AARCH64_H_TSV110
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table tsv110_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  0, /* post_modify_ld3_st3  */
+  0, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  1, /* register_sextend  */
+  1, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost tsv110_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  2, /* GP2FP  */
+  3, /* FP2GP  */
+  2  /* FP2FP  */
+};
+
+static const advsimd_vec_cost tsv110_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  2, /* permute_cost  */
+  3, /* reduc_i8_cost  */
+  3, /* reduc_i16_cost  */
+  3, /* reduc_i32_cost  */
+  3, /* reduc_i64_cost  */
+  3, /* reduc_f16_cost  */
+  3, /* reduc_f32_cost  */
+  3, /* reduc_f64_cost  */
+  3, /* store_elt_extra_cost  */
+  3, /* vec_to_scalar_cost  */
+  2, /* scalar_to_vec_cost  */
+  5, /* align_load_cost  */
+  5, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const struct cpu_vector_cost tsv110_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  5, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &tsv110_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+static const cpu_prefetch_tune tsv110_prefetch_tune =
+{
+  0,                    /* num_slots  */
+  64,                   /* l1_cache_size  */
+  64,                   /* l1_cache_line_size  */
+  512,                  /* l2_cache_size  */
+  true,                 /* prefetch_dynamic_strides */
+  -1,                   /* minimum_stride */
+  -1                    /* default_opt_level  */
+};
+
+static const struct tune_params tsv110_tunings =
+{
+  &tsv110_extra_costs,
+  &tsv110_addrcost_table,
+  &tsv110_regmove_cost,
+  &tsv110_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4,    /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
+   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
+  "16", /* function_align.  */
+  "4",  /* jump_align.  */
+  "8",  /* loop_align.  */
+  2,    /* int_reassoc_width.  */
+  4,    /* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,    /* vec_reassoc_width.  */
+  2,    /* min_div_recip_mul_sf.  */
+  2,    /* min_div_recip_mul_df.  */
+  0,    /* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
+  &tsv110_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_TSV110.  */
diff --git a/gcc/config/aarch64/tuning_models/xgene1.h b/gcc/config/aarch64/tuning_models/xgene1.h
new file mode 100644
index 0000000000000000000000000000000000000000..53a3eb0ddeb80a9735cc988e242a70e87dc90655
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/xgene1.h
@@ -0,0 +1,145 @@ 
+/* Tuning model description for AArch64 architecture.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_AARCH64_H_XGENE1
+#define GCC_AARCH64_H_XGENE1
+
+#include "generic.h"
+
+static const struct cpu_addrcost_table xgene1_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  1, /* pre_modify  */
+  1, /* post_modify  */
+  1, /* post_modify_ld3_st3  */
+  1, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  1, /* register_sextend  */
+  1, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost xgene1_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  8, /* GP2FP  */
+  8, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost xgene1_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  2, /* permute_cost  */
+  4, /* reduc_i8_cost  */
+  4, /* reduc_i16_cost  */
+  4, /* reduc_i32_cost  */
+  4, /* reduc_i64_cost  */
+  4, /* reduc_f16_cost  */
+  4, /* reduc_f32_cost  */
+  4, /* reduc_f64_cost  */
+  4, /* store_elt_extra_cost  */
+  4, /* vec_to_scalar_cost  */
+  4, /* scalar_to_vec_cost  */
+  10, /* align_load_cost  */
+  10, /* unalign_load_cost  */
+  2, /* unalign_store_cost  */
+  2  /* store_cost  */
+};
+
+/* X-Gene 1 costs for vector insn classes.  */
+static const struct cpu_vector_cost xgene1_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  5, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  2, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &xgene1_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  nullptr /* issue_info  */
+};
+
+/* Approximation modes for X-Gene 1.  */
+static const cpu_approx_modes xgene1_approx_modes =
+{
+  AARCH64_APPROX_NONE,	/* division  */
+  AARCH64_APPROX_NONE,	/* sqrt  */
+  AARCH64_APPROX_ALL	/* recip_sqrt  */
+};
+
+static const cpu_prefetch_tune xgene1_prefetch_tune =
+{
+  8,			/* num_slots  */
+  32,			/* l1_cache_size  */
+  64,			/* l1_cache_line_size  */
+  256,			/* l2_cache_size  */
+  true,                 /* prefetch_dynamic_strides */
+  -1,                   /* minimum_stride */
+  -1			/* default_opt_level  */
+};
+
+static const struct tune_params xgene1_tunings =
+{
+  &xgene1_extra_costs,
+  &xgene1_addrcost_table,
+  &xgene1_regmove_cost,
+  &xgene1_vector_cost,
+  &generic_branch_cost,
+  &xgene1_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  AARCH64_FUSE_NOTHING, /* fusible_ops  */
+  "16",	/* function_align.  */
+  "16",	/* jump_align.  */
+  "16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  1,	/* fma_reassoc_width.  */
+  1,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  17,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
+  &xgene1_prefetch_tune,
+  AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
+  AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
+};
+
+#endif /* GCC_AARCH64_H_XGENE1.  */