[aarch64] Update reg-costs to differentiate between memmove costs

Message ID e9f205c9-96e7-0825-11d5-d3f312d1984e@arm.com
State New
Series [aarch64] Update reg-costs to differentiate between memmove costs

Commit Message

Andre Vieira (lists) March 16, 2022, 2:56 p.m. UTC
This patch introduces a struct to differentiate between different memmove
costs, enabling more accurate modelling of memory operations.  The costs
have been modelled for
-mcpu/-mtune=neoverse-v1/neoverse-n1/neoverse-n2/neoverse-512tvb; for all
other tunings, all entries are equal to the old single memmove cost, so
the behaviour remains the same.
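
As a quick standalone sketch of what the split buys us (not part of the
patch: the harness below is hypothetical, though the cost values are the
neoverse-v1 numbers from this patch), consider the round-trip cost of a
spill:

  /* Hypothetical harness; only cpu_memmov_cost mirrors the patch.  */
  #include <cstdio>

  struct cpu_memmov_cost
  {
    int load_int;
    int store_int;
    int load_fp;
    int store_fp;
    int load_pred;
    int store_pred;
  };

  /* The neoverse-v1 values from the patch below.  */
  static const cpu_memmov_cost neoversev1_memmov = { 4, 1, 6, 2, 6, 1 };

  /* A spill is a store plus a later reload, so its round-trip cost is
     the sum of the two directions for the register kind involved.  */
  static int
  spill_cost (const cpu_memmov_cost &c, bool is_fp)
  {
    return is_fp ? c.load_fp + c.store_fp : c.load_int + c.store_int;
  }

  int
  main ()
  {
    /* With the old single cost both answers were 4 + 4 = 8; with the
       split costs a GPR spill (1 + 4 = 5) is cheaper than an FP spill
       (2 + 6 = 8), so the allocator can prefer spilling integers.  */
    printf ("int spill: %d, fp spill: %d\n",
            spill_cost (neoversev1_memmov, false),
            spill_cost (neoversev1_memmov, true));
    return 0;
  }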


2022-03-16  Tamar Christina  <tamar.christina@arm.com>
                        Andre Vieira <andre.simoesdiasvieira@arm.com>

gcc/ChangeLog:

         * config/aarch64/aarch64-protos.h (struct cpu_memmov_cost): New
         struct.
         (struct tune_params): Change type of memmov_cost to use
         cpu_memmov_cost.
         * config/aarch64/aarch64.cc (aarch64_memory_move_cost): Update all
         tunings to use the new cpu_memmov_cost struct.

Comments

Richard Sandiford March 16, 2022, 5:10 p.m. UTC | #1
"Andre Vieira (lists)" <andre.simoesdiasvieira@arm.com> writes:
> This patch introduces a struct to differentiate between different memmove
> costs, enabling more accurate modelling of memory operations.  The costs
> have been modelled for
> -mcpu/-mtune=neoverse-v1/neoverse-n1/neoverse-n2/neoverse-512tvb; for all
> other tunings, all entries are equal to the old single memmove cost, so
> the behaviour remains the same.

Thanks for doing this.  Having the same cost for loads and stores
has been a long-standing wart.

> 2022-03-16  Tamar Christina  <tamar.christina@arm.com>
>                         Andre Vieira <andre.simoesdiasvieira@arm.com>
>
> gcc/ChangeLog:
>
>          * config/aarch64/aarch64-protos.h (struct cpu_memmov_cost): New
>          struct.
>          (struct tune_params): Change type of memmov_cost to use
>          cpu_memmov_cost.
>          * config/aarch64/aarch64.cc (aarch64_memory_move_cost): Update all
>          tunings to use the new cpu_memmov_cost struct.
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index f2fde35c6eb4989af8736db8fad004171c160282..5190eb8b96ea9af809a28470905b8b85ee720b09 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -508,6 +508,18 @@ struct cpu_prefetch_tune
>    const int default_opt_level;
>  };
>  
> +/* Model the costs for loads/stores for reload so that it can do more

I'd say s/reload/the register allocators/ here, since the costs affect
decisions made by IRA too.

> +   accurate spill heuristics.  */
> +struct cpu_memmov_cost
> +{
> +  int load_int;
> +  int store_int;
> +  int load_fp;
> +  int store_fp;
> +  int load_pred;
> +  int store_pred;
> +};
> +
>  struct tune_params
>  {
>    const struct cpu_cost_table *insn_extra_cost;
> […]
> @@ -14501,12 +14633,41 @@ aarch64_register_move_cost (machine_mode mode,
>    return regmove_cost->FP2FP;
>  }
>  
> +/* Implements TARGET_MEMORY_MOVE_COST.  */
>  static int
> -aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
> -			  reg_class_t rclass ATTRIBUTE_UNUSED,
> -			  bool in ATTRIBUTE_UNUSED)
> +aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
>  {
> -  return aarch64_tune_params.memmov_cost;
> +  enum reg_class rclass = (enum reg_class) rclass_i;
> +  switch (rclass)
> +    {
> +    case PR_LO_REGS:
> +    case PR_HI_REGS:
> +    case PR_REGS:
> +      return in ? aarch64_tune_params.memmov_cost.load_pred
> +		: aarch64_tune_params.memmov_cost.store_pred;
> +    case POINTER_AND_FP_REGS:
> +    case ALL_REGS:
> +      {
> +	if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
> +	  return in ? aarch64_tune_params.memmov_cost.load_pred
> +		    : aarch64_tune_params.memmov_cost.store_pred;
> +
> +	if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode))
> +	  return in ? aarch64_tune_params.memmov_cost.load_fp
> +		    : aarch64_tune_params.memmov_cost.store_fp;
> +
> +	return in ? aarch64_tune_params.memmov_cost.load_int
> +		  : aarch64_tune_params.memmov_cost.store_int;
> +      }
> +    case FP_LO8_REGS:
> +    case FP_LO_REGS:
> +    case FP_REGS:
> +      return in ? aarch64_tune_params.memmov_cost.load_fp
> +		: aarch64_tune_params.memmov_cost.store_fp;
> +    default:
> +      return in ? aarch64_tune_params.memmov_cost.load_int
> +		: aarch64_tune_params.memmov_cost.store_int;
> +    }
>  }

It would be good to avoid listing individual subclasses if possible,
since it's easy for the list to get out of date if more subclasses
are added.

An alternative would be:

  if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
      ? reg_classes_intersect_p (rclass, PR_REGS)
      : reg_class_subset_p (rclass, PR_REGS))
    return (in
	    ? aarch64_tune_params.memmov_cost.load_pred
	    : aarch64_tune_params.memmov_cost.store_pred);

  if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
      ? reg_classes_intersect_p (rclass, FP_REGS)
      : reg_class_subset_p (rclass, FP_REGS))
    return (in
	    ? aarch64_tune_params.memmov_cost.load_fp
	    : aarch64_tune_params.memmov_cost.store_fp);

  return (in
	  ? aarch64_tune_params.memmov_cost.load_int
	  : aarch64_tune_params.memmov_cost.store_int);

OK with that change, if it works.

Thanks,
Richard
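
To make the subset/intersect distinction in Richard's suggestion concrete,
here is a minimal toy sketch (reg_class_subset_p and reg_classes_intersect_p
are the real GCC helpers; the bitmask classes and harness here are
hypothetical):

  #include <cstdio>

  /* Model register classes as bitmasks; only the subset/intersect
     semantics matter for this illustration.  */
  typedef unsigned int reg_mask;

  static bool
  subset_p (reg_mask a, reg_mask b)
  {
    return (a & ~b) == 0;
  }

  static bool
  intersect_p (reg_mask a, reg_mask b)
  {
    return (a & b) != 0;
  }

  int
  main ()
  {
    const reg_mask gp = 1, fp = 2, pr = 4;
    const reg_mask all_regs = gp | fp | pr;  /* like ALL_REGS.  */
    const reg_mask pr_regs = pr;             /* like PR_REGS.  */

    /* A predicate mode can only live in a predicate register, so any
       class that merely intersects PR_REGS will end up allocated
       there; ALL_REGS is therefore charged the predicate costs.  */
    printf ("MODE_VECTOR_BOOL in ALL_REGS -> pred costs? %d\n",
            intersect_p (all_regs, pr_regs));

    /* For other modes the predicate costs only apply if the class is
       wholly within PR_REGS; ALL_REGS is not, so the FP/integer tests
       decide instead.  */
    printf ("SImode in ALL_REGS -> pred costs? %d\n",
            subset_p (all_regs, pr_regs));
    return 0;
  }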

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index f2fde35c6eb4989af8736db8fad004171c160282..5190eb8b96ea9af809a28470905b8b85ee720b09 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -508,6 +508,18 @@  struct cpu_prefetch_tune
   const int default_opt_level;
 };
 
+/* Model the costs for loads/stores for reload so that it can do more
+   accurate spill heuristics.  */
+struct cpu_memmov_cost
+{
+  int load_int;
+  int store_int;
+  int load_fp;
+  int store_fp;
+  int load_pred;
+  int store_pred;
+};
+
 struct tune_params
 {
   const struct cpu_cost_table *insn_extra_cost;
@@ -520,7 +532,8 @@  struct tune_params
      or SVE_NOT_IMPLEMENTED if not applicable.  Only used for tuning
      decisions, does not disable VLA vectorization.  */
   unsigned int sve_width;
-  int memmov_cost;
+  /* Structure used by reload to cost spills.  */
+  struct cpu_memmov_cost memmov_cost;
   int issue_rate;
   unsigned int fusible_ops;
   const char *function_align;
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9a94f3a30b0f1acc3c9b8a0e3d703e60780d0cbc..3fc5e0bd3d3f39f99b0c8ffb9357603bc0998515 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1291,7 +1291,13 @@  static const struct tune_params generic_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   2, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
   "16:12",	/* function_align.  */
@@ -1320,7 +1326,13 @@  static const struct tune_params cortexa35_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   1, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
@@ -1347,7 +1359,13 @@  static const struct tune_params cortexa53_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   2, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
@@ -1374,7 +1392,13 @@  static const struct tune_params cortexa57_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
@@ -1401,7 +1425,13 @@  static const struct tune_params cortexa72_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
@@ -1428,7 +1458,13 @@  static const struct tune_params cortexa73_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost.  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   2, /* issue_rate.  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
@@ -1457,7 +1493,13 @@  static const struct tune_params exynosm1_tunings =
   &generic_branch_cost,
   &exynosm1_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4,	/* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   3,	/* issue_rate  */
   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
   "4",	/* function_align.  */
@@ -1483,7 +1525,13 @@  static const struct tune_params thunderxt88_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  6, /* memmov_cost  */
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
   2, /* issue_rate  */
   AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
   "8",	/* function_align.  */
@@ -1509,7 +1557,13 @@  static const struct tune_params thunderx_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  6, /* memmov_cost  */
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
   2, /* issue_rate  */
   AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
   "8",	/* function_align.  */
@@ -1536,7 +1590,13 @@  static const struct tune_params tsv110_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4,    /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   4,    /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
@@ -1563,7 +1623,13 @@  static const struct tune_params xgene1_tunings =
   &generic_branch_cost,
   &xgene1_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  6, /* memmov_cost  */
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
   4, /* issue_rate  */
   AARCH64_FUSE_NOTHING, /* fusible_ops  */
   "16",	/* function_align.  */
@@ -1589,7 +1655,13 @@  static const struct tune_params emag_tunings =
   &generic_branch_cost,
   &xgene1_approx_modes,
   SVE_NOT_IMPLEMENTED,
-  6, /* memmov_cost  */
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
   4, /* issue_rate  */
   AARCH64_FUSE_NOTHING, /* fusible_ops  */
   "16",	/* function_align.  */
@@ -1615,7 +1687,13 @@  static const struct tune_params qdf24xx_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   4, /* issue_rate  */
   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
@@ -1644,7 +1722,13 @@  static const struct tune_params saphira_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   4, /* issue_rate  */
   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
@@ -1671,7 +1755,13 @@  static const struct tune_params thunderx2t99_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost.  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   4, /* issue_rate.  */
   (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
@@ -1698,7 +1788,13 @@  static const struct tune_params thunderx3t110_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost.  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   6, /* issue_rate.  */
   (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
@@ -1725,7 +1821,13 @@  static const struct tune_params neoversen1_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    1, /* store_int.  */
+    5, /* load_fp.  */
+    2, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
   "32:16",	/* function_align.  */
@@ -1751,7 +1853,13 @@  static const struct tune_params ampere1_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   4, /* issue_rate  */
   (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
    AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
@@ -1926,7 +2034,13 @@  static const struct tune_params neoversev1_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_256, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    1, /* store_int.  */
+    6, /* load_fp.  */
+    2, /* store_fp.  */
+    6, /* load_pred.  */
+    1 /* store_pred.  */
+  }, /* memmov_cost.  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
   "32:16",	/* function_align.  */
@@ -2057,7 +2171,13 @@  static const struct tune_params neoverse512tvb_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_128 | SVE_256, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    1, /* store_int.  */
+    6, /* load_fp.  */
+    2, /* store_fp.  */
+    6, /* load_pred.  */
+    1 /* store_pred.  */
+  }, /* memmov_cost.  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
   "32:16",	/* function_align.  */
@@ -2085,7 +2205,13 @@  static const struct tune_params neoversen2_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_128, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    1, /* store_int.  */
+    6, /* load_fp.  */
+    2, /* store_fp.  */
+    6, /* load_pred.  */
+    1 /* store_pred.  */
+  }, /* memmov_cost.  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
   "32:16",	/* function_align.  */
@@ -2111,7 +2237,13 @@  static const struct tune_params a64fx_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_512, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   7, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
   "32",	/* function_align.  */
@@ -14501,12 +14633,41 @@  aarch64_register_move_cost (machine_mode mode,
   return regmove_cost->FP2FP;
 }
 
+/* Implements TARGET_MEMORY_MOVE_COST.  */
 static int
-aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
-			  reg_class_t rclass ATTRIBUTE_UNUSED,
-			  bool in ATTRIBUTE_UNUSED)
+aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
 {
-  return aarch64_tune_params.memmov_cost;
+  enum reg_class rclass = (enum reg_class) rclass_i;
+  switch (rclass)
+    {
+    case PR_LO_REGS:
+    case PR_HI_REGS:
+    case PR_REGS:
+      return in ? aarch64_tune_params.memmov_cost.load_pred
+		: aarch64_tune_params.memmov_cost.store_pred;
+    case POINTER_AND_FP_REGS:
+    case ALL_REGS:
+      {
+	if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
+	  return in ? aarch64_tune_params.memmov_cost.load_pred
+		    : aarch64_tune_params.memmov_cost.store_pred;
+
+	if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode))
+	  return in ? aarch64_tune_params.memmov_cost.load_fp
+		    : aarch64_tune_params.memmov_cost.store_fp;
+
+	return in ? aarch64_tune_params.memmov_cost.load_int
+		  : aarch64_tune_params.memmov_cost.store_int;
+      }
+    case FP_LO8_REGS:
+    case FP_LO_REGS:
+    case FP_REGS:
+      return in ? aarch64_tune_params.memmov_cost.load_fp
+		: aarch64_tune_params.memmov_cost.store_fp;
+    default:
+      return in ? aarch64_tune_params.memmov_cost.load_int
+		: aarch64_tune_params.memmov_cost.store_int;
+    }
 }
 
 /* Implement TARGET_INIT_BUILTINS.  */