[aarch64] Update reg-costs to differentiate between memmove costs

Message ID e9f205c9-96e7-0825-11d5-d3f312d1984e@arm.com
State New
Series [aarch64] Update reg-costs to differentiate between memmove costs

Commit Message

Andre Vieira (lists) March 16, 2022, 2:56 p.m. UTC
This patch introduces a struct to differentiate between different memmove
costs, enabling more accurate modelling of memory operations.  The costs
have been modelled for
-mcpu/-mtune=neoverse-v1/neoverse-n1/neoverse-n2/neoverse-512tvb; for all
other tunings, all entries are equal to the old single memmove cost, so
the behaviour remains the same.
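
As a quick standalone sketch of what the split buys us (not part of the
patch: the harness below is hypothetical, though the cost values are the
neoverse-v1 numbers from this patch), consider the round-trip cost of a
spill:

  /* Hypothetical harness; only cpu_memmov_cost mirrors the patch.  */
  #include <cstdio>

  struct cpu_memmov_cost
  {
    int load_int;
    int store_int;
    int load_fp;
    int store_fp;
    int load_pred;
    int store_pred;
  };

  /* The neoverse-v1 values from the patch below.  */
  static const cpu_memmov_cost neoversev1_memmov = { 4, 1, 6, 2, 6, 1 };

  /* A spill is a store plus a later reload, so its round-trip cost is
     the sum of the two directions for the register kind involved.  */
  static int
  spill_cost (const cpu_memmov_cost &c, bool is_fp)
  {
    return is_fp ? c.load_fp + c.store_fp : c.load_int + c.store_int;
  }

  int
  main ()
  {
    /* With the old single cost both answers were 4 + 4 = 8; with the
       split costs a GPR spill (1 + 4 = 5) is cheaper than an FP spill
       (2 + 6 = 8), so the allocator can prefer spilling integers.  */
    printf ("int spill: %d, fp spill: %d\n",
            spill_cost (neoversev1_memmov, false),
            spill_cost (neoversev1_memmov, true));
    return 0;
  }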


2022-03-16  Tamar Christina  <tamar.christina@arm.com>
                        Andre Vieira <andre.simoesdiasvieira@arm.com>

gcc/ChangeLog:

         * config/aarch64/aarch64-protos.h (struct cpu_memmov_cost): New
         struct.
         (struct tune_params): Change type of memmov_cost to use
         cpu_memmov_cost.
         * config/aarch64/aarch64.cc (aarch64_memory_move_cost): Update all
         tunings to use the new cpu_memmov_cost struct.

Comments

Richard Sandiford March 16, 2022, 5:10 p.m. UTC | #1
"Andre Vieira (lists)" <andre.simoesdiasvieira@arm.com> writes:
> This patch introduces a struct to differentiate between different memmove
> costs, enabling more accurate modelling of memory operations.  The costs
> have been modelled for
> -mcpu/-mtune=neoverse-v1/neoverse-n1/neoverse-n2/neoverse-512tvb; for all
> other tunings, all entries are equal to the old single memmove cost, so
> the behaviour remains the same.

Thanks for doing this.  Having the same cost for loads and stores
has been a long-standing wart.

> 2022-03-16  Tamar Christina  <tamar.christina@arm.com>
>                         Andre Vieira <andre.simoesdiasvieira@arm.com>
>
> gcc/ChangeLog:
>
>          * config/aarch64/aarch64-protos.h (struct cpu_memmov_cost): New
>          struct.
>          (struct tune_params): Change type of memmov_cost to use
>          cpu_memmov_cost.
>          * config/aarch64/aarch64.cc (aarch64_memory_move_cost): Update all
>          tunings to use the new cpu_memmov_cost struct.
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index f2fde35c6eb4989af8736db8fad004171c160282..5190eb8b96ea9af809a28470905b8b85ee720b09 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -508,6 +508,18 @@ struct cpu_prefetch_tune
>    const int default_opt_level;
>  };
>  
> +/* Model the costs for loads/stores for reload so that it can do more

I'd say s/reload/the register allocators/ here, since the costs affect
decisions made by IRA too.

> +   accurate spill heuristics.  */
> +struct cpu_memmov_cost
> +{
> +  int load_int;
> +  int store_int;
> +  int load_fp;
> +  int store_fp;
> +  int load_pred;
> +  int store_pred;
> +};
> +
>  struct tune_params
>  {
>    const struct cpu_cost_table *insn_extra_cost;
> […]
> @@ -14501,12 +14633,41 @@ aarch64_register_move_cost (machine_mode mode,
>    return regmove_cost->FP2FP;
>  }
>  
> +/* Implements TARGET_MEMORY_MOVE_COST.  */
>  static int
> -aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
> -			  reg_class_t rclass ATTRIBUTE_UNUSED,
> -			  bool in ATTRIBUTE_UNUSED)
> +aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
>  {
> -  return aarch64_tune_params.memmov_cost;
> +  enum reg_class rclass = (enum reg_class) rclass_i;
> +  switch (rclass)
> +    {
> +    case PR_LO_REGS:
> +    case PR_HI_REGS:
> +    case PR_REGS:
> +      return in ? aarch64_tune_params.memmov_cost.load_pred
> +		: aarch64_tune_params.memmov_cost.store_pred;
> +    case POINTER_AND_FP_REGS:
> +    case ALL_REGS:
> +      {
> +	if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
> +	  return in ? aarch64_tune_params.memmov_cost.load_pred
> +		    : aarch64_tune_params.memmov_cost.store_pred;
> +
> +	if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode))
> +	  return in ? aarch64_tune_params.memmov_cost.load_fp
> +		    : aarch64_tune_params.memmov_cost.store_fp;
> +
> +	return in ? aarch64_tune_params.memmov_cost.load_int
> +		  : aarch64_tune_params.memmov_cost.store_int;
> +      }
> +    case FP_LO8_REGS:
> +    case FP_LO_REGS:
> +    case FP_REGS:
> +      return in ? aarch64_tune_params.memmov_cost.load_fp
> +		: aarch64_tune_params.memmov_cost.store_fp;
> +    default:
> +      return in ? aarch64_tune_params.memmov_cost.load_int
> +		: aarch64_tune_params.memmov_cost.store_int;
> +    }
>  }

It would be good to avoid listing individual subclasses if possible,
since it's easy for the list to get out of date if more subclasses
are added.

An alternative would be:

  if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
      ? reg_classes_intersect_p (rclass, PR_REGS)
      : reg_class_subset_p (rclass, PR_REGS))
    return (in
	    ? aarch64_tune_params.memmov_cost.load_pred
	    : aarch64_tune_params.memmov_cost.store_pred);

  if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
      ? reg_classes_intersect_p (rclass, FP_REGS)
      : reg_class_subset_p (rclass, FP_REGS))
    return (in
	    ? aarch64_tune_params.memmov_cost.load_fp
	    : aarch64_tune_params.memmov_cost.store_fp);

  return (in
	  ? aarch64_tune_params.memmov_cost.load_int
	  : aarch64_tune_params.memmov_cost.store_int);

OK with that change, if it works.

Thanks,
Richard
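
To make the subset/intersect distinction in Richard's suggestion concrete,
here is a minimal toy sketch (reg_class_subset_p and reg_classes_intersect_p
are the real GCC helpers; the bitmask classes and harness here are
hypothetical):

  #include <cstdio>

  /* Model register classes as bitmasks; only the subset/intersect
     semantics matter for this illustration.  */
  typedef unsigned int reg_mask;

  static bool
  subset_p (reg_mask a, reg_mask b)
  {
    return (a & ~b) == 0;
  }

  static bool
  intersect_p (reg_mask a, reg_mask b)
  {
    return (a & b) != 0;
  }

  int
  main ()
  {
    const reg_mask gp = 1, fp = 2, pr = 4;
    const reg_mask all_regs = gp | fp | pr;  /* like ALL_REGS.  */
    const reg_mask pr_regs = pr;             /* like PR_REGS.  */

    /* A predicate mode can only live in a predicate register, so any
       class that merely intersects PR_REGS will end up allocated
       there; ALL_REGS is therefore charged the predicate costs.  */
    printf ("MODE_VECTOR_BOOL in ALL_REGS -> pred costs? %d\n",
            intersect_p (all_regs, pr_regs));

    /* For other modes the predicate costs only apply if the class is
       wholly within PR_REGS; ALL_REGS is not, so the FP/integer tests
       decide instead.  */
    printf ("SImode in ALL_REGS -> pred costs? %d\n",
            subset_p (all_regs, pr_regs));
    return 0;
  }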

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index f2fde35c6eb4989af8736db8fad004171c160282..5190eb8b96ea9af809a28470905b8b85ee720b09 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -508,6 +508,18 @@  struct cpu_prefetch_tune
   const int default_opt_level;
 };
 
+/* Model the costs for loads/stores for reload so that it can do more
+   accurate spill heuristics.  */
+struct cpu_memmov_cost
+{
+  int load_int;
+  int store_int;
+  int load_fp;
+  int store_fp;
+  int load_pred;
+  int store_pred;
+};
+
 struct tune_params
 {
   const struct cpu_cost_table *insn_extra_cost;
@@ -520,7 +532,8 @@  struct tune_params
      or SVE_NOT_IMPLEMENTED if not applicable.  Only used for tuning
      decisions, does not disable VLA vectorization.  */
   unsigned int sve_width;
-  int memmov_cost;
+  /* Structure used by reload to cost spills.  */
+  struct cpu_memmov_cost memmov_cost;
   int issue_rate;
   unsigned int fusible_ops;
   const char *function_align;
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9a94f3a30b0f1acc3c9b8a0e3d703e60780d0cbc..3fc5e0bd3d3f39f99b0c8ffb9357603bc0998515 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1291,7 +1291,13 @@  static const struct tune_params generic_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   2, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
   "16:12",	/* function_align.  */
@@ -1320,7 +1326,13 @@  static const struct tune_params cortexa35_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   1, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
@@ -1347,7 +1359,13 @@  static const struct tune_params cortexa53_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   2, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
@@ -1374,7 +1392,13 @@  static const struct tune_params cortexa57_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
@@ -1401,7 +1425,13 @@  static const struct tune_params cortexa72_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
@@ -1428,7 +1458,13 @@  static const struct tune_params cortexa73_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost.  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   2, /* issue_rate.  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
@@ -1457,7 +1493,13 @@  static const struct tune_params exynosm1_tunings =
   &generic_branch_cost,
   &exynosm1_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4,	/* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   3,	/* issue_rate  */
   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
   "4",	/* function_align.  */
@@ -1483,7 +1525,13 @@  static const struct tune_params thunderxt88_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  6, /* memmov_cost  */
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
   2, /* issue_rate  */
   AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
   "8",	/* function_align.  */
@@ -1509,7 +1557,13 @@  static const struct tune_params thunderx_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  6, /* memmov_cost  */
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
   2, /* issue_rate  */
   AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
   "8",	/* function_align.  */
@@ -1536,7 +1590,13 @@  static const struct tune_params tsv110_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4,    /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   4,    /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
@@ -1563,7 +1623,13 @@  static const struct tune_params xgene1_tunings =
   &generic_branch_cost,
   &xgene1_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  6, /* memmov_cost  */
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
   4, /* issue_rate  */
   AARCH64_FUSE_NOTHING, /* fusible_ops  */
   "16",	/* function_align.  */
@@ -1589,7 +1655,13 @@  static const struct tune_params emag_tunings =
   &generic_branch_cost,
   &xgene1_approx_modes,
   SVE_NOT_IMPLEMENTED,
-  6, /* memmov_cost  */
+  { 6, /* load_int.  */
+    6, /* store_int.  */
+    6, /* load_fp.  */
+    6, /* store_fp.  */
+    6, /* load_pred.  */
+    6 /* store_pred.  */
+  }, /* memmov_cost.  */
   4, /* issue_rate  */
   AARCH64_FUSE_NOTHING, /* fusible_ops  */
   "16",	/* function_align.  */
@@ -1615,7 +1687,13 @@  static const struct tune_params qdf24xx_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   4, /* issue_rate  */
   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
@@ -1644,7 +1722,13 @@  static const struct tune_params saphira_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   4, /* issue_rate  */
   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
@@ -1671,7 +1755,13 @@  static const struct tune_params thunderx2t99_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost.  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   4, /* issue_rate.  */
   (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
@@ -1698,7 +1788,13 @@  static const struct tune_params thunderx3t110_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost.  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   6, /* issue_rate.  */
   (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
@@ -1725,7 +1821,13 @@  static const struct tune_params neoversen1_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    1, /* store_int.  */
+    5, /* load_fp.  */
+    2, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
   "32:16",	/* function_align.  */
@@ -1751,7 +1853,13 @@  static const struct tune_params ampere1_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_NOT_IMPLEMENTED, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   4, /* issue_rate  */
   (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
    AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
@@ -1926,7 +2034,13 @@  static const struct tune_params neoversev1_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_256, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    1, /* store_int.  */
+    6, /* load_fp.  */
+    2, /* store_fp.  */
+    6, /* load_pred.  */
+    1 /* store_pred.  */
+  }, /* memmov_cost.  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
   "32:16",	/* function_align.  */
@@ -2057,7 +2171,13 @@  static const struct tune_params neoverse512tvb_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_128 | SVE_256, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    1, /* store_int.  */
+    6, /* load_fp.  */
+    2, /* store_fp.  */
+    6, /* load_pred.  */
+    1 /* store_pred.  */
+  }, /* memmov_cost.  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
   "32:16",	/* function_align.  */
@@ -2085,7 +2205,13 @@  static const struct tune_params neoversen2_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_128, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    1, /* store_int.  */
+    6, /* load_fp.  */
+    2, /* store_fp.  */
+    6, /* load_pred.  */
+    1 /* store_pred.  */
+  }, /* memmov_cost.  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
   "32:16",	/* function_align.  */
@@ -2111,7 +2237,13 @@  static const struct tune_params a64fx_tunings =
   &generic_branch_cost,
   &generic_approx_modes,
   SVE_512, /* sve_width  */
-  4, /* memmov_cost  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
   7, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
   "32",	/* function_align.  */
@@ -14501,12 +14633,41 @@  aarch64_register_move_cost (machine_mode mode,
   return regmove_cost->FP2FP;
 }
 
+/* Implements TARGET_MEMORY_MOVE_COST.  */
 static int
-aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
-			  reg_class_t rclass ATTRIBUTE_UNUSED,
-			  bool in ATTRIBUTE_UNUSED)
+aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
 {
-  return aarch64_tune_params.memmov_cost;
+  enum reg_class rclass = (enum reg_class) rclass_i;
+  switch (rclass)
+    {
+    case PR_LO_REGS:
+    case PR_HI_REGS:
+    case PR_REGS:
+      return in ? aarch64_tune_params.memmov_cost.load_pred
+		: aarch64_tune_params.memmov_cost.store_pred;
+    case POINTER_AND_FP_REGS:
+    case ALL_REGS:
+      {
+	if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
+	  return in ? aarch64_tune_params.memmov_cost.load_pred
+		    : aarch64_tune_params.memmov_cost.store_pred;
+
+	if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode))
+	  return in ? aarch64_tune_params.memmov_cost.load_fp
+		    : aarch64_tune_params.memmov_cost.store_fp;
+
+	return in ? aarch64_tune_params.memmov_cost.load_int
+		  : aarch64_tune_params.memmov_cost.store_int;
+      }
+    case FP_LO8_REGS:
+    case FP_LO_REGS:
+    case FP_REGS:
+      return in ? aarch64_tune_params.memmov_cost.load_fp
+		: aarch64_tune_params.memmov_cost.store_fp;
+    default:
+      return in ? aarch64_tune_params.memmov_cost.load_int
+		: aarch64_tune_params.memmov_cost.store_int;
+    }
 }
 
 /* Implement TARGET_INIT_BUILTINS.  */