diff mbox series

AArch64: Remove AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS

Message ID PAWPR08MB89827AE96FB255C52AC6355B837D2@PAWPR08MB8982.eurprd08.prod.outlook.com
State New
Headers show
Series AArch64: Remove AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS | expand

Commit Message

Wilco Dijkstra Jan. 30, 2024, 3:37 p.m. UTC
(follow-on based on review comments on
https://gcc.gnu.org/pipermail/gcc-patches/2024-January/641913.html)


Remove the tune AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS since it is only
used by an old core and doesn't properly support -Os.  SPECINT_2017
shows that removing it has no performance difference, while codesize
is reduced by 0.07%.

Passes regress, OK for commit?

gcc/ChangeLog:
        * config/aarch64/aarch64.cc (aarch64_mode_valid_for_sched_fusion_p):
        Remove check for AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS.
        (aarch64_advsimd_ldp_stp_p): Likewise.
        (aarch64_stp_sequence_cost): Likewise.
        (aarch64_expand_cpymem): Likewise.
        (aarch64_expand_setmem): Likewise.
        * config/aarch64/aarch64-ldp-fusion.cc (ldp_operand_mode_ok_p): Likewise.	
        * config/aarch64/aarch64-ldpstp.md: Likewise.
        * config/aarch64/aarch64-tuning-flags.def: Remove NO_LDP_STP_QREGS.
        * config/aarch64/tuning_models/emag.h: Likewise.
        * config/aarch64/tuning_models/xgene1.h: Likewise.

gcc/testsuite/ChangeLog:
        * gcc.target/aarch64/ldp_stp_q_disable.c: Remove test.

---

Comments

Richard Sandiford Feb. 1, 2024, 5:26 p.m. UTC | #1
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> (follow-on based on review comments on
> https://gcc.gnu.org/pipermail/gcc-patches/2024-January/641913.html)
>
>
> Remove the tune AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS since it is only
> used by an old core and doesn't properly support -Os.  SPECINT_2017
> shows that removing it has no performance difference, while codesize
> is reduced by 0.07%.
>
> Passes regress, OK for commit?
>
> gcc/ChangeLog:
>         * config/aarch64/aarch64.cc (aarch64_mode_valid_for_sched_fusion_p):
>         Remove check for AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS.
>         (aarch64_advsimd_ldp_stp_p): Likewise.
>         (aarch64_stp_sequence_cost): Likewise.
>         (aarch64_expand_cpymem): Likewise.
>         (aarch64_expand_setmem): Likewise.
>         * config/aarch64/aarch64-ldp-fusion.cc (ldp_operand_mode_ok_p): Likewise.
>         * config/aarch64/aarch64-ldpstp.md: Likewise.
>         * config/aarch64/aarch64-tuning-flags.def: Remove NO_LDP_STP_QREGS.
>         * config/aarch64/tuning_models/emag.h: Likewise.
>         * config/aarch64/tuning_models/xgene1.h: Likewise.
>
> gcc/testsuite/ChangeLog:
>         * gcc.target/aarch64/ldp_stp_q_disable.c: Remove test.

OK for GCC 15 (but only once GCC 14 has branched) if no-one objects in
the meantime.

Thanks,
Richard

> ---
>
> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> index 22ed95eb743c9ee44e745560b207d389c8fca03b..de6685f75a2650d9a7d39fe6781ec57214092eb1 100644
> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> @@ -315,17 +315,9 @@ any_post_modify_p (rtx x)
>  static bool
>  ldp_operand_mode_ok_p (machine_mode mode)
>  {
> -  const bool allow_qregs
> -    = !(aarch64_tune_params.extra_tuning_flags
> -       & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
> -
>    if (!aarch64_ldpstp_operand_mode_p (mode))
>      return false;
>
> -  const auto size = GET_MODE_SIZE (mode).to_constant ();
> -  if (size == 16 && !allow_qregs)
> -    return false;
> -
>    // We don't pair up TImode accesses before RA because TImode is
>    // special in that it can be allocated to a pair of GPRs or a single
>    // FPR, and the RA is best placed to make that decision.
> diff --git a/gcc/config/aarch64/aarch64-ldpstp.md b/gcc/config/aarch64/aarch64-ldpstp.md
> index b7c0bf05cd18c971955d667bae91d7c3dc3f512e..7890a8cc32b24f8e1bc29cb722b10e511e7881ab 100644
> --- a/gcc/config/aarch64/aarch64-ldpstp.md
> +++ b/gcc/config/aarch64/aarch64-ldpstp.md
> @@ -96,9 +96,7 @@ (define_peephole2
>     (set (match_operand:VQ2 2 "register_operand" "")
>         (match_operand:VQ2 3 "memory_operand" ""))]
>    "TARGET_FLOAT
> -   && aarch64_operands_ok_for_ldpstp (operands, true)
> -   && (aarch64_tune_params.extra_tuning_flags
> -       & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
> +   && aarch64_operands_ok_for_ldpstp (operands, true)"
>    [(const_int 0)]
>  {
>    aarch64_finish_ldpstp_peephole (operands, true);
> @@ -111,9 +109,7 @@ (define_peephole2
>     (set (match_operand:VQ2 2 "memory_operand" "")
>         (match_operand:VQ2 3 "register_operand" ""))]
>    "TARGET_FLOAT
> -   && aarch64_operands_ok_for_ldpstp (operands, false)
> -   && (aarch64_tune_params.extra_tuning_flags
> -       & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
> +   && aarch64_operands_ok_for_ldpstp (operands, false)"
>    [(const_int 0)]
>  {
>    aarch64_finish_ldpstp_peephole (operands, false);
> diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
> index d917da720b22ed6aaf360dc4ebbe8efc4a3185f2..d5bcaebce770f0b217aac783063d39135f754c77 100644
> --- a/gcc/config/aarch64/aarch64-tuning-flags.def
> +++ b/gcc/config/aarch64/aarch64-tuning-flags.def
> @@ -36,9 +36,6 @@ AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
>     are not considered cheap.  */
>  AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
>
> -/* Disallow load/store pair instructions on Q-registers.  */
> -AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS)
> -
>  AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
>
>  AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 433c160cba22374f6b7a3445c0202789927abd25..d7e8379b2eb90eccb8608a15cc8d11cc2187a9e7 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -10335,9 +10335,7 @@ aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
>          || mode == SDmode || mode == DDmode
>          || (aarch64_vector_mode_supported_p (mode)
>              && (known_eq (GET_MODE_SIZE (mode), 8)
> -                || (known_eq (GET_MODE_SIZE (mode), 16)
> -                   && (aarch64_tune_params.extra_tuning_flags
> -                       & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
> +                || known_eq (GET_MODE_SIZE (mode), 16)));
>  }
>
>  /* Return true if REGNO is a virtual pointer register, or an eliminable
> @@ -16448,10 +16446,6 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
>        return false;
>      }
>
> -  if (aarch64_tune_params.extra_tuning_flags
> -      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> -    return false;
> -
>    return is_gimple_assign (stmt_info->stmt);
>  }
>
> @@ -17099,9 +17093,6 @@ aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
>        /* Count 1 insn per vector if we can't form STP Q pairs.  */
>        if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
>         return count * 2;
> -      if (aarch64_tune_params.extra_tuning_flags
> -         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> -       return count * 2;
>
>        if (stmt_info)
>         {
> @@ -26454,11 +26445,9 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>      return aarch64_expand_cpymem_mops (operands, is_memmove);
>
>    unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
> -  bool use_ldpq = TARGET_SIMD && !(aarch64_tune_params.extra_tuning_flags
> -                                  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
>
>    /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
> -  unsigned max_copy_size = use_ldpq ? 256 : 128;
> +  unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
>    unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
>                                        : aarch64_mops_memcpy_size_threshold;
>
> @@ -26470,16 +26459,13 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>    if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
>      return aarch64_expand_cpymem_mops (operands, is_memmove);
>
> -  unsigned copy_max = 32;
> -
> -  /* Default to 32-byte LDP/STP on large copies, however small copies, no SIMD
> -     support or slow LDP/STP fall back to 16-byte chunks.
> +  /* Default to 32-byte LDP/STP on large copies, however small copies or
> +     no SIMD support fall back to 16-byte chunks.
>
>       ??? Although it would be possible to use LDP/STP Qn in streaming mode
>       (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
>       whether that would improve performance.  */
> -  if (size <= 24 || !use_ldpq)
> -    copy_max = 16;
> +  unsigned copy_max = (size <= 24 || !TARGET_SIMD) ? 16 : 32;
>
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> @@ -26612,13 +26598,8 @@ aarch64_expand_setmem (rtx *operands)
>    src = expand_vector_broadcast (V16QImode, val);
>    src = force_reg (V16QImode, src);
>
> -  /* Set maximum amount to write in one go.  We allow 32-byte chunks based
> -     on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
> -  unsigned set_max = 32;
> -
> -  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
> -                   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> -    set_max = 16;
> +  /* Set maximum number of bytes to write per instruction.  */
> +  unsigned set_max = (len <= 24) ? 16 : 32;
>
>    int offset = 0;
>    while (len > 0)
> diff --git a/gcc/config/aarch64/tuning_models/emag.h b/gcc/config/aarch64/tuning_models/emag.h
> index cbaf8853ec4f040cab6dc82e62150d99c30ad1df..b6a9c9e2eb17e9ead91e70088a5ece0c74672e70 100644
> --- a/gcc/config/aarch64/tuning_models/emag.h
> +++ b/gcc/config/aarch64/tuning_models/emag.h
> @@ -51,7 +51,7 @@ static const struct tune_params emag_tunings =
>    2,   /* min_div_recip_mul_df.  */
>    17,  /* max_case_values.  */
>    tune_params::AUTOPREFETCHER_OFF,     /* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),       /* tune_flags.  */
> +  (AARCH64_EXTRA_TUNE_NONE),   /* tune_flags.  */
>    &xgene1_prefetch_tune,
>    AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
>    AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> diff --git a/gcc/config/aarch64/tuning_models/xgene1.h b/gcc/config/aarch64/tuning_models/xgene1.h
> index 3301f0252609f3798b78bac18a480af8c3f0f45d..432793eba9cd66c6d3423a9443fa0534438c21b6 100644
> --- a/gcc/config/aarch64/tuning_models/xgene1.h
> +++ b/gcc/config/aarch64/tuning_models/xgene1.h
> @@ -136,7 +136,7 @@ static const struct tune_params xgene1_tunings =
>    2,   /* min_div_recip_mul_df.  */
>    17,  /* max_case_values.  */
>    tune_params::AUTOPREFETCHER_OFF,     /* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),       /* tune_flags.  */
> +  (AARCH64_EXTRA_TUNE_NONE),   /* tune_flags.  */
>    &xgene1_prefetch_tune,
>    AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
>    AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c
> deleted file mode 100644
> index 38c1870c47c69175e4b641532333bf7108351476..0000000000000000000000000000000000000000
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c
> +++ /dev/null
> @@ -1,26 +0,0 @@
> -/* { dg-options "-O2 -moverride=tune=no_ldp_stp_qregs" } */
> -
> -typedef float float32x4_t __attribute__ ((__vector_size__ ((16))));
> -
> -float32x4_t arr[4][4];
> -
> -void
> -foo (float32x4_t x, float32x4_t y)
> -{
> -  arr[0][1] = x;
> -  arr[1][0] = y;
> -  arr[2][0] = x;
> -  arr[1][1] = y;
> -  arr[0][2] = x;
> -  arr[0][3] = y;
> -  arr[1][2] = x;
> -  arr[2][1] = y;
> -  arr[3][0] = x;
> -  arr[3][1] = y;
> -  arr[2][2] = x;
> -  arr[1][3] = y;
> -  arr[2][3] = x;
> -  arr[3][2] = y;
> -}
> -
> -/* { dg-final { scan-assembler-not "stp\tq\[0-9\]+, q\[0-9\]" } } */
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 22ed95eb743c9ee44e745560b207d389c8fca03b..de6685f75a2650d9a7d39fe6781ec57214092eb1 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -315,17 +315,9 @@  any_post_modify_p (rtx x)
 static bool
 ldp_operand_mode_ok_p (machine_mode mode)
 {
-  const bool allow_qregs
-    = !(aarch64_tune_params.extra_tuning_flags
-	& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
-
   if (!aarch64_ldpstp_operand_mode_p (mode))
     return false;
 
-  const auto size = GET_MODE_SIZE (mode).to_constant ();
-  if (size == 16 && !allow_qregs)
-    return false;
-
   // We don't pair up TImode accesses before RA because TImode is
   // special in that it can be allocated to a pair of GPRs or a single
   // FPR, and the RA is best placed to make that decision.
diff --git a/gcc/config/aarch64/aarch64-ldpstp.md b/gcc/config/aarch64/aarch64-ldpstp.md
index b7c0bf05cd18c971955d667bae91d7c3dc3f512e..7890a8cc32b24f8e1bc29cb722b10e511e7881ab 100644
--- a/gcc/config/aarch64/aarch64-ldpstp.md
+++ b/gcc/config/aarch64/aarch64-ldpstp.md
@@ -96,9 +96,7 @@  (define_peephole2
    (set (match_operand:VQ2 2 "register_operand" "")
 	(match_operand:VQ2 3 "memory_operand" ""))]
   "TARGET_FLOAT
-   && aarch64_operands_ok_for_ldpstp (operands, true)
-   && (aarch64_tune_params.extra_tuning_flags
-	& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
+   && aarch64_operands_ok_for_ldpstp (operands, true)"
   [(const_int 0)]
 {
   aarch64_finish_ldpstp_peephole (operands, true);
@@ -111,9 +109,7 @@  (define_peephole2
    (set (match_operand:VQ2 2 "memory_operand" "")
 	(match_operand:VQ2 3 "register_operand" ""))]
   "TARGET_FLOAT
-   && aarch64_operands_ok_for_ldpstp (operands, false)
-   && (aarch64_tune_params.extra_tuning_flags
-	& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
+   && aarch64_operands_ok_for_ldpstp (operands, false)"
   [(const_int 0)]
 {
   aarch64_finish_ldpstp_peephole (operands, false);
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index d917da720b22ed6aaf360dc4ebbe8efc4a3185f2..d5bcaebce770f0b217aac783063d39135f754c77 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -36,9 +36,6 @@  AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
    are not considered cheap.  */
 AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
 
-/* Disallow load/store pair instructions on Q-registers.  */
-AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS)
-
 AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
 
 AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 433c160cba22374f6b7a3445c0202789927abd25..d7e8379b2eb90eccb8608a15cc8d11cc2187a9e7 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -10335,9 +10335,7 @@  aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
 	 || mode == SDmode || mode == DDmode
 	 || (aarch64_vector_mode_supported_p (mode)
 	     && (known_eq (GET_MODE_SIZE (mode), 8)
-		 || (known_eq (GET_MODE_SIZE (mode), 16)
-		    && (aarch64_tune_params.extra_tuning_flags
-			& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
+		 || known_eq (GET_MODE_SIZE (mode), 16)));
 }
 
 /* Return true if REGNO is a virtual pointer register, or an eliminable
@@ -16448,10 +16446,6 @@  aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
       return false;
     }
 
-  if (aarch64_tune_params.extra_tuning_flags
-      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
-    return false;
-
   return is_gimple_assign (stmt_info->stmt);
 }
 
@@ -17099,9 +17093,6 @@  aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
       /* Count 1 insn per vector if we can't form STP Q pairs.  */
       if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
 	return count * 2;
-      if (aarch64_tune_params.extra_tuning_flags
-	  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
-	return count * 2;
 
       if (stmt_info)
 	{
@@ -26454,11 +26445,9 @@  aarch64_expand_cpymem (rtx *operands, bool is_memmove)
     return aarch64_expand_cpymem_mops (operands, is_memmove);
 
   unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
-  bool use_ldpq = TARGET_SIMD && !(aarch64_tune_params.extra_tuning_flags
-				   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
 
   /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
-  unsigned max_copy_size = use_ldpq ? 256 : 128;
+  unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
   unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
 				       : aarch64_mops_memcpy_size_threshold;
 
@@ -26470,16 +26459,13 @@  aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
     return aarch64_expand_cpymem_mops (operands, is_memmove);
 
-  unsigned copy_max = 32;
-
-  /* Default to 32-byte LDP/STP on large copies, however small copies, no SIMD
-     support or slow LDP/STP fall back to 16-byte chunks.
+  /* Default to 32-byte LDP/STP on large copies, however small copies or
+     no SIMD support fall back to 16-byte chunks.
 
      ??? Although it would be possible to use LDP/STP Qn in streaming mode
      (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
      whether that would improve performance.  */
-  if (size <= 24 || !use_ldpq)
-    copy_max = 16;
+  unsigned copy_max = (size <= 24 || !TARGET_SIMD) ? 16 : 32;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -26612,13 +26598,8 @@  aarch64_expand_setmem (rtx *operands)
   src = expand_vector_broadcast (V16QImode, val);
   src = force_reg (V16QImode, src);
 
-  /* Set maximum amount to write in one go.  We allow 32-byte chunks based
-     on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
-  unsigned set_max = 32;
-
-  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
-		    & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
-    set_max = 16;
+  /* Set maximum number of bytes to write per instruction.  */
+  unsigned set_max = (len <= 24) ? 16 : 32;
 
   int offset = 0;
   while (len > 0)
diff --git a/gcc/config/aarch64/tuning_models/emag.h b/gcc/config/aarch64/tuning_models/emag.h
index cbaf8853ec4f040cab6dc82e62150d99c30ad1df..b6a9c9e2eb17e9ead91e70088a5ece0c74672e70 100644
--- a/gcc/config/aarch64/tuning_models/emag.h
+++ b/gcc/config/aarch64/tuning_models/emag.h
@@ -51,7 +51,7 @@  static const struct tune_params emag_tunings =
   2,	/* min_div_recip_mul_df.  */
   17,	/* max_case_values.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
   &xgene1_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/xgene1.h b/gcc/config/aarch64/tuning_models/xgene1.h
index 3301f0252609f3798b78bac18a480af8c3f0f45d..432793eba9cd66c6d3423a9443fa0534438c21b6 100644
--- a/gcc/config/aarch64/tuning_models/xgene1.h
+++ b/gcc/config/aarch64/tuning_models/xgene1.h
@@ -136,7 +136,7 @@  static const struct tune_params xgene1_tunings =
   2,	/* min_div_recip_mul_df.  */
   17,	/* max_case_values.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
   &xgene1_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS    /* stp_policy_model.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c
deleted file mode 100644
index 38c1870c47c69175e4b641532333bf7108351476..0000000000000000000000000000000000000000
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c
+++ /dev/null
@@ -1,26 +0,0 @@ 
-/* { dg-options "-O2 -moverride=tune=no_ldp_stp_qregs" } */
-
-typedef float float32x4_t __attribute__ ((__vector_size__ ((16))));
-
-float32x4_t arr[4][4];
-
-void
-foo (float32x4_t x, float32x4_t y)
-{
-  arr[0][1] = x;
-  arr[1][0] = y;
-  arr[2][0] = x;
-  arr[1][1] = y;
-  arr[0][2] = x;
-  arr[0][3] = y;
-  arr[1][2] = x;
-  arr[2][1] = y;
-  arr[3][0] = x;
-  arr[3][1] = y;
-  arr[2][2] = x;
-  arr[1][3] = y;
-  arr[2][3] = x;
-  arr[3][2] = y;
-}
-
-/* { dg-final { scan-assembler-not "stp\tq\[0-9\]+, q\[0-9\]" } } */