
AArch64: memcpy/memset expansions should not emit LDP/STP [PR113618]

Message ID PAWPR08MB898278D3050BCDBEB74859F083432@PAWPR08MB8982.eurprd08.prod.outlook.com
State New
Series AArch64: memcpy/memset expansions should not emit LDP/STP [PR113618]

Commit Message

Wilco Dijkstra Feb. 1, 2024, 1:45 p.m. UTC
The new RTL introduced for LDP/STP results in regressions due to use of UNSPEC.
Given the new LDP fusion pass is good at finding LDP opportunities, change the
memcpy, memmove and memset expansions to emit single vector loads/stores.
This fixes the regression and enables more RTL optimization on the standard
memory accesses.  SPEC2017 performance improves slightly.  Codesize is a bit
worse due to missed LDP opportunities as discussed in the PR.
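
As an illustration (not part of the patch, and subject to tuning and
scheduling differences), a fixed-size memset now expands into plain
V16QImode stores, leaving it to the ldp_fusion pass to combine them into
STP Q-form pairs:

/* Sketch only: after this change each 16-byte chunk is a single vector
   store; any "stp q" pairing is done later by the ldp_fusion pass.  */
void set64 (char *p)
{
    __builtin_memset (p, 0, 64);
}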

Passes regress, OK for commit?

gcc/ChangeLog:
        PR target/113618
        * config/aarch64/aarch64.cc (aarch64_copy_one_block): Remove. 
        (aarch64_expand_cpymem): Emit single load/store only.
        (aarch64_set_one_block): Remove.
        (aarch64_expand_setmem): Emit single stores only.

gcc/testsuite/ChangeLog:
        PR target/113618
        * gcc.target/aarch64/pr113618.c: New test.

---

Comments

Richard Sandiford Feb. 1, 2024, 6:21 p.m. UTC | #1
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> The new RTL introduced for LDP/STP results in regressions due to use of UNSPEC.
> Given the new LDP fusion pass is good at finding LDP opportunities, change the
> memcpy, memmove and memset expansions to emit single vector loads/stores.
> This fixes the regression and enables more RTL optimization on the standard
> memory accesses.  SPEC2017 performance improves slightly.  Codesize is a bit
> worse due to missed LDP opportunities as discussed in the PR.

It looks like this is really doing two things at once: disabling the
direct emission of LDP/STP Qs, and switching the GPR handling from using
pairs of DImode moves to single TImode moves.  At least, that seems to be
the effect of...

> Passes regress, OK for commit?
>
> gcc/ChangeLog:
>         PR target/113618
>         * config/aarch64/aarch64.cc (aarch64_copy_one_block): Remove.
>         (aarch64_expand_cpymem): Emit single load/store only.
>         (aarch64_set_one_block): Remove.
>         (aarch64_expand_setmem): Emit single stores only.
>
> gcc/testsuite/ChangeLog:
>         PR target/113618
>         * gcc.target/aarch64/pr113618.c: New test.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index d17198b4a5f73f8be8aeca3258b81809ffb48eac..2194441b949a53f181fe373e07bc18341c014918 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -26376,33 +26376,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
>                                     next, amount);
>  }
>
> -typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
> -
> -/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
> -static void
> -aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
> -                       int offset, machine_mode mode)
> -{
> -  /* Emit explict load/store pair instructions for 32-byte copies.  */
> -  if (known_eq (GET_MODE_SIZE (mode), 32))
> -    {
> -      mode = V4SImode;
> -      rtx src1 = adjust_address (src, mode, offset);
> -      rtx dst1 = adjust_address (dst, mode, offset);
> -      rtx reg1 = gen_reg_rtx (mode);
> -      rtx reg2 = gen_reg_rtx (mode);
> -      rtx load = aarch64_gen_load_pair (reg1, reg2, src1);
> -      rtx store = aarch64_gen_store_pair (dst1, reg1, reg2);
> -      ops.safe_push ({ load, store });
> -      return;
> -    }
> -
> -  rtx reg = gen_reg_rtx (mode);
> -  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
> -  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
> -  ops.safe_push ({ load, store });
> -}
> -
>  /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
>     from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
>     rather than memcpy.  Return true iff we succeeded.  */
> @@ -26438,7 +26411,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>    rtx src = operands[1];
>    unsigned align = UINTVAL (operands[3]);
>    rtx base;
> -  machine_mode cur_mode = BLKmode, next_mode;
> +  machine_mode mode = BLKmode, next_mode;
>
>    /* Variable-sized or strict-align copies may use the MOPS expansion.  */
>    if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
> @@ -26465,7 +26438,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>       ??? Although it would be possible to use LDP/STP Qn in streaming mode
>       (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
>       whether that would improve performance.  */
> -  unsigned copy_max = (size <= 24 || !TARGET_SIMD) ? 16 : 32;
> +  bool use_qregs = size > 24 && TARGET_SIMD;
>
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> @@ -26473,7 +26446,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>    base = copy_to_mode_reg (Pmode, XEXP (src, 0));
>    src = adjust_automodify_address (src, VOIDmode, base, 0);
>
> -  copy_ops ops;
> +  auto_vec<std::pair<rtx, rtx>, 16> ops;
>    int offset = 0;
>
>    while (size > 0)
> @@ -26482,23 +26455,27 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>          or writing.  */
>        opt_scalar_int_mode mode_iter;
>        FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> -       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
> -         cur_mode = mode_iter.require ();
> +       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
> +         mode = mode_iter.require ();

...hard-coding 16 here and...

> +
> +      gcc_assert (mode != BLKmode);
>
> -      gcc_assert (cur_mode != BLKmode);
> +      mode_bytes = GET_MODE_SIZE (mode).to_constant ();
>
> -      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
> +      /* Prefer Q-register accesses.  */
> +      if (mode_bytes == 16 && use_qregs)
> +       mode = V4SImode;
>
> -      /* Prefer Q-register accesses for the last bytes.  */
> -      if (mode_bytes == 16 && copy_max == 32)
> -       cur_mode = V4SImode;
> -      aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
> +      rtx reg = gen_reg_rtx (mode);
> +      rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
> +      rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
> +      ops.safe_push ({ load, store });
>        size -= mode_bytes;
>        offset += mode_bytes;
>
>        /* Emit trailing copies using overlapping unaligned accesses
>          (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> -      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
> +      if (size > 0 && size < 16 && !STRICT_ALIGNMENT)

...changing this limit from 8 to 16 for non-SIMD copies.

Is that deliberate?  If so, please mention that kind of thing in the
covering note.  It sounded like this was intended to change the handling
of vector moves only.

This means that, for GPRs, we are now effectively using the double-word
move patterns to get an LDP/STP indirectly, rather than directly as before.
That seems OK, and I suppose might be slightly preferable to the current
code for things like:

  char a[31], b[31];
  void f() { __builtin_memcpy(a, b, 31); }

Compiled with -O2 -mgeneral-regs-only, the patch gives:

        adrp    x0, .LANCHOR0
        add     x0, x0, :lo12:.LANCHOR0
        add     x2, x0, 47
        add     x1, x0, 15
        ldp     x4, x5, [x0, 32]
        stp     x4, x5, [x0]
        ldp     x2, x3, [x2]
        stp     x2, x3, [x1]

rather than GCC 13's:

        adrp    x0, .LANCHOR0
        add     x0, x0, :lo12:.LANCHOR0
        ldp     x2, x3, [x0, 32]
        stp     x2, x3, [x0]
        ldr     x2, [x0, 48]
        str     x2, [x0, 16]
        ldr     x1, [x0, 55]
        str     x1, [x0, 23]
        ret

But that raises the question: should we do the same thing for Q registers
and V2x16QImode?

We currently use LD1/ST1 in preference to LDP/STP for little-endian
movv2x16qi (but do use LDP/STP for big-endian).  But if that's not
desirable here then it probably isn't desirable elsewhere either.

If emitting individual vector loads and stores is better than using
V2x16QI (and I can see that it might be), then why isn't the same
true for GPRs and DImode vs TImode?

I think the final version of this patch should go in ahead of the
clean-up patch.  As I mentioned in the other review, I think the
clean-up should wait for GCC 15.

Thanks,
Richard

>         {
>           next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
>           int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
> @@ -26510,7 +26487,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>
>    /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
>    int nops = ops.length();
> -  int inc = is_memmove ? nops : nops == 4 ? 2 : 3;
> +  int inc = is_memmove || nops <= 8 ? nops : 6;
>
>    for (int i = 0; i < nops; i += inc)
>      {
> @@ -26525,23 +26502,6 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>    return true;
>  }
>
> -/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
> -static void
> -aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
> -{
> -  /* Emit explict store pair instructions for 32-byte writes.  */
> -  if (known_eq (GET_MODE_SIZE (mode), 32))
> -    {
> -      mode = V16QImode;
> -      rtx dst1 = adjust_address (dst, mode, offset);
> -      emit_insn (aarch64_gen_store_pair (dst1, src, src));
> -      return;
> -    }
> -  if (known_lt (GET_MODE_SIZE (mode), 16))
> -    src = lowpart_subreg (mode, src, GET_MODE (src));
> -  emit_move_insn (adjust_address (dst, mode, offset), src);
> -}
> -
>  /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
>     as for the setmem pattern.  Return true iff we succeed.  */
>  static bool
> @@ -26574,7 +26534,7 @@ aarch64_expand_setmem (rtx *operands)
>    rtx val = operands[2], src;
>    unsigned align = UINTVAL (operands[3]);
>    rtx base;
> -  machine_mode cur_mode = BLKmode, next_mode;
> +  machine_mode mode = BLKmode, next_mode;
>
>    /* Variable-sized or strict-align memset may use the MOPS expansion.  */
>    if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
> @@ -26595,11 +26555,8 @@ aarch64_expand_setmem (rtx *operands)
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
>
>    /* Prepare the val using a DUP/MOVI v0.16B, val.  */
> -  src = expand_vector_broadcast (V16QImode, val);
> -  src = force_reg (V16QImode, src);
> -
> -  /* Set maximum number of bytes to write per instruction.  */
> -  unsigned set_max = (len <= 24) ? 16 : 32;
> +  val = expand_vector_broadcast (V16QImode, val);
> +  val = force_reg (V16QImode, val);
>
>    int offset = 0;
>    while (len > 0)
> @@ -26608,24 +26565,28 @@ aarch64_expand_setmem (rtx *operands)
>          over writing.  */
>        opt_scalar_int_mode mode_iter;
>        FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> -       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
> -         cur_mode = mode_iter.require ();
> +       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, 16))
> +         mode = mode_iter.require ();
> +
> +      gcc_assert (mode != BLKmode);
>
> -      gcc_assert (cur_mode != BLKmode);
> +      mode_bytes = GET_MODE_SIZE (mode).to_constant ();
>
> -      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
> +      src = val;
>
> -      /* Prefer Q-register accesses for the last bytes.  */
> +      /* Prefer Q-register accesses.  */
>        if (mode_bytes == 16)
> -       cur_mode = V16QImode;
> +       mode = V16QImode;
> +      else
> +       src = lowpart_subreg (mode, src, GET_MODE (val));
>
> -      aarch64_set_one_block (src, dst, offset, cur_mode);
> +      emit_move_insn (adjust_address (dst, mode, offset), src);
>        len -= mode_bytes;
>        offset += mode_bytes;
>
>        /* Emit trailing writes using overlapping unaligned accesses
>          (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> -      if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
> +      if (len > 0 && len < 16 && !STRICT_ALIGNMENT)
>         {
>           next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
>           int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr113618.c b/gcc/testsuite/gcc.target/aarch64/pr113618.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f582360e7c1d149ad94c78c3e66f1c4a973750bb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr113618.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +
> +/*
> +** move32:
> +**     ...
> +**     ldp     q([0-9]+), q([0-9]+), \[x0\]
> +**     stp     q\1, q\2, \[x1\]
> +**     ...
> +*/
> +
> +void move32 (char *a, char *b)
> +{
> +    char temp[32];
> +    __builtin_memcpy (temp, a, 32);
> +    __builtin_memcpy (b, temp, 32);
> +}
> +
> +/*
> +** move64:
> +**     ...
> +**     ldp     q([0-9]+), q([0-9]+), \[x0\]
> +**     ldp     q([0-9]+), q([0-9]+), \[x0, 32\]
> +**     stp     q\1, q\2, \[x1\]
> +**     stp     q\3, q\4, \[x1, 32\]
> +**     ...
> +*/
> +
> +void move64 (char *a, char *b)
> +{
> +    char temp[64];
> +    __builtin_memcpy (temp, a, 64);
> +    __builtin_memcpy (b, temp, 64);
> +}
Wilco Dijkstra Feb. 22, 2024, 5:05 p.m. UTC | #2
Hi Richard,

> It looks like this is really doing two things at once: disabling the
> direct emission of LDP/STP Qs, and switching the GPR handling from using
> pairs of DImode moves to single TImode moves.  At least, that seems to be
> the effect of...

No, it still uses TImode for the !TARGET_SIMD case.

> +       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
> +         mode = mode_iter.require ();

> ...hard-coding 16 here and...

This only affects the Q register case.

> -      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
> +      if (size > 0 && size < 16 && !STRICT_ALIGNMENT)

> ...changing this limit from 8 to 16 for non-SIMD copies.
>
> Is that deliberate?  If so, please mention that kind of thing in the
> covering note.  It sounded like this was intended to change the handling
> of vector moves only.

Yes, it's deliberate. It now basically treats everything as blocks of 16 bytes,
which has a nice simplifying effect. I've added a note.
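
As a rough model (illustration only; copy_model is a hypothetical helper,
not code from the patch), the expansion walks the copy in blocks of at
most 16 bytes and finishes with an overlapping access that ends exactly
at the last byte:

/* Hypothetical sketch of the offset bookkeeping: pick the largest
   power-of-two access up to 16 bytes, then round the remainder up and
   back up the offset so the final access overlaps.  A 31-byte copy
   becomes bytes [0,16) followed by bytes [15,31).  */
static void copy_model (char *dst, const char *src, int size)
{
  int offset = 0;
  while (size > 0)
    {
      /* Largest access size <= MIN (size, 16).  */
      int bytes = size >= 16 ? 16 : 1 << (31 - __builtin_clz (size));
      __builtin_memcpy (dst + offset, src + offset, bytes);
      size -= bytes;
      offset += bytes;
      /* Trailing overlap (the !STRICT_ALIGNMENT case).  */
      if (size > 0 && size < 16)
        {
          int n = size > 8 ? 16 : size > 4 ? 8 : size > 2 ? 4 : size;
          offset -= n - size;
          size = n;
        }
    }
}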

> This means that, for GPRs, we are now effectively using the double-word
> move patterns to get an LDP/STP indirectly, rather than directly as before.

No, there is no difference here.

> That seems OK, and I suppose might be slightly preferable to the current
> code for things like:
>
>  char a[31], b[31];
>  void f() { __builtin_memcpy(a, b, 31); }

Yes, an unaligned tail improves slightly by using blocks of 16 bytes.
It's a very rare case: -mgeneral-regs-only is rarely used, and most
fixed-size copies are a nice multiple of 8.

> But that raises the question: should we do the same thing for Q registers
> and V2x16QImode?

I don't believe it makes sense to use those complex types, and it likely
blocks optimizations in a similar way to UNSPEC.

> If emitting individual vector loads and stores is better than using
> V2x16QI (and I can see that it might be), then why isn't the same
> true for GPRs and DImode vs TImode?

It might be feasible to do the same for scalar copies. But given that
using TImode works fine, there is no regression here, and use of
-mgeneral-regs-only is rare, what would the benefit be of doing that?

> I think the final version of this patch should go in ahead of the
> clean-up patch.  As I mentioned in the other review, I think the
> clean-up should wait for GCC 15.

I've rebased it to the trunk.

Cheers,
Wilco


v2: Rebase to trunk

The new RTL introduced for LDP/STP results in regressions due to use of UNSPEC.
Given the new LDP fusion pass is good at finding LDP opportunities, change the
memcpy, memmove and memset expansions to emit single vector loads/stores.
This fixes the regression and enables more RTL optimization on the standard
memory accesses.  Handling of unaligned tail of memcpy/memmove is improved
with -mgeneral-regs-only.  SPEC2017 performance improves slightly.  Codesize
is a bit worse due to missed LDP opportunities as discussed in the PR.

Passes regress, OK for commit?

gcc/ChangeLog:
        PR target/113618
        * config/aarch64/aarch64.cc (aarch64_copy_one_block): Remove. 
        (aarch64_expand_cpymem): Emit single load/store only.
        (aarch64_set_one_block): Emit single stores only.

gcc/testsuite/ChangeLog:
        PR target/113618
        * gcc.target/aarch64/pr113618.c: New test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 16318bf925883ecedf9345e53fc0824a553b2747..0a28e033088a00818c6ed9fa8c15ecdee5a86c35 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26465,33 +26465,6 @@ aarch64_progress_pointer (rtx pointer)
   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
 }
 
-typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
-
-/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
-static void
-aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
-			int offset, machine_mode mode)
-{
-  /* Emit explict load/store pair instructions for 32-byte copies.  */
-  if (known_eq (GET_MODE_SIZE (mode), 32))
-    {
-      mode = V4SImode;
-      rtx src1 = adjust_address (src, mode, offset);
-      rtx dst1 = adjust_address (dst, mode, offset);
-      rtx reg1 = gen_reg_rtx (mode);
-      rtx reg2 = gen_reg_rtx (mode);
-      rtx load = aarch64_gen_load_pair (reg1, reg2, src1);
-      rtx store = aarch64_gen_store_pair (dst1, reg1, reg2);
-      ops.safe_push ({ load, store });
-      return;
-    }
-
-  rtx reg = gen_reg_rtx (mode);
-  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
-  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
-  ops.safe_push ({ load, store });
-}
-
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
    from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
    rather than memcpy.  Return true iff we succeeded.  */
@@ -26527,7 +26500,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   rtx src = operands[1];
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
+  machine_mode mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align copies may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
@@ -26550,16 +26523,12 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
     return aarch64_expand_cpymem_mops (operands, is_memmove);
 
-  unsigned copy_max = 32;
-
-  /* Default to 32-byte LDP/STP on large copies, however small copies, no SIMD
-     support or slow LDP/STP fall back to 16-byte chunks.
-
+  /* Default to 32-byte LDP/STP on large copies, however small copies or
+     no SIMD support fall back to 16-byte chunks.
      ??? Although it would be possible to use LDP/STP Qn in streaming mode
      (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
      whether that would improve performance.  */
-  if (size <= 24 || !use_ldpq)
-    copy_max = 16;
+  bool use_qregs = size > 24 && TARGET_SIMD;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -26567,7 +26536,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  copy_ops ops;
+  auto_vec<std::pair<rtx, rtx>, 16> ops;
   int offset = 0;
 
   while (size > 0)
@@ -26576,23 +26545,27 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
 	 or writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
-	  cur_mode = mode_iter.require ();
+	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
+	  mode = mode_iter.require ();
 
-      gcc_assert (cur_mode != BLKmode);
+      gcc_assert (mode != BLKmode);
+
+      mode_bytes = GET_MODE_SIZE (mode).to_constant ();
 
-      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+      /* Prefer Q-register accesses.  */
+      if (mode_bytes == 16 && use_qregs)
+	mode = V4SImode;
 
-      /* Prefer Q-register accesses for the last bytes.  */
-      if (mode_bytes == 16 && copy_max == 32)
-	cur_mode = V4SImode;
-      aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
+      rtx reg = gen_reg_rtx (mode);
+      rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
+      rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
+      ops.safe_push ({ load, store });
       size -= mode_bytes;
       offset += mode_bytes;
 
       /* Emit trailing copies using overlapping unaligned accesses
 	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
+      if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
 	{
 	  next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
 	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
@@ -26604,7 +26577,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
 
   /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
   int nops = ops.length();
-  int inc = is_memmove ? nops : nops == 4 ? 2 : 3;
+  int inc = is_memmove || nops <= 8 ? nops : 6;
 
   for (int i = 0; i < nops; i += inc)
     {
@@ -26633,7 +26606,8 @@ aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
       /* "Cast" the *dst to the correct mode.  */
       *dst = adjust_address (*dst, mode, 0);
       /* Emit the memset.  */
-      emit_insn (aarch64_gen_store_pair (*dst, src, src));
+      emit_move_insn (*dst, src);
+      emit_move_insn (aarch64_move_pointer (*dst, 16), src);
 
       /* Move the pointers forward.  */
       *dst = aarch64_move_pointer (*dst, 32);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr113618.c b/gcc/testsuite/gcc.target/aarch64/pr113618.c
new file mode 100644
index 0000000000000000000000000000000000000000..f582360e7c1d149ad94c78c3e66f1c4a973750bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr113618.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+
+/*
+** move32:
+**	...
+**	ldp	q([0-9]+), q([0-9]+), \[x0\]
+**	stp	q\1, q\2, \[x1\]
+**	...
+*/
+
+void move32 (char *a, char *b)
+{
+    char temp[32];
+    __builtin_memcpy (temp, a, 32);
+    __builtin_memcpy (b, temp, 32);
+}
+
+/*
+** move64:
+**	...
+**	ldp	q([0-9]+), q([0-9]+), \[x0\]
+**	ldp	q([0-9]+), q([0-9]+), \[x0, 32\]
+**	stp	q\1, q\2, \[x1\]
+**	stp	q\3, q\4, \[x1, 32\]
+**	...
+*/
+
+void move64 (char *a, char *b)
+{
+    char temp[64];
+    __builtin_memcpy (temp, a, 64);
+    __builtin_memcpy (b, temp, 64);
+}
Richard Sandiford March 7, 2024, 5:09 p.m. UTC | #3
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
>> It looks like this is really doing two things at once: disabling the
>> direct emission of LDP/STP Qs, and switching the GPR handling from using
>> pairs of DImode moves to single TImode moves.  At least, that seems to be
>> the effect of...
>
> No, it still uses TImode for the !TARGET_SIMD case.
>
>> +       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
>> +         mode = mode_iter.require ();
>
>> ...hard-coding 16 here and...
>
> This only affects the Q register case.
>
>> -      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
>> +      if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
>
>> ...changing this limit from 8 to 16 for non-SIMD copies.
>>
>> Is that deliberate?  If so, please mention that kind of thing in the
>> covering note.  It sounded like this was intended to change the handling
>> of vector moves only.
>
> Yes, it's deliberate. It now basically treats everything as blocks of 16 bytes,
> which has a nice simplifying effect. I've added a note.
>
>> This means that, for GPRs, we are now effectively using the double-word
>> move patterns to get an LDP/STP indirectly, rather than directly as before.
>
> No, there is no difference here.
>
>> That seems OK, and I suppose might be slightly preferable to the current
>> code for things like:
>>
>>  char a[31], b[31];
>>  void f() { __builtin_memcpy(a, b, 31); }
>
> Yes, an unaligned tail improves slightly by using blocks of 16 bytes.
> It's a very rare case: -mgeneral-regs-only is rarely used, and most
> fixed-size copies are a nice multiple of 8.
>
>> But that raises the question: should we do the same thing for Q registers
>> and V2x16QImode?
>
> I don't believe it makes sense to use those complex types, and it likely
> blocks optimizations in a similar way to UNSPEC.

A V2x16QImode move isn't particularly special as far as target-
independent code is concerned.  It's just an ordinary move of an
ordinary vector mode.  And the vector modes that we're picking here
generally have nothing to do with the source data.

But I'd forgotten about:

  /* On LE, for AdvSIMD, don't support anything other than POST_INC or
     REG addressing.  */
  if (advsimd_struct_p
      && TARGET_SIMD
      && !BYTES_BIG_ENDIAN
      && (code != POST_INC && code != REG))
    return false;

> v2: Rebase to trunk
>
> The new RTL introduced for LDP/STP results in regressions due to use of UNSPEC.
> Given the new LDP fusion pass is good at finding LDP opportunities, change the
> memcpy, memmove and memset expansions to emit single vector loads/stores.
> This fixes the regression and enables more RTL optimization on the standard
> memory accesses.  Handling of unaligned tail of memcpy/memmove is improved
> with -mgeneral-regs-only.  SPEC2017 performance improves slightly.  Codesize
> is a bit worse due to missed LDP opportunities as discussed in the PR.
>
> Passes regress, OK for commit?
>
> gcc/ChangeLog:
>         PR target/113618
>         * config/aarch64/aarch64.cc (aarch64_copy_one_block): Remove. 
>         (aarch64_expand_cpymem): Emit single load/store only.
>         (aarch64_set_one_block): Emit single stores only.
>
> gcc/testsuite/ChangeLog:
>         PR target/113618
>         * gcc.target/aarch64/pr113618.c: New test.

OK, thanks.

Richard

> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 16318bf925883ecedf9345e53fc0824a553b2747..0a28e033088a00818c6ed9fa8c15ecdee5a86c35 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -26465,33 +26465,6 @@ aarch64_progress_pointer (rtx pointer)
>    return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
>  }
>  
> -typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
> -
> -/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
> -static void
> -aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
> -			int offset, machine_mode mode)
> -{
> -  /* Emit explict load/store pair instructions for 32-byte copies.  */
> -  if (known_eq (GET_MODE_SIZE (mode), 32))
> -    {
> -      mode = V4SImode;
> -      rtx src1 = adjust_address (src, mode, offset);
> -      rtx dst1 = adjust_address (dst, mode, offset);
> -      rtx reg1 = gen_reg_rtx (mode);
> -      rtx reg2 = gen_reg_rtx (mode);
> -      rtx load = aarch64_gen_load_pair (reg1, reg2, src1);
> -      rtx store = aarch64_gen_store_pair (dst1, reg1, reg2);
> -      ops.safe_push ({ load, store });
> -      return;
> -    }
> -
> -  rtx reg = gen_reg_rtx (mode);
> -  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
> -  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
> -  ops.safe_push ({ load, store });
> -}
> -
>  /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
>     from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
>     rather than memcpy.  Return true iff we succeeded.  */
> @@ -26527,7 +26500,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>    rtx src = operands[1];
>    unsigned align = UINTVAL (operands[3]);
>    rtx base;
> -  machine_mode cur_mode = BLKmode, next_mode;
> +  machine_mode mode = BLKmode, next_mode;
>  
>    /* Variable-sized or strict-align copies may use the MOPS expansion.  */
>    if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
> @@ -26550,16 +26523,12 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>    if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
>      return aarch64_expand_cpymem_mops (operands, is_memmove);
>  
> -  unsigned copy_max = 32;
> -
> -  /* Default to 32-byte LDP/STP on large copies, however small copies, no SIMD
> -     support or slow LDP/STP fall back to 16-byte chunks.
> -
> +  /* Default to 32-byte LDP/STP on large copies, however small copies or
> +     no SIMD support fall back to 16-byte chunks.
>       ??? Although it would be possible to use LDP/STP Qn in streaming mode
>       (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
>       whether that would improve performance.  */
> -  if (size <= 24 || !use_ldpq)
> -    copy_max = 16;
> +  bool use_qregs = size > 24 && TARGET_SIMD;
>  
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> @@ -26567,7 +26536,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>    base = copy_to_mode_reg (Pmode, XEXP (src, 0));
>    src = adjust_automodify_address (src, VOIDmode, base, 0);
>  
> -  copy_ops ops;
> +  auto_vec<std::pair<rtx, rtx>, 16> ops;
>    int offset = 0;
>  
>    while (size > 0)
> @@ -26576,23 +26545,27 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>  	 or writing.  */
>        opt_scalar_int_mode mode_iter;
>        FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> -	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
> -	  cur_mode = mode_iter.require ();
> +	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
> +	  mode = mode_iter.require ();
>  
> -      gcc_assert (cur_mode != BLKmode);
> +      gcc_assert (mode != BLKmode);
> +
> +      mode_bytes = GET_MODE_SIZE (mode).to_constant ();
>  
> -      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
> +      /* Prefer Q-register accesses.  */
> +      if (mode_bytes == 16 && use_qregs)
> +	mode = V4SImode;
>  
> -      /* Prefer Q-register accesses for the last bytes.  */
> -      if (mode_bytes == 16 && copy_max == 32)
> -	cur_mode = V4SImode;
> -      aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
> +      rtx reg = gen_reg_rtx (mode);
> +      rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
> +      rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
> +      ops.safe_push ({ load, store });
>        size -= mode_bytes;
>        offset += mode_bytes;
>  
>        /* Emit trailing copies using overlapping unaligned accesses
>  	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> -      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
> +      if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
>  	{
>  	  next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
>  	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
> @@ -26604,7 +26577,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>  
>    /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
>    int nops = ops.length();
> -  int inc = is_memmove ? nops : nops == 4 ? 2 : 3;
> +  int inc = is_memmove || nops <= 8 ? nops : 6;
>  
>    for (int i = 0; i < nops; i += inc)
>      {
> @@ -26633,7 +26606,8 @@ aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
>        /* "Cast" the *dst to the correct mode.  */
>        *dst = adjust_address (*dst, mode, 0);
>        /* Emit the memset.  */
> -      emit_insn (aarch64_gen_store_pair (*dst, src, src));
> +      emit_move_insn (*dst, src);
> +      emit_move_insn (aarch64_move_pointer (*dst, 16), src);
>  
>        /* Move the pointers forward.  */
>        *dst = aarch64_move_pointer (*dst, 32);
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr113618.c b/gcc/testsuite/gcc.target/aarch64/pr113618.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f582360e7c1d149ad94c78c3e66f1c4a973750bb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr113618.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +
> +/*
> +** move32:
> +**	...
> +**	ldp	q([0-9]+), q([0-9]+), \[x0\]
> +**	stp	q\1, q\2, \[x1\]
> +**	...
> +*/
> +
> +void move32 (char *a, char *b)
> +{
> +    char temp[32];
> +    __builtin_memcpy (temp, a, 32);
> +    __builtin_memcpy (b, temp, 32);
> +}
> +
> +/*
> +** move64:
> +**	...
> +**	ldp	q([0-9]+), q([0-9]+), \[x0\]
> +**	ldp	q([0-9]+), q([0-9]+), \[x0, 32\]
> +**	stp	q\1, q\2, \[x1\]
> +**	stp	q\3, q\4, \[x1, 32\]
> +**	...
> +*/
> +
> +void move64 (char *a, char *b)
> +{
> +    char temp[64];
> +    __builtin_memcpy (temp, a, 64);
> +    __builtin_memcpy (b, temp, 64);
> +}

Patch

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index d17198b4a5f73f8be8aeca3258b81809ffb48eac..2194441b949a53f181fe373e07bc18341c014918 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26376,33 +26376,6 @@  aarch64_move_pointer (rtx pointer, poly_int64 amount)
 				    next, amount);
 }
 
-typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
-
-/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
-static void
-aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
-			int offset, machine_mode mode)
-{
-  /* Emit explict load/store pair instructions for 32-byte copies.  */
-  if (known_eq (GET_MODE_SIZE (mode), 32))
-    {
-      mode = V4SImode;
-      rtx src1 = adjust_address (src, mode, offset);
-      rtx dst1 = adjust_address (dst, mode, offset);
-      rtx reg1 = gen_reg_rtx (mode);
-      rtx reg2 = gen_reg_rtx (mode);
-      rtx load = aarch64_gen_load_pair (reg1, reg2, src1);
-      rtx store = aarch64_gen_store_pair (dst1, reg1, reg2);
-      ops.safe_push ({ load, store });
-      return;
-    }
-
-  rtx reg = gen_reg_rtx (mode);
-  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
-  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
-  ops.safe_push ({ load, store });
-}
-
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
    from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
    rather than memcpy.  Return true iff we succeeded.  */
@@ -26438,7 +26411,7 @@  aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   rtx src = operands[1];
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
+  machine_mode mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align copies may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
@@ -26465,7 +26438,7 @@  aarch64_expand_cpymem (rtx *operands, bool is_memmove)
      ??? Although it would be possible to use LDP/STP Qn in streaming mode
      (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
      whether that would improve performance.  */
-  unsigned copy_max = (size <= 24 || !TARGET_SIMD) ? 16 : 32;
+  bool use_qregs = size > 24 && TARGET_SIMD;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -26473,7 +26446,7 @@  aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  copy_ops ops;
+  auto_vec<std::pair<rtx, rtx>, 16> ops;
   int offset = 0;
 
   while (size > 0)
@@ -26482,23 +26455,27 @@  aarch64_expand_cpymem (rtx *operands, bool is_memmove)
 	 or writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
-	  cur_mode = mode_iter.require ();
+	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
+	  mode = mode_iter.require ();
+
+      gcc_assert (mode != BLKmode);
 
-      gcc_assert (cur_mode != BLKmode);
+      mode_bytes = GET_MODE_SIZE (mode).to_constant ();
 
-      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+      /* Prefer Q-register accesses.  */
+      if (mode_bytes == 16 && use_qregs)
+	mode = V4SImode;
 
-      /* Prefer Q-register accesses for the last bytes.  */
-      if (mode_bytes == 16 && copy_max == 32)
-	cur_mode = V4SImode;
-      aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
+      rtx reg = gen_reg_rtx (mode);
+      rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
+      rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
+      ops.safe_push ({ load, store });
       size -= mode_bytes;
       offset += mode_bytes;
 
       /* Emit trailing copies using overlapping unaligned accesses
 	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
+      if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
 	{
 	  next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
 	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
@@ -26510,7 +26487,7 @@  aarch64_expand_cpymem (rtx *operands, bool is_memmove)
 
   /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
   int nops = ops.length();
-  int inc = is_memmove ? nops : nops == 4 ? 2 : 3;
+  int inc = is_memmove || nops <= 8 ? nops : 6;
 
   for (int i = 0; i < nops; i += inc)
     {
@@ -26525,23 +26502,6 @@  aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }
 
-/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
-static void
-aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
-{
-  /* Emit explict store pair instructions for 32-byte writes.  */
-  if (known_eq (GET_MODE_SIZE (mode), 32))
-    {
-      mode = V16QImode;
-      rtx dst1 = adjust_address (dst, mode, offset);
-      emit_insn (aarch64_gen_store_pair (dst1, src, src));
-      return;
-    }
-  if (known_lt (GET_MODE_SIZE (mode), 16))
-    src = lowpart_subreg (mode, src, GET_MODE (src));
-  emit_move_insn (adjust_address (dst, mode, offset), src);
-}
-
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
    as for the setmem pattern.  Return true iff we succeed.  */
 static bool
@@ -26574,7 +26534,7 @@  aarch64_expand_setmem (rtx *operands)
   rtx val = operands[2], src;
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
+  machine_mode mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align memset may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
@@ -26595,11 +26555,8 @@  aarch64_expand_setmem (rtx *operands)
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
 
   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
-  src = expand_vector_broadcast (V16QImode, val);
-  src = force_reg (V16QImode, src);
-
-  /* Set maximum number of bytes to write per instruction.  */
-  unsigned set_max = (len <= 24) ? 16 : 32;
+  val = expand_vector_broadcast (V16QImode, val);
+  val = force_reg (V16QImode, val);
 
   int offset = 0;
   while (len > 0)
@@ -26608,24 +26565,28 @@  aarch64_expand_setmem (rtx *operands)
 	 over writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
-	  cur_mode = mode_iter.require ();
+	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, 16))
+	  mode = mode_iter.require ();
+
+      gcc_assert (mode != BLKmode);
 
-      gcc_assert (cur_mode != BLKmode);
+      mode_bytes = GET_MODE_SIZE (mode).to_constant ();
 
-      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+      src = val;
 
-      /* Prefer Q-register accesses for the last bytes.  */
+      /* Prefer Q-register accesses.  */
       if (mode_bytes == 16)
-	cur_mode = V16QImode;
+	mode = V16QImode;
+      else
+	src = lowpart_subreg (mode, src, GET_MODE (val));
 
-      aarch64_set_one_block (src, dst, offset, cur_mode);
+      emit_move_insn (adjust_address (dst, mode, offset), src);
       len -= mode_bytes;
       offset += mode_bytes;
 
       /* Emit trailing writes using overlapping unaligned accesses
 	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
+      if (len > 0 && len < 16 && !STRICT_ALIGNMENT)
 	{
 	  next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
 	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
diff --git a/gcc/testsuite/gcc.target/aarch64/pr113618.c b/gcc/testsuite/gcc.target/aarch64/pr113618.c
new file mode 100644
index 0000000000000000000000000000000000000000..f582360e7c1d149ad94c78c3e66f1c4a973750bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr113618.c
@@ -0,0 +1,36 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+
+/*
+** move32:
+**	...
+**	ldp	q([0-9]+), q([0-9]+), \[x0\]
+**	stp	q\1, q\2, \[x1\]
+**	...
+*/
+
+void move32 (char *a, char *b)
+{
+    char temp[32];
+    __builtin_memcpy (temp, a, 32);
+    __builtin_memcpy (b, temp, 32);
+}
+
+/*
+** move64:
+**	...
+**	ldp	q([0-9]+), q([0-9]+), \[x0\]
+**	ldp	q([0-9]+), q([0-9]+), \[x0, 32\]
+**	stp	q\1, q\2, \[x1\]
+**	stp	q\3, q\4, \[x1, 32\]
+**	...
+*/
+
+void move64 (char *a, char *b)
+{
+    char temp[64];
+    __builtin_memcpy (temp, a, 64);
+    __builtin_memcpy (b, temp, 64);
+}