
AArch64: Fix MOPS memmove operand corruption [PR111121]

Message ID PAWPR08MB8982AAADB78E0FBB98E28147831CA@PAWPR08MB8982.eurprd08.prod.outlook.com

Commit Message

Wilco Dijkstra Aug. 23, 2023, 2:39 p.m. UTC
A MOPS memmove may corrupt registers since there is no copy of the input operands to temporary
registers.  Fix this by calling aarch64_expand_cpymem which does this.  Also fix an issue with
STRICT_ALIGNMENT being ignored if TARGET_MOPS is true, and avoid crashing or generating a huge
expansion if aarch64_mops_memcpy_size_threshold is large.
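
For illustration, a reduced example of the kind the new tests cover (a hypothetical
reproducer, mirroring the new move1 test): the MOPS cpyp/cpym/cpye sequence updates the
destination, source and size registers as it runs, so without fresh copies a value that
is still live after the memmove may be clobbered.

  void
  move (int *x, int *y, long n, int **res)
  {
    __builtin_memmove (x, y, n);  /* cpyp/cpym/cpye update the address and size regs.  */
    *res = x;                     /* Without a temporary copy, x may be corrupted here.  */
  }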

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
        PR target/111121
        * config/aarch64/aarch64.md (cpymemdi): Remove STRICT_ALIGNMENT, add param for memmove.
        (aarch64_movmemdi): Add new expander similar to aarch64_cpymemdi.
        (movmemdi): Like cpymemdi, call aarch64_expand_cpymem for correct expansion.
        * config/aarch64/aarch64.cc (aarch64_expand_cpymem_mops): Add support for memmove.
        (aarch64_expand_cpymem): Add support for memmove. Handle STRICT_ALIGNMENT correctly.
        Handle TARGET_MOPS size selection correctly.
        * config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Update prototype.

gcc/testsuite/ChangeLog/
        PR target/111121
        * gcc.target/aarch64/mops_4.c: Add memmove testcases.

---

Comments

Richard Sandiford Aug. 23, 2023, 3:03 p.m. UTC | #1
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> A MOPS memmove may corrupt registers since there is no copy of the input operands to temporary
> registers.  Fix this by calling aarch64_expand_cpymem which does this.  Also fix an issue with
> STRICT_ALIGNMENT being ignored if TARGET_MOPS is true, and avoid crashing or generating a huge
> expansion if aarch64_mops_memcpy_size_threshold is large.
>
> Passes regress/bootstrap, OK for commit?
>
> gcc/ChangeLog/
>         PR target/111121
>         * config/aarch64/aarch64.md (cpymemdi): Remove STRICT_ALIGNMENT, add param for memmove.
>         (aarch64_movmemdi): Add new expander similar to aarch64_cpymemdi.
>         (movmemdi): Like cpymemdi call aarch64_expand_cpymem for correct expansion.
>         * config/aarch64/aarch64.cc (aarch64_expand_cpymem_mops): Add support for memmove.
>         (aarch64_expand_cpymem): Add support for memmove. Handle STRICT_ALIGNMENT correctly.
>         Handle TARGET_MOPS size selection correctly.
>         * config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Update prototype.
>
> gcc/testsuite/ChangeLog/
>         PR target/111121
>         * gcc.target/aarch64/mops_4.c: Add memmove testcases.
>
> ---
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 70303d6fd953e0c397b9138ede8858c2db2e53db..97375e81cbda078847af83bf5dd4e0d7673d6af4 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -765,7 +765,7 @@ bool aarch64_emit_approx_div (rtx, rtx, rtx);
>  bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
>  tree aarch64_vector_load_decl (tree);
>  void aarch64_expand_call (rtx, rtx, rtx, bool);
> -bool aarch64_expand_cpymem (rtx *);
> +bool aarch64_expand_cpymem (rtx *, bool);
>  bool aarch64_expand_setmem (rtx *);
>  bool aarch64_float_const_zero_rtx_p (rtx);
>  bool aarch64_float_const_rtx_p (rtx);
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index eba5d4a7e04b7af82437453a691d5607d98133c9..5e8d0a0c91bc7719de2a8c5627b354cf905a4db0 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25135,10 +25135,11 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
>    *dst = aarch64_progress_pointer (*dst);
>  }
>
> -/* Expand a cpymem using the MOPS extension.  OPERANDS are taken
> -   from the cpymem pattern.  Return true iff we succeeded.  */
> +/* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
> +   from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
> +   rather than memcpy.  Return true iff we succeeded.  */
>  static bool
> -aarch64_expand_cpymem_mops (rtx *operands)
> +aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
>  {
>    if (!TARGET_MOPS)
>      return false;
> @@ -25150,17 +25151,19 @@ aarch64_expand_cpymem_mops (rtx *operands)
>    rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
>    rtx src_mem = replace_equiv_address (operands[1], src_addr);
>    rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
> -  emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
> -
> +  if (is_memmove)
> +    emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
> +  else
> +    emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
>    return true;
>  }
>
> -/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
> -   we succeed, otherwise return false, indicating that a libcall to
> -   memcpy should be emitted.  */
> -
> +/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
> +   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
> +   if this is a memmove rather than memcpy.  Return true if we succeed,
> +   otherwise return false, indicating that a libcall should be emitted.  */
>  bool
> -aarch64_expand_cpymem (rtx *operands)
> +aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>  {
>    int mode_bits;
>    rtx dst = operands[0];
> @@ -25168,25 +25171,23 @@ aarch64_expand_cpymem (rtx *operands)
>    rtx base;
>    machine_mode cur_mode = BLKmode;
>
> -  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
> -  if (!CONST_INT_P (operands[2]))
> -    return aarch64_expand_cpymem_mops (operands);
> +  /* Variable-sized or strict align copies may use the MOPS expansion.  */
> +  if (!CONST_INT_P (operands[2]) || STRICT_ALIGNMENT)
> +    return aarch64_expand_cpymem_mops (operands, is_memmove);
>
>    unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
>
> -  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
> -  unsigned HOST_WIDE_INT max_copy_size
> -    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
> +  /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
> +  unsigned HOST_WIDE_INT max_copy_size = is_memmove ? 0 : 256;
> +  unsigned HOST_WIDE_INT max_mops_size = max_copy_size;
>
> -  bool size_p = optimize_function_for_size_p (cfun);
> +  if (TARGET_MOPS)
> +    max_mops_size = is_memmove ? aarch64_mops_memmove_size_threshold
> +                              : aarch64_mops_memcpy_size_threshold;
>
> -  /* Large constant-sized cpymem should go through MOPS when possible.
> -     It should be a win even for size optimization in the general case.
> -     For speed optimization the choice between MOPS and the SIMD sequence
> -     depends on the size of the copy, rather than number of instructions,
> -     alignment etc.  */
> -  if (size > max_copy_size)
> -    return aarch64_expand_cpymem_mops (operands);
> +  /* Large copies use library call or MOPS when available.  */
> +  if (size > max_copy_size || size > max_mops_size)
> +    return aarch64_expand_cpymem_mops (operands, is_memmove);

Could you explain this a bit more?  If I've followed the logic correctly,
max_copy_size will always be 0 for movmem, so this "if" condition will
always be true for movmem (given that the caller can be relied on to
optimise away zero-length copies).  So doesn't this function reduce to:

  aarch64_expand_cpymem_mops (operands, true);

when is_memmove is true?  If so, I think it would be clearer to do that
directly, rather than go through aarch64_expand_cpymem.  max_copy_size
is really an optimisation threshold, whereas the above seems to be
leaning on it for correctness.
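
A minimal sketch of that direct form, just to illustrate what I mean (not
necessarily the final shape): the movmemdi expander body would reduce to

  if (aarch64_expand_cpymem_mops (operands, true))
    DONE;
  FAIL;

with the operand copying staying in aarch64_expand_cpymem_mops.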

If we make that change, then...

>    int copy_bits = 256;
>
> @@ -25254,6 +25255,8 @@ aarch64_expand_cpymem (rtx *operands)
>       the constant size into a register.  */
>    unsigned mops_cost = 3 + 1;
>
> +  bool size_p = optimize_function_for_size_p (cfun);
> +
>    /* If MOPS is available at this point we don't consider the libcall as it's
>       not a win even on code size.  At this point only consider MOPS if
>       optimizing for size.  For speed optimizations we will have chosen between
> @@ -25261,7 +25264,7 @@ aarch64_expand_cpymem (rtx *operands)
>    if (TARGET_MOPS)
>      {
>        if (size_p && mops_cost < nops)
> -       return aarch64_expand_cpymem_mops (operands);
> +       return aarch64_expand_cpymem_mops (operands, is_memmove);
>        emit_insn (seq);
>        return true;
>      }
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 01cf989641fce8e6c3828f6cfef62e101c4142df..97f70d39cc0ddeb330e044bae0544d85a695567d 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1609,15 +1609,30 @@ (define_expand "cpymemdi"
>     (match_operand:BLK 1 "memory_operand")
>     (match_operand:DI 2 "general_operand")
>     (match_operand:DI 3 "immediate_operand")]
> -   "!STRICT_ALIGNMENT || TARGET_MOPS"
> +   ""
>  {
> -  if (aarch64_expand_cpymem (operands))
> +  if (aarch64_expand_cpymem (operands, false))
>      DONE;
>    FAIL;
>  }
>  )
>
> -(define_insn "aarch64_movmemdi"
> +(define_expand "aarch64_movmemdi"
> +  [(parallel
> +     [(set (match_operand 2) (const_int 0))
> +      (clobber (match_dup 3))
> +      (clobber (match_dup 4))
> +      (clobber (reg:CC CC_REGNUM))
> +      (set (match_operand 0)
> +          (unspec:BLK [(match_operand 1) (match_dup 2)] UNSPEC_MOVMEM))])]
> +  "TARGET_MOPS"
> +  {
> +    operands[3] = XEXP (operands[0], 0);
> +    operands[4] = XEXP (operands[1], 0);
> +  }
> +)
> +
> +(define_insn "*aarch64_movmemdi"
>    [(parallel [
>     (set (match_operand:DI 2 "register_operand" "+&r") (const_int 0))
>     (clobber (match_operand:DI 0 "register_operand" "+&r"))
> @@ -1640,27 +1655,11 @@ (define_expand "movmemdi"
>     (match_operand:BLK 1 "memory_operand")
>     (match_operand:DI 2 "general_operand")
>     (match_operand:DI 3 "immediate_operand")]
> -   "TARGET_MOPS"
> +   ""
>  {
> -   rtx sz_reg = operands[2];
> -   /* For constant-sized memmoves check the threshold.
> -      FIXME: We should add a non-MOPS memmove expansion for smaller,
> -      constant-sized memmove to avoid going to a libcall.  */
> -   if (CONST_INT_P (sz_reg)
> -       && INTVAL (sz_reg) < aarch64_mops_memmove_size_threshold)
> -     FAIL;
> -
> -   rtx addr_dst = XEXP (operands[0], 0);
> -   rtx addr_src = XEXP (operands[1], 0);
> -
> -   if (!REG_P (sz_reg))
> -     sz_reg = force_reg (DImode, sz_reg);
> -   if (!REG_P (addr_dst))
> -     addr_dst = force_reg (DImode, addr_dst);
> -   if (!REG_P (addr_src))
> -     addr_src = force_reg (DImode, addr_src);
> -   emit_insn (gen_aarch64_movmemdi (addr_dst, addr_src, sz_reg));
> -   DONE;
> +  if (aarch64_expand_cpymem (operands, true))
> +    DONE;
> +  FAIL;
>  }

...I think we might as well keep this pattern conditional on TARGET_MOPS.

I think we can then also split:

  /* All three registers are changed by the instruction, so each one
     must be a fresh pseudo.  */
  rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
  rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
  rtx src_mem = replace_equiv_address (operands[1], src_addr);
  rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);

out of aarch64_expand_cpymem_mops into a new function (say
aarch64_prepare_mops_operands) and call it from the movmemdi
expander.  There should then be no need for the extra staging
expander (aarch64_movmemdi).
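
As a rough sketch of what I mean (the name and exact interface are just
illustrative):

  /* The MOPS instructions change all three registers, so force the addresses
     and the size into fresh pseudos and rewrite the MEMs to use them.  */
  void
  aarch64_prepare_mops_operands (rtx *operands)
  {
    rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
    rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
    operands[0] = replace_equiv_address (operands[0], dst_addr);
    operands[1] = replace_equiv_address (operands[1], src_addr);
    operands[2] = copy_to_mode_reg (DImode, operands[2]);
  }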

IMO the STRICT_ALIGNMENT stuff should be a separate patch,
with its own testcases.

Thanks,
Richard


>  )
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/mops_4.c b/gcc/testsuite/gcc.target/aarch64/mops_4.c
> index 1b87759cb5e8bbcbb58cf63404d1d579d44b2818..dd796115cb4093251964d881e93bf4b98ade0c32 100644
> --- a/gcc/testsuite/gcc.target/aarch64/mops_4.c
> +++ b/gcc/testsuite/gcc.target/aarch64/mops_4.c
> @@ -50,6 +50,54 @@ copy3 (int *x, int *y, long z, long *res)
>    *res = z;
>  }
>
> +/*
> +** move1:
> +**     mov     (x[0-9]+), x0
> +**     cpyp    \[\1\]!, \[x1\]!, x2!
> +**     cpym    \[\1\]!, \[x1\]!, x2!
> +**     cpye    \[\1\]!, \[x1\]!, x2!
> +**     str     x0, \[x3\]
> +**     ret
> +*/
> +void
> +move1 (int *x, int *y, long z, int **res)
> +{
> +  __builtin_memmove (x, y, z);
> +  *res = x;
> +}
> +
> +/*
> +** move2:
> +**     mov     (x[0-9]+), x1
> +**     cpyp    \[x0\]!, \[\1\]!, x2!
> +**     cpym    \[x0\]!, \[\1\]!, x2!
> +**     cpye    \[x0\]!, \[\1\]!, x2!
> +**     str     x1, \[x3\]
> +**     ret
> +*/
> +void
> +move2 (int *x, int *y, long z, int **res)
> +{
> +  __builtin_memmove (x, y, z);
> +  *res = y;
> +}
> +
> +/*
> +** move3:
> +**     mov     (x[0-9]+), x2
> +**     cpyp    \[x0\]!, \[x1\]!, \1!
> +**     cpym    \[x0\]!, \[x1\]!, \1!
> +**     cpye    \[x0\]!, \[x1\]!, \1!
> +**     str     x2, \[x3\]
> +**     ret
> +*/
> +void
> +move3 (int *x, int *y, long z, long *res)
> +{
> +  __builtin_memmove (x, y, z);
> +  *res = z;
> +}
> +
>  /*
>  ** set1:
>  **     mov     (x[0-9]+), x0
Wilco Dijkstra Aug. 23, 2023, 5:19 p.m. UTC | #2
Hi Richard,

(that's quick!)

> +  if (size > max_copy_size || size > max_mops_size)
> +    return aarch64_expand_cpymem_mops (operands, is_memmove);
>
> Could you explain this a bit more?  If I've followed the logic correctly,
> max_copy_size will always be 0 for movmem, so this "if" condition will
> always be true for movmem (given that the caller can be relied on to
> optimise away zero-length copies).  So doesn't this function reduce to:

In this patch it is zero, yes, but there is no real reason for that.  The goal is to
share as much code as possible. I have a patch that inlines memmove like
memcpy.

> when is_memmove is true?  If so, I think it would be clearer to do that
> directly, rather than go through aarch64_expand_cpymem.  max_copy_size
> is really an optimisation threshold, whereas the above seems to be
> leaning on it for correctness.

In principle we could, for the time being, add an assert (!is_memmove) if that
makes it clearer that memmove isn't yet handled.

> ...I think we might as well keep this pattern conditional on TARGET_MOPS.

But then we have inconsistencies in the conditions of the expanders, which
is what led to all these bugs in the first place (I lost count, there are 4 or 5
different bugs I fixed). Ensuring everything is 100% identical between
memcpy and memmove makes the code much easier to follow.

> I think we can then also split:
>
>   /* All three registers are changed by the instruction, so each one
>      must be a fresh pseudo.  */
>   rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
>   rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
>   rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
>   rtx src_mem = replace_equiv_address (operands[1], src_addr);
>   rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
>
> out of aarch64_expand_cpymem_mops into a new function (say
> aarch64_prepare_mops_operands) and call it from the movmemdi
> expander.  There should then be no need for the extra staging
> expander (aarch64_movmemdi).

So you're saying we could remove aarch64_cpymemdi/movmemdi if
aarch64_expand_cpymem_mops did massage the operands in the
right way so that we can immediately match the underlying instruction?

Hmm, does that actually work, as in we don't lose the extra alias info that
gets lost in the current memmove expander? (another bug/inconsistency)

And the MOPS code would be separated from aarch64_expand_cpymem,
so we'd do all the MOPS size tests inside aarch64_expand_cpymem_mops,
and the expander would try MOPS first and, if that fails, try inline expansion?

So something like:

(define_expand "movmemdi"
....
  if (aarch64_try_mops_expansion (operands, is_memmove))
    DONE;
  if (aarch64_try_inline_copy_expansion (operands, is_memmove))
    DONE;
  FAIL;
)

> IMO the STRICT_ALIGNMENT stuff should be a separate patch,
> with its own testcases.

We will need backports to fix all these bugs, so the question is whether it
is worth doing a lot of cleanups now?

Cheers,
Wilco
Richard Sandiford Aug. 23, 2023, 6:15 p.m. UTC | #3
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
> (that's quick!)
>
>> +  if (size > max_copy_size || size > max_mops_size)
>> +    return aarch64_expand_cpymem_mops (operands, is_memmove);
>>
>> Could you explain this a bit more?  If I've followed the logic correctly,
>> max_copy_size will always be 0 for movmem, so this "if" condition will
>> always be true for movmem (given that the caller can be relied on to
>> optimise away zero-length copies).  So doesn't this function reduce to:
>
> In this patch it is zero yes, but there is no real reason for that. The goal is to
> share as much code as possible. I have a patch that inlines memmove like
> memcpy.

But I think this part of the patch belongs in that future series.
The current patch should just concentrate on fixing the bug.

It's difficult to evaluate the change at the moment, without the follow-on
change that it's preparing for.  I don't think it stands as an independent
improvement in its own right.

>> when is_memmove is true?  If so, I think it would be clearer to do that
>> directly, rather than go through aarch64_expand_cpymem.  max_copy_size
>> is really an optimisation threshold, whereas the above seems to be
>> leaning on it for correctness.
>
> In principle we could for the time being add a assert (!is_memmove) if that
> makes it clearer memmove isn't yet handled.

I think for this patch movmemdi should just call aarch64_expand_cpymem_mops
directly.  Let's leave the aarch64_expand_cpymem changes to other patches.

>> ...I think we might as well keep this pattern conditional on TARGET_MOPS.
>
> But then we have inconsistencies in the conditions of the expanders, which
> is what led to all these bugs in the first place (I lost count, there are 4 or 5
> different bugs I fixed). Ensuring everything is 100% identical between
> memcpy and memmove makes the code much easier to follow.

I think that too should be part of your follow-on changes to do inline
movmem expansions without TARGET_MOPS.  While all supported movmemdis
require TARGET_MOPS, I think the expander should too.

>> I think we can then also split:
>>
>>   /* All three registers are changed by the instruction, so each one
>>      must be a fresh pseudo.  */
>>   rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
>>   rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
>>   rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
>>   rtx src_mem = replace_equiv_address (operands[1], src_addr);
>>   rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
>>
>> out of aarch64_expand_cpymem_mops into a new function (say
>> aarch64_prepare_mops_operands) and call it from the movmemdi
>> expander.  There should then be no need for the extra staging
>> expander (aarch64_movmemdi).
>
> So you're saying we could remove aarch64_cpymemdi/movmemdi if
> aarch64_expand_cpymem_mops did massage the operands in the
> right way so that we can immediately match the underlying instruction?

Yeah.  But I'd forgotten about the pesky fourth (alignment) operand
to movmemdi and cpymemdi, which we don't need for the mops patterns.
So I take that part back.  I agree it's clearer to have a separate
aarch64_movmemdi expander.

> Hmm, does that actually work, as in we don't lose the extra alias info that
> gets lost in the current memmove expander? (another bug/inconsistency)
>
> And the MOPS code would be separated from aarch64_expand_cpymem
> so we'd do all the MOPS size tests inside aarch64_expand_cpymem_mops
> and the expander tries using MOPS first and if it fails try inline expansion?
>
> So something like:
>
> (define_expand "movmemdi"
> ....
>   if (aarch64_try_mops_expansion (operands, is_memmove))
>     DONE;
>   if (aarch64_try_inline_copy_expansion (operands, is_memmove))
>     DONE;
>   FAIL;
> )
>
>> IMO the STRICT_ALIGNMENT stuff should be a separate patch,
>> with its own testcases.
>
> We will need backports to fix all these bugs, so the question is whether it
> is worth doing a lot of cleanups now?

But I think what I'm asking for is significantly simpler than the
original patch.  That should make it more backportable rather than less.

Thanks,
Richard

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 70303d6fd953e0c397b9138ede8858c2db2e53db..97375e81cbda078847af83bf5dd4e0d7673d6af4 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -765,7 +765,7 @@  bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 tree aarch64_vector_load_decl (tree);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
-bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_cpymem (rtx *, bool);
 bool aarch64_expand_setmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_float_const_rtx_p (rtx);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index eba5d4a7e04b7af82437453a691d5607d98133c9..5e8d0a0c91bc7719de2a8c5627b354cf905a4db0 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25135,10 +25135,11 @@  aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
   *dst = aarch64_progress_pointer (*dst);
 }
 
-/* Expand a cpymem using the MOPS extension.  OPERANDS are taken
-   from the cpymem pattern.  Return true iff we succeeded.  */
+/* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
+   from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
+   rather than memcpy.  Return true iff we succeeded.  */
 static bool
-aarch64_expand_cpymem_mops (rtx *operands)
+aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
 {
   if (!TARGET_MOPS)
     return false;
@@ -25150,17 +25151,19 @@  aarch64_expand_cpymem_mops (rtx *operands)
   rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
   rtx src_mem = replace_equiv_address (operands[1], src_addr);
   rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
-  emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
-
+  if (is_memmove)
+    emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
+  else
+    emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
   return true;
 }
 
-/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
-   we succeed, otherwise return false, indicating that a libcall to
-   memcpy should be emitted.  */
-
+/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
+   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
+   if this is a memmove rather than memcpy.  Return true if we succeed,
+   otherwise return false, indicating that a libcall should be emitted.  */
 bool
-aarch64_expand_cpymem (rtx *operands)
+aarch64_expand_cpymem (rtx *operands, bool is_memmove)
 {
   int mode_bits;
   rtx dst = operands[0];
@@ -25168,25 +25171,23 @@  aarch64_expand_cpymem (rtx *operands)
   rtx base;
   machine_mode cur_mode = BLKmode;
 
-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
-    return aarch64_expand_cpymem_mops (operands);
+  /* Variable-sized or strict align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || STRICT_ALIGNMENT)
+    return aarch64_expand_cpymem_mops (operands, is_memmove);
 
   unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
 
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
+  /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
+  unsigned HOST_WIDE_INT max_copy_size = is_memmove ? 0 : 256;
+  unsigned HOST_WIDE_INT max_mops_size = max_copy_size;
 
-  bool size_p = optimize_function_for_size_p (cfun);
+  if (TARGET_MOPS)
+    max_mops_size = is_memmove ? aarch64_mops_memmove_size_threshold
+			       : aarch64_mops_memcpy_size_threshold;
 
-  /* Large constant-sized cpymem should go through MOPS when possible.
-     It should be a win even for size optimization in the general case.
-     For speed optimization the choice between MOPS and the SIMD sequence
-     depends on the size of the copy, rather than number of instructions,
-     alignment etc.  */
-  if (size > max_copy_size)
-    return aarch64_expand_cpymem_mops (operands);
+  /* Large copies use library call or MOPS when available.  */
+  if (size > max_copy_size || size > max_mops_size)
+    return aarch64_expand_cpymem_mops (operands, is_memmove);
 
   int copy_bits = 256;
 
@@ -25254,6 +25255,8 @@  aarch64_expand_cpymem (rtx *operands)
      the constant size into a register.  */
   unsigned mops_cost = 3 + 1;
 
+  bool size_p = optimize_function_for_size_p (cfun);
+
   /* If MOPS is available at this point we don't consider the libcall as it's
      not a win even on code size.  At this point only consider MOPS if
      optimizing for size.  For speed optimizations we will have chosen between
@@ -25261,7 +25264,7 @@  aarch64_expand_cpymem (rtx *operands)
   if (TARGET_MOPS)
     {
       if (size_p && mops_cost < nops)
-	return aarch64_expand_cpymem_mops (operands);
+	return aarch64_expand_cpymem_mops (operands, is_memmove);
       emit_insn (seq);
       return true;
     }
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 01cf989641fce8e6c3828f6cfef62e101c4142df..97f70d39cc0ddeb330e044bae0544d85a695567d 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1609,15 +1609,30 @@  (define_expand "cpymemdi"
    (match_operand:BLK 1 "memory_operand")
    (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "!STRICT_ALIGNMENT || TARGET_MOPS"
+   ""
 {
-  if (aarch64_expand_cpymem (operands))
+  if (aarch64_expand_cpymem (operands, false))
     DONE;
   FAIL;
 }
 )
 
-(define_insn "aarch64_movmemdi"
+(define_expand "aarch64_movmemdi"
+  [(parallel
+     [(set (match_operand 2) (const_int 0))
+      (clobber (match_dup 3))
+      (clobber (match_dup 4))
+      (clobber (reg:CC CC_REGNUM))
+      (set (match_operand 0)
+	   (unspec:BLK [(match_operand 1) (match_dup 2)] UNSPEC_MOVMEM))])]
+  "TARGET_MOPS"
+  {
+    operands[3] = XEXP (operands[0], 0);
+    operands[4] = XEXP (operands[1], 0);
+  }
+)
+
+(define_insn "*aarch64_movmemdi"
   [(parallel [
    (set (match_operand:DI 2 "register_operand" "+&r") (const_int 0))
    (clobber (match_operand:DI 0 "register_operand" "+&r"))
@@ -1640,27 +1655,11 @@  (define_expand "movmemdi"
    (match_operand:BLK 1 "memory_operand")
    (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "TARGET_MOPS"
+   ""
 {
-   rtx sz_reg = operands[2];
-   /* For constant-sized memmoves check the threshold.
-      FIXME: We should add a non-MOPS memmove expansion for smaller,
-      constant-sized memmove to avoid going to a libcall.  */
-   if (CONST_INT_P (sz_reg)
-       && INTVAL (sz_reg) < aarch64_mops_memmove_size_threshold)
-     FAIL;
-
-   rtx addr_dst = XEXP (operands[0], 0);
-   rtx addr_src = XEXP (operands[1], 0);
-
-   if (!REG_P (sz_reg))
-     sz_reg = force_reg (DImode, sz_reg);
-   if (!REG_P (addr_dst))
-     addr_dst = force_reg (DImode, addr_dst);
-   if (!REG_P (addr_src))
-     addr_src = force_reg (DImode, addr_src);
-   emit_insn (gen_aarch64_movmemdi (addr_dst, addr_src, sz_reg));
-   DONE;
+  if (aarch64_expand_cpymem (operands, true))
+    DONE;
+  FAIL;
 }
 )
 
diff --git a/gcc/testsuite/gcc.target/aarch64/mops_4.c b/gcc/testsuite/gcc.target/aarch64/mops_4.c
index 1b87759cb5e8bbcbb58cf63404d1d579d44b2818..dd796115cb4093251964d881e93bf4b98ade0c32 100644
--- a/gcc/testsuite/gcc.target/aarch64/mops_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/mops_4.c
@@ -50,6 +50,54 @@  copy3 (int *x, int *y, long z, long *res)
   *res = z;
 }
 
+/*
+** move1:
+**	mov	(x[0-9]+), x0
+**	cpyp	\[\1\]!, \[x1\]!, x2!
+**	cpym	\[\1\]!, \[x1\]!, x2!
+**	cpye	\[\1\]!, \[x1\]!, x2!
+**	str	x0, \[x3\]
+**	ret
+*/
+void
+move1 (int *x, int *y, long z, int **res)
+{
+  __builtin_memmove (x, y, z);
+  *res = x;
+}
+
+/*
+** move2:
+**	mov	(x[0-9]+), x1
+**	cpyp	\[x0\]!, \[\1\]!, x2!
+**	cpym	\[x0\]!, \[\1\]!, x2!
+**	cpye	\[x0\]!, \[\1\]!, x2!
+**	str	x1, \[x3\]
+**	ret
+*/
+void
+move2 (int *x, int *y, long z, int **res)
+{
+  __builtin_memmove (x, y, z);
+  *res = y;
+}
+
+/*
+** move3:
+**	mov	(x[0-9]+), x2
+**	cpyp	\[x0\]!, \[x1\]!, \1!
+**	cpym	\[x0\]!, \[x1\]!, \1!
+**	cpye	\[x0\]!, \[x1\]!, \1!
+**	str	x2, \[x3\]
+**	ret
+*/
+void
+move3 (int *x, int *y, long z, long *res)
+{
+  __builtin_memmove (x, y, z);
+  *res = z;
+}
+
 /*
 ** set1:
 **	mov	(x[0-9]+), x0