
AArch64: Improve inline memcpy expansion

Message ID VE1PR08MB55991D851E29FFD70875F7B583EE0@VE1PR08MB5599.eurprd08.prod.outlook.com
State New
Series AArch64: Improve inline memcpy expansion

Commit Message

Wilco Dijkstra Nov. 5, 2020, 3:58 p.m. UTC
Improve the inline memcpy expansion.  Use integer load/store for copies <= 24 bytes
instead of SIMD.  Set the maximum copy to expand to 256 bytes by default; with -Os or
without Neon the limit is 128 bytes.  When using LDP/STP of Q-registers, also use
Q-register accesses for the unaligned tail, saving 2 instructions (e.g. all sizes up
to 48 bytes emit exactly 4 instructions).  Clean up code and comments.
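
As an illustration of the Q-register tail handling (a sketch, not part of the patch
itself): a fixed 41-byte copy now expands to an LDP/STP of Q-registers for the first
32 bytes plus one overlapping 16-byte Q-register load/store for the tail, four
instructions in total.  Compiling something like the function below with -O2 for a
Neon-enabled target shows this shape (the function name and size are illustrative):

void
copy41 (void *dst, const void *src)
{
  /* Expected shape of the expansion (illustrative): a 32-byte LDP/STP of
     Q-registers covering bytes 0-31, then an overlapping 16-byte Q-register
     load/store (ldur/stur) covering bytes 25-40.  */
  __builtin_memcpy (dst, src, 41);
}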

The code-size gain versus the GCC 10 expansion is 0.05% on SPECINT2017.

Passes bootstrap and regression testing.  OK for commit?

ChangeLog:
2020-11-03  Wilco Dijkstra  <wdijkstr@arm.com>

        * config/aarch64/aarch64.c (aarch64_expand_cpymem): Clean up code and
        comments, tweak expansion decisions, and improve tail expansion.

---

Comments

Richard Sandiford Nov. 11, 2020, 6:09 p.m. UTC | #1
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Improve the inline memcpy expansion.  Use integer load/store for copies <= 24 bytes
> instead of SIMD.  Set the maximum copy to expand to 256 bytes by default; with -Os or
> without Neon the limit is 128 bytes.  When using LDP/STP of Q-registers, also use
> Q-register accesses for the unaligned tail, saving 2 instructions (e.g. all sizes up
> to 48 bytes emit exactly 4 instructions).  Clean up code and comments.
>
> The code-size gain versus the GCC 10 expansion is 0.05% on SPECINT2017.

Nice.  A couple of small comments inline…

> Passes bootstrap and regression testing.  OK for commit?
>
> ChangeLog:
> 2020-11-03  Wilco Dijkstra  <wdijkstr@arm.com>
>
>         * config/aarch64/aarch64.c (aarch64_expand_cpymem): Clean up code and
>         comments, tweak expansion decisions, and improve tail expansion.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 41e2a699108146e0fa7464743607bd34e91ea9eb..9487c1cb07b0d851c0f085262179470d0d596116 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -21255,35 +21255,36 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
>  bool
>  aarch64_expand_cpymem (rtx *operands)
>  {
> -  /* These need to be signed as we need to perform arithmetic on n as
> -     signed operations.  */
> -  int n, mode_bits;
> +  int mode_bits;
>    rtx dst = operands[0];
>    rtx src = operands[1];
>    rtx base;
> -  machine_mode cur_mode = BLKmode, next_mode;
> -  bool speed_p = !optimize_function_for_size_p (cfun);
> +  machine_mode cur_mode = BLKmode;
>
> -  /* When optimizing for size, give a better estimate of the length of a
> -     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
> -     will always require an even number of instructions to do now.  And each
> -     operation requires both a load+store, so divide the max number by 2.  */
> -  unsigned int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
> -
> -  /* We can't do anything smart if the amount to copy is not constant.  */
> +  /* Only expand fixed-size copies.  */
>    if (!CONST_INT_P (operands[2]))
>      return false;
>
> -  unsigned HOST_WIDE_INT tmp = INTVAL (operands[2]);
> +  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
>
> -  /* Try to keep the number of instructions low.  For all cases we will do at
> -     most two moves for the residual amount, since we'll always overlap the
> -     remainder.  */
> -  if (((tmp / 16) + (tmp % 16 ? 2 : 0)) > max_num_moves)
> +  /* Inline up to 256 bytes when optimizing for speed.  */
> +  unsigned HOST_WIDE_INT max_copy_size = 256;
> +
> +  if (optimize_function_for_size_p (cfun) || !TARGET_SIMD)
> +    max_copy_size = 128;
> +
> +  if (size > max_copy_size)
>      return false;
>
> -  /* At this point tmp is known to have to fit inside an int.  */
> -  n = tmp;
> +  int copy_bits = 256;
> +
> +  /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
> +     support or slow 256-bit LDP/STP fall back to 128-bit chunks.  */
> +  if (size <= 24 || !TARGET_SIMD

Nit: one condition per line when the condition spans multiple lines.

> +      || (size <= (max_copy_size / 2)
> +  && (aarch64_tune_params.extra_tuning_flags
> +      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)))
> +    copy_bits = GET_MODE_BITSIZE (TImode);

(Looks like the mailer has eaten some tabs here.)

As discussed in Sudi's setmem patch, I think we should make the
AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS conditional on optimising
for speed.  For size, using LDP Q and STP Q is a win regardless
of what the CPU wants.
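
One possible shape for that (a sketch only, reusing the names from the hunk above and
keeping one condition per line):

  if (size <= 24
      || !TARGET_SIMD
      || (!optimize_function_for_size_p (cfun)
          && size <= (max_copy_size / 2)
          && (aarch64_tune_params.extra_tuning_flags
              & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)))
    copy_bits = GET_MODE_BITSIZE (TImode);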

>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> @@ -21291,15 +21292,8 @@ aarch64_expand_cpymem (rtx *operands)
>    base = copy_to_mode_reg (Pmode, XEXP (src, 0));
>    src = adjust_automodify_address (src, VOIDmode, base, 0);
>
> -  /* Convert n to bits to make the rest of the code simpler.  */
> -  n = n * BITS_PER_UNIT;
> -
> -  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
> -     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and TARGET_SIMD.  */
> -  const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
> -   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> -  || !TARGET_SIMD)
> - ? GET_MODE_BITSIZE (TImode) : 256;
> +  /* Convert size to bits to make the rest of the code simpler.  */
> +  int n = size * BITS_PER_UNIT;
>
>    while (n > 0)
>      {
> @@ -21307,23 +21301,26 @@ aarch64_expand_cpymem (rtx *operands)
>   or writing.  */
>        opt_scalar_int_mode mode_iter;
>        FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> -if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
> +if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
>    cur_mode = mode_iter.require ();
>
>        gcc_assert (cur_mode != BLKmode);
>
>        mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> +
> +      /* Prefer Q-register accesses for the last bytes.  */
> +      if (mode_bits == 128 && copy_bits == 256)
> +cur_mode = V4SImode;
> +
>        aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
>
>        n -= mode_bits;
>
> -      /* Do certain trailing copies as overlapping if it's going to be
> - cheaper.  i.e. less instructions to do so.  For instance doing a 15
> - byte copy it's more efficient to do two overlapping 8 byte copies than
> - 8 + 6 + 1.  */
> -      if (n > 0 && n <= 8 * BITS_PER_UNIT)
> +      /* Emit trailing copies using overlapping unaligned accesses - this is
> + smaller and faster.  */
> +      if (n > 0 && n < copy_bits / 2)
>  {
> -  next_mode = smallest_mode_for_size (n, MODE_INT);
> +  machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
>    int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();

As I mentioned in the reply I've just sent to Sudi's patch,
I think it might help to add:

	  gcc_assert (n_bits <= mode_bits);

to show why this is guaranteed not to overrun the original copy.
(I agree that the code does guarantee no overrun.)
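
For instance (a worked sketch of the 15-byte case from the comment being removed):
size == 15 gives copy_bits == 128 and n == 120 bits.  The first iteration copies
8 bytes in DImode (mode_bits == 64), leaving n == 56.  The tail then picks DImode
again (n_bits == 64 <= mode_bits), the pointers move by (56 - 64) / 8 == -1 byte,
and the final 8-byte copy covers bytes 7-14, re-copying one byte rather than
running past the original 15 bytes.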

Thanks,
Richard

>    src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
>    dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
Wilco Dijkstra Nov. 16, 2020, 1:55 p.m. UTC | #2
Hi Richard,

>> +  if (size <= 24 || !TARGET_SIMD
>
> Nit: one condition per line when the condition spans multiple lines.

Fixed.

>> +      || (size <= (max_copy_size / 2)
>> +  && (aarch64_tune_params.extra_tuning_flags
>> +      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)))
>> +    copy_bits = GET_MODE_BITSIZE (TImode);
>
> (Looks like the mailer has eaten some tabs here.)

The email contained the correct tabs at the time I sent it.

> As discussed in Sudi's setmem patch, I think we should make the
> AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS conditional on optimising
> for speed.  For size, using LDP Q and STP Q is a win regardless
> of what the CPU wants.

I've changed the logic slightly based on benchmarking.  It's actually better to fall
back to calling memcpy for larger sizes in this case rather than emitting an inlined
Q-register memcpy.

>>    int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
>
> As I mentioned in the reply I've just sent to Sudi's patch,
> I think it might help to add:
>
>          gcc_assert (n_bits <= mode_bits);
>
> to show why this is guaranteed not to overrun the original copy.
> (I agree that the code does guarantee no overrun.)

I've added the same assert so the code remains similar.

Here is the updated version:


Improve the inline memcpy expansion.  Use integer load/store for copies <= 24 bytes
instead of SIMD.  Set the maximum copy to expand to 256 bytes by default; with -Os or
without Neon the limit is 128 bytes.  When using LDP/STP of Q-registers, also use
Q-register accesses for the unaligned tail, saving 2 instructions (e.g. all sizes up
to 48 bytes emit exactly 4 instructions).  Clean up code and comments.

The code-size gain versus the GCC 10 expansion is 0.05% on SPECINT2017.

Passes bootstrap and regression testing.  OK for commit?

ChangeLog:
2020-11-16  Wilco Dijkstra  <wdijkstr@arm.com>

        * config/aarch64/aarch64.c (aarch64_expand_cpymem): Clean up code and
        comments, tweak expansion decisions, and improve tail expansion.

---

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 41e2a699108146e0fa7464743607bd34e91ea9eb..4b2d5fa7d452dc53ff42308dd2781096ff8c95d2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -21255,35 +21255,39 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
 bool
 aarch64_expand_cpymem (rtx *operands)
 {
-  /* These need to be signed as we need to perform arithmetic on n as
-     signed operations.  */
-  int n, mode_bits;
+  int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
   rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
-  bool speed_p = !optimize_function_for_size_p (cfun);
-
-  /* When optimizing for size, give a better estimate of the length of a
-     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
-     will always require an even number of instructions to do now.  And each
-     operation requires both a load+store, so divide the max number by 2.  */
-  unsigned int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
+  machine_mode cur_mode = BLKmode;
 
-  /* We can't do anything smart if the amount to copy is not constant.  */
+  /* Only expand fixed-size copies.  */
   if (!CONST_INT_P (operands[2]))
     return false;
 
-  unsigned HOST_WIDE_INT tmp = INTVAL (operands[2]);
+  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
 
-  /* Try to keep the number of instructions low.  For all cases we will do at
-     most two moves for the residual amount, since we'll always overlap the
-     remainder.  */
-  if (((tmp / 16) + (tmp % 16 ? 2 : 0)) > max_num_moves)
-    return false;
+  /* Inline up to 256 bytes when optimizing for speed.  */
+  unsigned HOST_WIDE_INT max_copy_size = 256;
 
-  /* At this point tmp is known to have to fit inside an int.  */
-  n = tmp;
+  if (optimize_function_for_size_p (cfun))
+    max_copy_size = 128;
+
+  int copy_bits = 256;
+
+  /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
+     support or slow 256-bit LDP/STP fall back to 128-bit chunks.  */
+  if (size <= 24
+      || !TARGET_SIMD
+      || (aarch64_tune_params.extra_tuning_flags
+	  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+    {
+      copy_bits = 128;
+      max_copy_size = max_copy_size / 2;
+    }
+
+  if (size > max_copy_size)
+    return false;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -21291,15 +21295,8 @@ aarch64_expand_cpymem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  /* Convert n to bits to make the rest of the code simpler.  */
-  n = n * BITS_PER_UNIT;
-
-  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
-     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and TARGET_SIMD.  */
-  const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
-			   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
-			  || !TARGET_SIMD)
-			 ? GET_MODE_BITSIZE (TImode) : 256;
+  /* Convert size to bits to make the rest of the code simpler.  */
+  int n = size * BITS_PER_UNIT;
 
   while (n > 0)
     {
@@ -21307,24 +21304,28 @@ aarch64_expand_cpymem (rtx *operands)
 	 or writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
+	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
 	  cur_mode = mode_iter.require ();
 
       gcc_assert (cur_mode != BLKmode);
 
       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+
+      /* Prefer Q-register accesses for the last bytes.  */
+      if (mode_bits == 128 && copy_bits == 256)
+	cur_mode = V4SImode;
+
       aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
 
       n -= mode_bits;
 
-      /* Do certain trailing copies as overlapping if it's going to be
-	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
-	 byte copy it's more efficient to do two overlapping 8 byte copies than
-	 8 + 6 + 1.  */
-      if (n > 0 && n <= 8 * BITS_PER_UNIT)
+      /* Emit trailing copies using overlapping unaligned accesses - this is
+	 smaller and faster.  */
+      if (n > 0 && n < copy_bits / 2)
 	{
-	  next_mode = smallest_mode_for_size (n, MODE_INT);
+	  machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
 	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
+	  gcc_assert (n_bits <= mode_bits);
 	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
 	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
 	  n = n_bits;
Richard Sandiford Nov. 18, 2020, 2:59 p.m. UTC | #3
Wilco Dijkstra via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>>> +      || (size <= (max_copy_size / 2)
>>> +  && (aarch64_tune_params.extra_tuning_flags
>>> +      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)))
>>> +    copy_bits = GET_MODE_BITSIZE (TImode);
>>
>> (Looks like the mailer has eaten some tabs here.)
>
> The email contained the correct tabs at the time I sent it.

Yeah, sorry for the noise; looks like Exchange ate them at my end.
The version on gmane was ok.

>> As discussed in Sudi's setmem patch, I think we should make the
>> AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS conditional on optimising
>> for speed.  For size, using LDP Q and STP Q is a win regardless
>> of what the CPU wants.
>
> I've changed the logic slightly based on benchmarking.  It's actually better to fall
> back to calling memcpy for larger sizes in this case rather than emitting an inlined
> Q-register memcpy.

OK.

>>>    int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
>>
>> As I mentioned in the reply I've just sent to Sudi's patch,
>> I think it might help to add:
>>
>>          gcc_assert (n_bits <= mode_bits);
>>
>> to show why this is guaranteed not to overrun the original copy.
>> (I agree that the code does guarantee no overrun.)
>
> I've added the same assert so the code remains similar.
>
> Here is the updated version:
>
>
> Improve the inline memcpy expansion.  Use integer load/store for copies <= 24 bytes
> instead of SIMD.  Set the maximum copy to expand to 256 bytes by default; with -Os or
> without Neon the limit is 128 bytes.  When using LDP/STP of Q-registers, also use
> Q-register accesses for the unaligned tail, saving 2 instructions (e.g. all sizes up
> to 48 bytes emit exactly 4 instructions).  Clean up code and comments.
>
> The code-size gain versus the GCC 10 expansion is 0.05% on SPECINT2017.
>
> Passes bootstrap and regression testing.  OK for commit?

OK, thanks.

Richard

>
> ChangeLog:
> 2020-11-16  Wilco Dijkstra  <wdijkstr@arm.com>
>
>         * config/aarch64/aarch64.c (aarch64_expand_cpymem): Clean up code and
>         comments, tweak expansion decisions, and improve tail expansion.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 41e2a699108146e0fa7464743607bd34e91ea9eb..4b2d5fa7d452dc53ff42308dd2781096ff8c95d2 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -21255,35 +21255,39 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
>  bool
>  aarch64_expand_cpymem (rtx *operands)
>  {
> -  /* These need to be signed as we need to perform arithmetic on n as
> -     signed operations.  */
> -  int n, mode_bits;
> +  int mode_bits;
>    rtx dst = operands[0];
>    rtx src = operands[1];
>    rtx base;
> -  machine_mode cur_mode = BLKmode, next_mode;
> -  bool speed_p = !optimize_function_for_size_p (cfun);
> -
> -  /* When optimizing for size, give a better estimate of the length of a
> -     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
> -     will always require an even number of instructions to do now.  And each
> -     operation requires both a load+store, so divide the max number by 2.  */
> -  unsigned int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
> +  machine_mode cur_mode = BLKmode;
>  
> -  /* We can't do anything smart if the amount to copy is not constant.  */
> +  /* Only expand fixed-size copies.  */
>    if (!CONST_INT_P (operands[2]))
>      return false;
>  
> -  unsigned HOST_WIDE_INT tmp = INTVAL (operands[2]);
> +  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
>  
> -  /* Try to keep the number of instructions low.  For all cases we will do at
> -     most two moves for the residual amount, since we'll always overlap the
> -     remainder.  */
> -  if (((tmp / 16) + (tmp % 16 ? 2 : 0)) > max_num_moves)
> -    return false;
> +  /* Inline up to 256 bytes when optimizing for speed.  */
> +  unsigned HOST_WIDE_INT max_copy_size = 256;
>  
> -  /* At this point tmp is known to have to fit inside an int.  */
> -  n = tmp;
> +  if (optimize_function_for_size_p (cfun))
> +    max_copy_size = 128;
> +
> +  int copy_bits = 256;
> +
> +  /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
> +     support or slow 256-bit LDP/STP fall back to 128-bit chunks.  */
> +  if (size <= 24
> +      || !TARGET_SIMD
> +      || (aarch64_tune_params.extra_tuning_flags
> +	  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> +    {
> +      copy_bits = 128;
> +      max_copy_size = max_copy_size / 2;
> +    }
> +
> +  if (size > max_copy_size)
> +    return false;
>  
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> @@ -21291,15 +21295,8 @@ aarch64_expand_cpymem (rtx *operands)
>    base = copy_to_mode_reg (Pmode, XEXP (src, 0));
>    src = adjust_automodify_address (src, VOIDmode, base, 0);
>  
> -  /* Convert n to bits to make the rest of the code simpler.  */
> -  n = n * BITS_PER_UNIT;
> -
> -  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
> -     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and TARGET_SIMD.  */
> -  const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
> -			   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> -			  || !TARGET_SIMD)
> -			 ? GET_MODE_BITSIZE (TImode) : 256;
> +  /* Convert size to bits to make the rest of the code simpler.  */
> +  int n = size * BITS_PER_UNIT;
>  
>    while (n > 0)
>      {
> @@ -21307,24 +21304,28 @@ aarch64_expand_cpymem (rtx *operands)
>  	 or writing.  */
>        opt_scalar_int_mode mode_iter;
>        FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> -	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
> +	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
>  	  cur_mode = mode_iter.require ();
>  
>        gcc_assert (cur_mode != BLKmode);
>  
>        mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> +
> +      /* Prefer Q-register accesses for the last bytes.  */
> +      if (mode_bits == 128 && copy_bits == 256)
> +	cur_mode = V4SImode;
> +
>        aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
>  
>        n -= mode_bits;
>  
> -      /* Do certain trailing copies as overlapping if it's going to be
> -	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
> -	 byte copy it's more efficient to do two overlapping 8 byte copies than
> -	 8 + 6 + 1.  */
> -      if (n > 0 && n <= 8 * BITS_PER_UNIT)
> +      /* Emit trailing copies using overlapping unaligned accesses - this is
> +	 smaller and faster.  */
> +      if (n > 0 && n < copy_bits / 2)
>  	{
> -	  next_mode = smallest_mode_for_size (n, MODE_INT);
> +	  machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
>  	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> +	  gcc_assert (n_bits <= mode_bits);
>  	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
>  	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
>  	  n = n_bits;

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 41e2a699108146e0fa7464743607bd34e91ea9eb..9487c1cb07b0d851c0f085262179470d0d596116 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -21255,35 +21255,36 @@  aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
 bool
 aarch64_expand_cpymem (rtx *operands)
 {
-  /* These need to be signed as we need to perform arithmetic on n as
-     signed operations.  */
-  int n, mode_bits;
+  int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
   rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
-  bool speed_p = !optimize_function_for_size_p (cfun);
+  machine_mode cur_mode = BLKmode;
 
-  /* When optimizing for size, give a better estimate of the length of a
-     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
-     will always require an even number of instructions to do now.  And each
-     operation requires both a load+store, so divide the max number by 2.  */
-  unsigned int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
-
-  /* We can't do anything smart if the amount to copy is not constant.  */
+  /* Only expand fixed-size copies.  */
   if (!CONST_INT_P (operands[2]))
     return false;
 
-  unsigned HOST_WIDE_INT tmp = INTVAL (operands[2]);
+  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
 
-  /* Try to keep the number of instructions low.  For all cases we will do at
-     most two moves for the residual amount, since we'll always overlap the
-     remainder.  */
-  if (((tmp / 16) + (tmp % 16 ? 2 : 0)) > max_num_moves)
+  /* Inline up to 256 bytes when optimizing for speed.  */
+  unsigned HOST_WIDE_INT max_copy_size = 256;
+
+  if (optimize_function_for_size_p (cfun) || !TARGET_SIMD)
+    max_copy_size = 128;
+
+  if (size > max_copy_size)
     return false;
 
-  /* At this point tmp is known to have to fit inside an int.  */
-  n = tmp;
+  int copy_bits = 256;
+
+  /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
+     support or slow 256-bit LDP/STP fall back to 128-bit chunks.  */
+  if (size <= 24 || !TARGET_SIMD
+      || (size <= (max_copy_size / 2)
+	  && (aarch64_tune_params.extra_tuning_flags
+	      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)))
+    copy_bits = GET_MODE_BITSIZE (TImode);
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -21291,15 +21292,8 @@  aarch64_expand_cpymem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  /* Convert n to bits to make the rest of the code simpler.  */
-  n = n * BITS_PER_UNIT;
-
-  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
-     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and TARGET_SIMD.  */
-  const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
-			   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
-			  || !TARGET_SIMD)
-			 ? GET_MODE_BITSIZE (TImode) : 256;
+  /* Convert size to bits to make the rest of the code simpler.  */
+  int n = size * BITS_PER_UNIT;
 
   while (n > 0)
     {
@@ -21307,23 +21301,26 @@  aarch64_expand_cpymem (rtx *operands)
 	 or writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
+	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
 	  cur_mode = mode_iter.require ();
 
       gcc_assert (cur_mode != BLKmode);
 
       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+
+      /* Prefer Q-register accesses for the last bytes.  */
+      if (mode_bits == 128 && copy_bits == 256)
+	cur_mode = V4SImode;
+
       aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
 
       n -= mode_bits;
 
-      /* Do certain trailing copies as overlapping if it's going to be
-	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
-	 byte copy it's more efficient to do two overlapping 8 byte copies than
-	 8 + 6 + 1.  */
-      if (n > 0 && n <= 8 * BITS_PER_UNIT)
+      /* Emit trailing copies using overlapping unaligned accesses - this is
+	 smaller and faster.  */
+      if (n > 0 && n < copy_bits / 2)
 	{
-	  next_mode = smallest_mode_for_size (n, MODE_INT);
+	  machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
 	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
 	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
 	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);