
AArch64: Fix strict-align cpymem/setmem [PR103100]

Message ID PAWPR08MB8982A64BB093DE2368050A9583F9A@PAWPR08MB8982.eurprd08.prod.outlook.com

Commit Message

Wilco Dijkstra Sept. 20, 2023, 1:50 p.m. UTC
The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
Clean up the condition for when to use MOPS.
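
For reference, the kind of copy affected looks something like this (an
illustrative sketch, not taken from the PR or the testsuite).  The pointers
are only known to be byte-aligned, so under -mstrict-align the copy may not
be expanded with wider-than-aligned accesses and is instead routed to the
MOPS expansion or a library call:

typedef struct { char bytes[16]; } blob;   /* alignment 1 */

void
copy_blob (blob *dst, const blob *src)
{
  /* size == 16, align == 1: with -mstrict-align this becomes a MOPS
     sequence (when available) or a call to memcpy rather than an
     inline sequence of loads and stores.  */
  __builtin_memcpy (dst, src, sizeof (blob));
}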
    
Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog/
        PR target/103100
        * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
        (setmemdi): Likewise.
        * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
        strict-align.  Cleanup condition for using MOPS.
        (aarch64_expand_setmem): Likewise.

---

Comments

Richard Earnshaw (lists) Sept. 20, 2023, 3:52 p.m. UTC | #1
On 20/09/2023 14:50, Wilco Dijkstra wrote:
> 
> The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
> Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
> Clean up the condition for when to use MOPS.
>     
> Passes regress/bootstrap, OK for commit?
>     
> gcc/ChangeLog/
>         PR target/103100
>         * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.

Shouldn't this be a separate patch?  It's not immediately obvious that this is a necessary part of this change.

>         (setmemdi): Likewise.
>         * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
>         strict-align.  Cleanup condition for using MOPS.
>         (aarch64_expand_setmem): Likewise.
> 
> ---
> 
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index dd6874d13a75f20d10a244578afc355b25c73da2..8f3bfb91c0f4ec43f37fe9289a66092a29a47e4d 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands)
>    int mode_bits;
>    rtx dst = operands[0];
>    rtx src = operands[1];
> +  unsigned align = INTVAL (operands[3]);

This should read the value with UINTVAL.  Given the useful range of the alignment, it should be OK that we're not using unsigned HWI.
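
I.e. something like this (just a sketch of the suggested change):

  unsigned align = UINTVAL (operands[3]);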

>    rtx base;
>    machine_mode cur_mode = BLKmode;
> +  bool size_p = optimize_function_for_size_p (cfun);
>  
> -  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
> -  if (!CONST_INT_P (operands[2]))
> +  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
> +  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_cpymem_mops (operands);

So what about align=4 and copying, for example, 8 or 12 bytes; wouldn't we want a sequence of LDR/STR in that case?  Doesn't this fall back to MOPS too eagerly?
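
For instance (an illustrative case, not taken from the testsuite), something
like this has align == 4 and a small constant size:

struct s12 { int a, b, c; };   /* size 12, alignment 4 */

void
copy12 (struct s12 *d, const struct s12 *s)
{
  /* Under -mstrict-align this could be done with three aligned 4-byte
     loads and stores, but since align < 16 the patch sends it to the
     MOPS expansion or a memcpy call.  */
  __builtin_memcpy (d, s, sizeof (struct s12));
}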


>  
>    unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
>  
> -  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
> -  unsigned HOST_WIDE_INT max_copy_size
> -    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
> -
> -  bool size_p = optimize_function_for_size_p (cfun);
> +  /* Try to inline up to 256 bytes.  */
> +  unsigned max_copy_size = 256;
> +  unsigned max_mops_size = aarch64_mops_memcpy_size_threshold;

I find this name slightly confusing.  Surely it's min_mops_size (since above that we want to use MOPS rather than inlined loads/stores).  But why not just use aarch64_mops_memcpy_size_threshold directly in the one place it's used?
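
I.e. something like this (a sketch of the alternative):

  if (size > max_copy_size
      || (TARGET_MOPS && size > aarch64_mops_memcpy_size_threshold))
    return aarch64_expand_cpymem_mops (operands);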

>  
> -  /* Large constant-sized cpymem should go through MOPS when possible.
> -     It should be a win even for size optimization in the general case.
> -     For speed optimization the choice between MOPS and the SIMD sequence
> -     depends on the size of the copy, rather than number of instructions,
> -     alignment etc.  */
> -  if (size > max_copy_size)
> +  /* Large copies use MOPS when available or a library call.  */
> +  if (size > max_copy_size || (TARGET_MOPS && size > max_mops_size))
>      return aarch64_expand_cpymem_mops (operands);
>  
>    int copy_bits = 256;
> @@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands)

Similar comments apply to this code as well.

>    unsigned HOST_WIDE_INT len;
>    rtx dst = operands[0];
>    rtx val = operands[2], src;
> +  unsigned align = INTVAL (operands[3]);
>    rtx base;
>    machine_mode cur_mode = BLKmode, next_mode;
>  
> -  /* If we don't have SIMD registers or the size is variable use the MOPS
> -     inlined sequence if possible.  */
> -  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
> +  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
> +  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
> +      || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_setmem_mops (operands);
>  
>    bool size_p = optimize_function_for_size_p (cfun);
> @@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands)

And here.

>    /* Default the maximum to 256-bytes when considering only libcall vs
>       SIMD broadcast sequence.  */
>    unsigned max_set_size = 256;
> +  unsigned max_mops_size = aarch64_mops_memset_size_threshold;
>  
>    len = INTVAL (operands[1]);
> -  if (len > max_set_size && !TARGET_MOPS)
> -    return false;
> +
> +  /* Large memset uses MOPS when available or a library call.  */
> +  if (len > max_set_size || (TARGET_MOPS && len > max_mops_size))
> +    return aarch64_expand_setmem_mops (operands);
>  
>    int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
>    /* The MOPS sequence takes:
> @@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands)
>       the arguments + 1 for the call.  */
>    unsigned libcall_cost = 4;
>  
> -  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
> -     when available.  */
> -  if (TARGET_MOPS
> -      && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
> -    return aarch64_expand_setmem_mops (operands);
> -
>    /* Attempt a sequence with a vector broadcast followed by stores.
>       Count the number of operations involved to see if it's worth it
>       against the alternatives.  A simple counter simd_ops on the
> @@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands)
>        simd_ops++;
>        n -= mode_bits;
>  
> -      /* Do certain trailing copies as overlapping if it's going to be
> -	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
> -	 byte copy it's more efficient to do two overlapping 8 byte copies than
> -	 8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
> +      /* Emit trailing writes using overlapping unaligned accesses
> +	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
>        if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
>  	{
>  	  next_mode = smallest_mode_for_size (n, MODE_INT);
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1627,7 +1627,7 @@ (define_expand "cpymemdi"
>     (match_operand:BLK 1 "memory_operand")
>     (match_operand:DI 2 "general_operand")
>     (match_operand:DI 3 "immediate_operand")]
> -   "!STRICT_ALIGNMENT || TARGET_MOPS"
> +   ""
>  {
>    if (aarch64_expand_cpymem (operands))
>      DONE;
> @@ -1724,7 +1724,7 @@ (define_expand "setmemdi"
>          (match_operand:QI  2 "nonmemory_operand")) ;; Value
>     (use (match_operand:DI  1 "general_operand")) ;; Length
>     (match_operand          3 "immediate_operand")] ;; Align
> - "TARGET_SIMD || TARGET_MOPS"
> + ""
>   {
>    if (aarch64_expand_setmem (operands))
>      DONE;
> 
> 

Are there any additional tests for this?
R.

Wilco Dijkstra Sept. 20, 2023, 5:08 p.m. UTC | #2
Hi Richard,

>         * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.

> Shouldn't this be a separate patch?  It's not immediately obvious that this is a necessary part of this change.

You mean this?

@@ -1627,7 +1627,7 @@ (define_expand "cpymemdi"
    (match_operand:BLK 1 "memory_operand")
    (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "!STRICT_ALIGNMENT || TARGET_MOPS"
+   ""

Yes, that's necessary since that is the bug.

> +  unsigned align = INTVAL (operands[3]);
>
>This should read the value with UINTVAL.  Given the useful range of the alignment, it should be OK that we're not using unsigned HWI.

I'll fix that.

> +  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_cpymem_mops (operands);
>
> So what about align=4 and copying, for example, 8 or 12 bytes; wouldn't we want a sequence of LDR/STR in that case?  Doesn't this fall back to MOPS too eagerly?

The goal was to fix the issue in a way that is both obvious and can be easily
backported.  Further improvements can be made to handle other alignments, but
that is slightly tricky (e.g. align == 4 won't emit LDP/STP directly using the
current code and thus would need additional work to generalize the LDP path).
  
>> +  unsigned max_mops_size = aarch64_mops_memcpy_size_threshold;
>
>I find this name slightly confusing.  Surely it's min_mops_size (since above that we want to use MOPS rather than inlined loads/stores).  But why not just use aarch64_mops_memcpy_size_threshold directly in the one place it's used?

The reason is that in a follow-on patch I check aarch64_mops_memcpy_size_threshold
too, so for now this acts as a shortcut for the ridiculously long name.

> Are there any additional tests for this?

There are existing tests that check the expansion, and they fail if you
completely block the expansion with STRICT_ALIGNMENT.
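
Something along these lines (a hypothetical example, not one of the actual
tests) would start failing if the expansion were blocked outright:

/* { dg-do compile } */
/* { dg-options "-O2 -mstrict-align" } */

void
clear64 (char *p)
{
  /* The destination is known to be 16-byte aligned, so the inline
     expansion remains valid under -mstrict-align and no libcall is
     needed.  */
  __builtin_memset (__builtin_assume_aligned (p, 16), 0, 64);
}

/* { dg-final { scan-assembler-not {\tbl\tmemset} } } */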

Cheers,
Wilco

Patch

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index dd6874d13a75f20d10a244578afc355b25c73da2..8f3bfb91c0f4ec43f37fe9289a66092a29a47e4d 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25261,27 +25261,23 @@  aarch64_expand_cpymem (rtx *operands)
   int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
+  unsigned align = INTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode;
+  bool size_p = optimize_function_for_size_p (cfun);
 
-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
+  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_cpymem_mops (operands);
 
   unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
 
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
-
-  bool size_p = optimize_function_for_size_p (cfun);
+  /* Try to inline up to 256 bytes.  */
+  unsigned max_copy_size = 256;
+  unsigned max_mops_size = aarch64_mops_memcpy_size_threshold;
 
-  /* Large constant-sized cpymem should go through MOPS when possible.
-     It should be a win even for size optimization in the general case.
-     For speed optimization the choice between MOPS and the SIMD sequence
-     depends on the size of the copy, rather than number of instructions,
-     alignment etc.  */
-  if (size > max_copy_size)
+  /* Large copies use MOPS when available or a library call.  */
+  if (size > max_copy_size || (TARGET_MOPS && size > max_mops_size))
     return aarch64_expand_cpymem_mops (operands);
 
   int copy_bits = 256;
@@ -25445,12 +25441,13 @@  aarch64_expand_setmem (rtx *operands)
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
+  unsigned align = INTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;
 
-  /* If we don't have SIMD registers or the size is variable use the MOPS
-     inlined sequence if possible.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+      || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);
 
   bool size_p = optimize_function_for_size_p (cfun);
@@ -25458,10 +25455,13 @@  aarch64_expand_setmem (rtx *operands)
   /* Default the maximum to 256-bytes when considering only libcall vs
      SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
+  unsigned max_mops_size = aarch64_mops_memset_size_threshold;
 
   len = INTVAL (operands[1]);
-  if (len > max_set_size && !TARGET_MOPS)
-    return false;
+
+  /* Large memset uses MOPS when available or a library call.  */
+  if (len > max_set_size || (TARGET_MOPS && len > max_mops_size))
+    return aarch64_expand_setmem_mops (operands);
 
   int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
   /* The MOPS sequence takes:
@@ -25474,12 +25474,6 @@  aarch64_expand_setmem (rtx *operands)
      the arguments + 1 for the call.  */
   unsigned libcall_cost = 4;
 
-  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
-     when available.  */
-  if (TARGET_MOPS
-      && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
-    return aarch64_expand_setmem_mops (operands);
-
   /* Attempt a sequence with a vector broadcast followed by stores.
      Count the number of operations involved to see if it's worth it
      against the alternatives.  A simple counter simd_ops on the
@@ -25521,10 +25515,8 @@  aarch64_expand_setmem (rtx *operands)
       simd_ops++;
       n -= mode_bits;
 
-      /* Do certain trailing copies as overlapping if it's going to be
-	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
-	 byte copy it's more efficient to do two overlapping 8 byte copies than
-	 8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
+      /* Emit trailing writes using overlapping unaligned accesses
+	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
       if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
 	{
 	  next_mode = smallest_mode_for_size (n, MODE_INT);
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1627,7 +1627,7 @@  (define_expand "cpymemdi"
    (match_operand:BLK 1 "memory_operand")
    (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "!STRICT_ALIGNMENT || TARGET_MOPS"
+   ""
 {
   if (aarch64_expand_cpymem (operands))
     DONE;
@@ -1724,7 +1724,7 @@  (define_expand "setmemdi"
         (match_operand:QI  2 "nonmemory_operand")) ;; Value
    (use (match_operand:DI  1 "general_operand")) ;; Length
    (match_operand          3 "immediate_operand")] ;; Align
- "TARGET_SIMD || TARGET_MOPS"
+ ""
  {
   if (aarch64_expand_setmem (operands))
     DONE;