Message ID | 1402044609-30902-1-git-send-email-james.greenhalgh@arm.com |
---|---|
State | New |
Headers | show |
On 06/06/14 09:50, James Greenhalgh wrote: > > Hi, > > The move_by_pieces infrastructure performs a copy by repeatedly trying > the largest safe copy it can make. So for a 15-byte copy we might see: > > offset amount bytes copied > 0 8 0-7 > 8 4 8-11 > 12 2 12-13 > 14 1 14 > > However, we can implement a 15-byte copy as so: > > offset amount bytes copied > 0 8 0-7 > 7 8 7-14 > > Which can prove more efficient for both space and speed. > > In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and > implement the movmem pattern name to expand small block copy cases. Note, this > optimization does not apply for -mstrict-align targets, which must continue > copying byte-by-byte. > > Setting MOVE_RATIO low in this way causes a few tests to begin failing, > both of these are documented in the test-case as expected to fail for > low MOVE_RATIO targets, which do not allow certain Tree-Level optimizations. > > Bootstrapped on aarch64-unknown-linux-gnu with no issues. > > OK for trunk? > This is OK. Hmm, I notice sra-12 fails on ARM as well. Is that for the same reason? If so, perhaps you could prepare a patch for that as well (consider it pre-approved). R. > Thanks, > James > > --- > gcc/ > > 2014-06-06 James Greenhalgh <james.greenhalgh@arm.com> > > * config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New. > * config/aarch64/aarch64.c (aarch64_move_pointer): New. > (aarch64_progress_pointer): Likewise. > (aarch64_copy_one_block_and_progress_pointers): Likewise. > (aarch64_expand_movmem): Likewise. > * config/aarch64/aarch64.h (MOVE_RATIO): Set low. > * config/aarch64/aarch64.md (movmem<mode>): New. > > gcc/testsuite/ > > 2014-06-06 James Greenhalgh <james.greenhalgh@arm.com> > > * gcc.dg/tree-ssa/pr42585.c: Skip for AArch64. > * gcc.dg/tree-ssa/sra-12.c: Likewise. 
> > > 0001-AArch64-Implement-movmem-for-the-benefit-of-inline-m.patch > > > diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h > index 68d488d..c4f75b3 100644 > --- a/gcc/config/aarch64/aarch64-protos.h > +++ b/gcc/config/aarch64/aarch64-protos.h > @@ -180,6 +180,7 @@ bool aarch64_cannot_change_mode_class (enum machine_mode, > enum aarch64_symbol_type > aarch64_classify_symbolic_expression (rtx, enum aarch64_symbol_context); > bool aarch64_constant_address_p (rtx); > +bool aarch64_expand_movmem (rtx *); > bool aarch64_float_const_zero_rtx_p (rtx); > bool aarch64_function_arg_regno_p (unsigned); > bool aarch64_gen_movmemqi (rtx *); > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index b26e5f5..0ae21cd 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -9434,6 +9434,164 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2) > return false; > } > > +/* Return a new RTX holding the result of moving POINTER forward by > + AMOUNT bytes. */ > + > +static rtx > +aarch64_move_pointer (rtx pointer, int amount) > +{ > + rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount); > + > + return adjust_automodify_address (pointer, GET_MODE (pointer), > + next, amount); > +} > + > +/* Return a new RTX holding the result of moving POINTER forward by the > + size of the mode it points to. */ > + > +static rtx > +aarch64_progress_pointer (rtx pointer) > +{ > + HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer)); > + > + return aarch64_move_pointer (pointer, amount); > +} > + > +/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by > + MODE bytes. */ > + > +static void > +aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst, > + enum machine_mode mode) > +{ > + rtx reg = gen_reg_rtx (mode); > + > + /* "Cast" the pointers to the correct mode. 
*/ > + *src = adjust_address (*src, mode, 0); > + *dst = adjust_address (*dst, mode, 0); > + /* Emit the memcpy. */ > + emit_move_insn (reg, *src); > + emit_move_insn (*dst, reg); > + /* Move the pointers forward. */ > + *src = aarch64_progress_pointer (*src); > + *dst = aarch64_progress_pointer (*dst); > +} > + > +/* Expand movmem, as if from a __builtin_memcpy. Return true if > + we succeed, otherwise return false. */ > + > +bool > +aarch64_expand_movmem (rtx *operands) > +{ > + unsigned int n; > + rtx dst = operands[0]; > + rtx src = operands[1]; > + rtx base; > + bool speed_p = !optimize_function_for_size_p (cfun); > + > + /* When optimizing for size, give a better estimate of the length of a > + memcpy call, but use the default otherwise. */ > + unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2; > + > + /* We can't do anything smart if the amount to copy is not constant. */ > + if (!CONST_INT_P (operands[2])) > + return false; > + > + n = UINTVAL (operands[2]); > + > + /* Try to keep the number of instructions low. For cases below 16 bytes we > + need to make at most two moves. For cases above 16 bytes it will be one > + move for each 16 byte chunk, then at most two additional moves. */ > + if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions) > + return false; > + > + base = copy_to_mode_reg (Pmode, XEXP (dst, 0)); > + dst = adjust_automodify_address (dst, VOIDmode, base, 0); > + > + base = copy_to_mode_reg (Pmode, XEXP (src, 0)); > + src = adjust_automodify_address (src, VOIDmode, base, 0); > + > + /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a > + 1-byte chunk. */ > + if (n < 4) > + { > + if (n >= 2) > + { > + aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode); > + n -= 2; > + } > + > + if (n == 1) > + aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode); > + > + return true; > + } > + > + /* Copy 4-8 bytes. 
First a 4-byte chunk, then (if applicable) a second > + 4-byte chunk, partially overlapping with the previously copied chunk. */ > + if (n < 8) > + { > + aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode); > + n -= 4; > + if (n > 0) > + { > + int move = n - 4; > + > + src = aarch64_move_pointer (src, move); > + dst = aarch64_move_pointer (dst, move); > + aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode); > + } > + return true; > + } > + > + /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of > + them, then (if applicable) an 8-byte chunk. */ > + while (n >= 8) > + { > + if (n / 16) > + { > + aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode); > + n -= 16; > + } > + else > + { > + aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode); > + n -= 8; > + } > + } > + > + /* Finish the final bytes of the copy. We can always do this in one > + instruction. We either copy the exact amount we need, or partially > + overlap with the previous chunk we copied and copy 8-bytes. 
*/ > + if (n == 0) > + return true; > + else if (n == 1) > + aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode); > + else if (n == 2) > + aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode); > + else if (n == 4) > + aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode); > + else > + { > + if (n == 3) > + { > + src = aarch64_move_pointer (src, -1); > + dst = aarch64_move_pointer (dst, -1); > + aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode); > + } > + else > + { > + int move = n - 8; > + > + src = aarch64_move_pointer (src, move); > + dst = aarch64_move_pointer (dst, move); > + aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode); > + } > + } > + > + return true; > +} > + > #undef TARGET_ADDRESS_COST > #define TARGET_ADDRESS_COST aarch64_address_cost > > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > index ced5a5e..c5d144e 100644 > --- a/gcc/config/aarch64/aarch64.h > +++ b/gcc/config/aarch64/aarch64.h > @@ -659,12 +659,14 @@ do { \ > /* The base cost overhead of a memcpy call, for MOVE_RATIO and friends. */ > #define AARCH64_CALL_RATIO 8 > > -/* When optimizing for size, give a better estimate of the length of a memcpy > - call, but use the default otherwise. But move_by_pieces_ninsns() counts > - memory-to-memory moves, and we'll have to generate a load & store for each, > - so halve the value to take that into account. */ > +/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure. > + move_by_pieces will continually copy the largest safe chunks. So a > + 7-byte copy is a 4-byte + 2-byte + byte copy. This proves inefficient > + for both size and speed of copy, so we will instead use the "movmem" > + standard name to implement the copy. This logic does not apply when > + targeting -mstrict-align, so keep a sensible default in that case. */ > #define MOVE_RATIO(speed) \ > - (((speed) ? 15 : AARCH64_CALL_RATIO) / 2) > + (!STRICT_ALIGNMENT ? 
2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)) > > /* For CLEAR_RATIO, when optimizing for size, give a better estimate > of the length of a memset call, but use the default otherwise. */ > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > index 6e605c19f0acbe88d53f460cb513d24dde6d658f..661d784b93e60fd2f636f5b5f03c10c6d53493dd 100644 > --- a/gcc/config/aarch64/aarch64.md > +++ b/gcc/config/aarch64/aarch64.md > @@ -883,6 +883,24 @@ (define_split > } > ) > > +;; 0 is dst > +;; 1 is src > +;; 2 is size of move in bytes > +;; 3 is alignment > + > +(define_expand "movmemdi" > + [(match_operand:BLK 0 "memory_operand") > + (match_operand:BLK 1 "memory_operand") > + (match_operand:DI 2 "immediate_operand") > + (match_operand:DI 3 "immediate_operand")] > + "!STRICT_ALIGNMENT" > +{ > + if (aarch64_expand_movmem (operands)) > + DONE; > + FAIL; > +} > +) > + > ;; Operands 1 and 3 are tied together by the final condition; so we allow > ;; fairly lax checking on the second memory operation. > (define_insn "load_pair<mode>" > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c > index a970c85..07f575d 100644 > --- a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c > @@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr) > /* Whether the structs are totally scalarized or not depends on the > MOVE_RATIO macro definition in the back end. The scalarization will > not take place when using small values for MOVE_RATIO. */ > -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */ > -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */ > +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! 
"aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */ > +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */ > /* { dg-final { cleanup-tree-dump "optimized" } } */ > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c > index 59e5e6a..45aa963 100644 > --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c > +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c > @@ -21,5 +21,5 @@ int foo (struct S *p) > *p = l; > } > > -/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */ > +/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */ > /* { dg-final { cleanup-tree-dump "release_ssa" } } */ >
On Fri, Jun 6, 2014 at 1:50 AM, James Greenhalgh <james.greenhalgh@arm.com> wrote: > > Hi, > > The move_by_pieces infrastructure performs a copy by repeatedly trying > the largest safe copy it can make. So for a 15-byte copy we might see: > > offset amount bytes copied > 0 8 0-7 > 8 4 8-11 > 12 2 12-13 > 14 1 14 > > However, we can implement a 15-byte copy as so: > > offset amount bytes copied > 0 8 0-7 > 7 8 7-14 > > Which can prove more efficient for both space and speed. > > In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and > implement the movmem pattern name to expand small block copy cases. Note, this > optimization does not apply for -mstrict-align targets, which must continue > copying byte-by-byte. > > Setting MOVE_RATIO low in this way causes a few tests to begin failing, > both of these are documented in the test-case as expected to fail for > low MOVE_RATIO targets, which do not allow certain Tree-Level optimizations. I think you should reevaluate setting MOVE_RATIO this low. It is used for SRA and IPA-SRA, which are both very useful; more useful than what the memmove optimizations can do. In fact this optimization is not even valid for volatile variables. Here is a testcase for the volatile issue: struct __attribute__((packed)) t15{ long long t8; int t4; short t2; unsigned char t1; }; volatile struct t15 t15; int f(struct t15 *a) { t15 = *a; } Notice how we are writing to byte 7 of t15 twice in the outputted code. Thanks, Andrew Pinski > > Bootstrapped on aarch64-unknown-linux-gnu with no issues. > > OK for trunk? > > Thanks, > James > > --- > gcc/ > > 2014-06-06 James Greenhalgh <james.greenhalgh@arm.com> > > * config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New. > * config/aarch64/aarch64.c (aarch64_move_pointer): New. > (aarch64_progress_pointer): Likewise. > (aarch64_copy_one_block_and_progress_pointers): Likewise. > (aarch64_expand_movmem): Likewise. > * config/aarch64/aarch64.h (MOVE_RATIO): Set low. 
> * config/aarch64/aarch64.md (movmem<mode>): New. > > gcc/testsuite/ > > 2014-06-06 James Greenhalgh <james.greenhalgh@arm.com> > > * gcc.dg/tree-ssa/pr42585.c: Skip for AArch64. > * gcc.dg/tree-ssa/sra-12.c: Likewise.
On Fri, Aug 1, 2014 at 8:38 AM, Andrew Pinski <pinskia@gmail.com> wrote: > On Fri, Jun 6, 2014 at 1:50 AM, James Greenhalgh > <james.greenhalgh@arm.com> wrote: >> >> Hi, >> >> The move_by_pieces infrastructure performs a copy by repeatedly trying >> the largest safe copy it can make. So for a 15-byte copy we might see: >> >> offset amount bytes copied >> 0 8 0-7 >> 8 4 8-11 >> 12 2 12-13 >> 14 1 14 >> >> However, we can implement a 15-byte copy as so: >> >> offset amount bytes copied >> 0 8 0-7 >> 7 8 7-14 >> >> Which can prove more efficient for both space and speed. >> >> In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and >> implement the movmem pattern name to expand small block copy cases. Note, this >> optimization does not apply for -mstrict-align targets, which must continue >> copying byte-by-byte. >> >> Setting MOVE_RATIO low in this way causes a few tests to begin failing, >> both of these are documented in the test-case as expected to fail for >> low MOVE_RATIO targets, which do not allow certain Tree-Level optimizations. > > > I think you should reevaluate setting MOVE_RATIO this low. It is used > for SRA and IPA-SRA which both are very useful; more useful than > memmove optimizations can do. Maybe we should finally decouple SRA and IPA-SRA from MOVE_RATIO and have a --param to control the heuristic a target can adjust separately. (we can still default to MOVE_RATIO here). Ok, maybe we need two params, one for size and one for speed optimization (though the size effects are hard to estimate). Richard. > In fact this optimization is not even valid for volatile variables. > Here is a testcase for the volatile issue: > struct __attribute__((packed)) t15{ > long long t8; > int t4; > short t2; > unsigned char t1; > }; > volatile struct t15 t15; > int f(struct t15 *a) > { > t15 = *a; > } > > Notice how we are writing to byte 7 twice to t15 in the outputted code. 
> > Thanks, > Andrew Pinski > >> >> Bootstrapped on aarch64-unknown-linux-gnu with no issues. >> >> OK for trunk? >> >> Thanks, >> James >> >> --- >> gcc/ >> >> 2014-06-06 James Greenhalgh <james.greenhalgh@arm.com> >> >> * config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New. >> * config/aarch64/aarch64.c (aarch64_move_pointer): New. >> (aarch64_progress_pointer): Likewise. >> (aarch64_copy_one_block_and_progress_pointers): Likewise. >> (aarch64_expand_movmem): Likewise. >> * config/aarch64/aarch64.h (MOVE_RATIO): Set low. >> * config/aarch64/aarch64.md (movmem<mode>): New. >> >> gcc/testsuite/ >> >> 2014-06-06 James Greenhalgh <james.greenhalgh@arm.com> >> >> * gcc.dg/tree-ssa/pr42585.c: Skip for AArch64. >> * gcc.dg/tree-ssa/sra-12.c: Likewise.
> On Jun 6, 2014, at 1:50 AM, James Greenhalgh <james.greenhalgh@arm.com> wrote: > > > Hi, > > The move_by_pieces infrastructure performs a copy by repeatedly trying > the largest safe copy it can make. So for a 15-byte copy we might see: > > offset amount bytes copied > 0 8 0-7 > 8 4 8-11 > 12 2 12-13 > 14 1 14 > > However, we can implement a 15-byte copy as so: > > offset amount bytes copied > 0 8 0-7 > 7 8 7-14 > > Which can prove more efficient for both space and speed. > > In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and > implement the movmem pattern name to expand small block copy cases. Note, this > optimization does not apply for -mstrict-align targets, which must continue > copying byte-by-byte. Why not change move_by_pieces instead of having target specific code? This seems like a better option. You can check the slow-unaligned-access target macro to see if you want to do this optimization too. As I mentioned in the other email, make sure you check the volatileness of the from and to before doing this optimization. Thanks, Andrew > > Setting MOVE_RATIO low in this way causes a few tests to begin failing, > both of these are documented in the test-case as expected to fail for > low MOVE_RATIO targets, which do not allow certain Tree-Level optimizations. > > Bootstrapped on aarch64-unknown-linux-gnu with no issues. > > OK for trunk? > > Thanks, > James > > --- > gcc/ > > 2014-06-06 James Greenhalgh <james.greenhalgh@arm.com> > > * config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New. > * config/aarch64/aarch64.c (aarch64_move_pointer): New. > (aarch64_progress_pointer): Likewise. > (aarch64_copy_one_block_and_progress_pointers): Likewise. > (aarch64_expand_movmem): Likewise. > * config/aarch64/aarch64.h (MOVE_RATIO): Set low. > * config/aarch64/aarch64.md (movmem<mode>): New. > > gcc/testsuite/ > > 2014-06-06 James Greenhalgh <james.greenhalgh@arm.com> > > * gcc.dg/tree-ssa/pr42585.c: Skip for AArch64. 
> * gcc.dg/tree-ssa/sra-12.c: Likewise. > <0001-AArch64-Implement-movmem-for-the-benefit-of-inline-m.patch>
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 68d488d..c4f75b3 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -180,6 +180,7 @@ bool aarch64_cannot_change_mode_class (enum machine_mode, enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx, enum aarch64_symbol_context); bool aarch64_constant_address_p (rtx); +bool aarch64_expand_movmem (rtx *); bool aarch64_float_const_zero_rtx_p (rtx); bool aarch64_function_arg_regno_p (unsigned); bool aarch64_gen_movmemqi (rtx *); diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index b26e5f5..0ae21cd 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -9434,6 +9434,164 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2) return false; } +/* Return a new RTX holding the result of moving POINTER forward by + AMOUNT bytes. */ + +static rtx +aarch64_move_pointer (rtx pointer, int amount) +{ + rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount); + + return adjust_automodify_address (pointer, GET_MODE (pointer), + next, amount); +} + +/* Return a new RTX holding the result of moving POINTER forward by the + size of the mode it points to. */ + +static rtx +aarch64_progress_pointer (rtx pointer) +{ + HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer)); + + return aarch64_move_pointer (pointer, amount); +} + +/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by + MODE bytes. */ + +static void +aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst, + enum machine_mode mode) +{ + rtx reg = gen_reg_rtx (mode); + + /* "Cast" the pointers to the correct mode. */ + *src = adjust_address (*src, mode, 0); + *dst = adjust_address (*dst, mode, 0); + /* Emit the memcpy. */ + emit_move_insn (reg, *src); + emit_move_insn (*dst, reg); + /* Move the pointers forward. 
*/ + *src = aarch64_progress_pointer (*src); + *dst = aarch64_progress_pointer (*dst); +} + +/* Expand movmem, as if from a __builtin_memcpy. Return true if + we succeed, otherwise return false. */ + +bool +aarch64_expand_movmem (rtx *operands) +{ + unsigned int n; + rtx dst = operands[0]; + rtx src = operands[1]; + rtx base; + bool speed_p = !optimize_function_for_size_p (cfun); + + /* When optimizing for size, give a better estimate of the length of a + memcpy call, but use the default otherwise. */ + unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2; + + /* We can't do anything smart if the amount to copy is not constant. */ + if (!CONST_INT_P (operands[2])) + return false; + + n = UINTVAL (operands[2]); + + /* Try to keep the number of instructions low. For cases below 16 bytes we + need to make at most two moves. For cases above 16 bytes it will be one + move for each 16 byte chunk, then at most two additional moves. */ + if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions) + return false; + + base = copy_to_mode_reg (Pmode, XEXP (dst, 0)); + dst = adjust_automodify_address (dst, VOIDmode, base, 0); + + base = copy_to_mode_reg (Pmode, XEXP (src, 0)); + src = adjust_automodify_address (src, VOIDmode, base, 0); + + /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a + 1-byte chunk. */ + if (n < 4) + { + if (n >= 2) + { + aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode); + n -= 2; + } + + if (n == 1) + aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode); + + return true; + } + + /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second + 4-byte chunk, partially overlapping with the previously copied chunk. 
*/ + if (n < 8) + { + aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode); + n -= 4; + if (n > 0) + { + int move = n - 4; + + src = aarch64_move_pointer (src, move); + dst = aarch64_move_pointer (dst, move); + aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode); + } + return true; + } + + /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of + them, then (if applicable) an 8-byte chunk. */ + while (n >= 8) + { + if (n / 16) + { + aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode); + n -= 16; + } + else + { + aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode); + n -= 8; + } + } + + /* Finish the final bytes of the copy. We can always do this in one + instruction. We either copy the exact amount we need, or partially + overlap with the previous chunk we copied and copy 8-bytes. */ + if (n == 0) + return true; + else if (n == 1) + aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode); + else if (n == 2) + aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode); + else if (n == 4) + aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode); + else + { + if (n == 3) + { + src = aarch64_move_pointer (src, -1); + dst = aarch64_move_pointer (dst, -1); + aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode); + } + else + { + int move = n - 8; + + src = aarch64_move_pointer (src, move); + dst = aarch64_move_pointer (dst, move); + aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode); + } + } + + return true; +} + #undef TARGET_ADDRESS_COST #define TARGET_ADDRESS_COST aarch64_address_cost diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index ced5a5e..c5d144e 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -659,12 +659,14 @@ do { \ /* The base cost overhead of a memcpy call, for MOVE_RATIO and friends. 
*/ #define AARCH64_CALL_RATIO 8 -/* When optimizing for size, give a better estimate of the length of a memcpy - call, but use the default otherwise. But move_by_pieces_ninsns() counts - memory-to-memory moves, and we'll have to generate a load & store for each, - so halve the value to take that into account. */ +/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure. + move_by_pieces will continually copy the largest safe chunks. So a + 7-byte copy is a 4-byte + 2-byte + byte copy. This proves inefficient + for both size and speed of copy, so we will instead use the "movmem" + standard name to implement the copy. This logic does not apply when + targeting -mstrict-align, so keep a sensible default in that case. */ #define MOVE_RATIO(speed) \ - (((speed) ? 15 : AARCH64_CALL_RATIO) / 2) + (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)) /* For CLEAR_RATIO, when optimizing for size, give a better estimate of the length of a memset call, but use the default otherwise. */ diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 6e605c19f0acbe88d53f460cb513d24dde6d658f..661d784b93e60fd2f636f5b5f03c10c6d53493dd 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -883,6 +883,24 @@ (define_split } ) +;; 0 is dst +;; 1 is src +;; 2 is size of move in bytes +;; 3 is alignment + +(define_expand "movmemdi" + [(match_operand:BLK 0 "memory_operand") + (match_operand:BLK 1 "memory_operand") + (match_operand:DI 2 "immediate_operand") + (match_operand:DI 3 "immediate_operand")] + "!STRICT_ALIGNMENT" +{ + if (aarch64_expand_movmem (operands)) + DONE; + FAIL; +} +) + ;; Operands 1 and 3 are tied together by the final condition; so we allow ;; fairly lax checking on the second memory operation. 
(define_insn "load_pair<mode>" diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c index a970c85..07f575d 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c @@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr) /* Whether the structs are totally scalarized or not depends on the MOVE_RATIO macro definition in the back end. The scalarization will not take place when using small values for MOVE_RATIO. */ -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */ -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */ +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */ +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */ /* { dg-final { cleanup-tree-dump "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c index 59e5e6a..45aa963 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c @@ -21,5 +21,5 @@ int foo (struct S *p) *p = l; } -/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */ +/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */ /* { dg-final { cleanup-tree-dump "release_ssa" } } */