Message ID | 20200814225905.40150-1-acsawdey@linux.ibm.com |
---|---|
State | New |
Series | rs6000: unaligned VSX in memcpy/memmove expansion |
On Fri, 2020-08-14 at 17:59 -0500, Aaron Sawdey via Gcc-patches wrote:

Hi,

> This patch adds a few new instructions to inline expansion of
> memcpy/memmove.  Generation of all these is controlled by

s/is/are/ ?

> the option -mblock-ops-unaligned-vsx which is set on by default if the
> target has TARGET_EFFICIENT_UNALIGNED_VSX.
> * unaligned vsx load/store (V2DImode)
> * unaligned vsx pair load/store (POImode) which is also controlled
>   by -mblock-ops-vector-pair in case it is not wanted at some point.
>   The default for this option is also for it to be on if the target has
>   TARGET_EFFICIENT_UNALIGNED_VSX.

'this option' meaning the -mblock-ops-vector-pair option?

> * unaligned vsx lxvl/stxvl but generally only to do the remainder
>   of a copy/move we started with some vsx loads/stores, and also prefer
>   to use lb/lh/lw/ld if the remainder is 1/2/4/8 bytes.
>
> Testing of this is actually accomplished by gcc.dg/memcmp-1.c which does
> two memcpy() for each memcmp().  If the memcpy() calls don't do the right
> thing then the memcmp() will fail unexpectedly.
>
> Regstrap passed on ppc64le power9 and the memcmp-1.c test passes on
> power10 simulator, ok for trunk?
>
> Thanks!
>    Aaron
>
> gcc/ChangeLog:
>
> 	* config/rs6000/rs6000-string.c (gen_lxvl_stxvl_move):
> 	Helper function.
> 	(expand_block_move): Add lxvl/stxvl, vector pair, and
> 	unaligned VSX.
> 	* config/rs6000/rs6000.c (rs6000_option_override_internal):
> 	Default value for -mblock-ops-vector-pair.
> 	* config/rs6000/rs6000.opt: Add -mblock-ops-vector-pair.
> ---
>  gcc/config/rs6000/rs6000-string.c | 105 ++++++++++++++++++++++++++----
>  gcc/config/rs6000/rs6000.c        |  14 +++-
>  gcc/config/rs6000/rs6000.opt      |   4 ++
>  3 files changed, 107 insertions(+), 16 deletions(-)
>
> diff --git a/gcc/config/rs6000/rs6000-string.c b/gcc/config/rs6000/rs6000-string.c
> index c35d93180ca..ce6db2ba14d 100644
> --- a/gcc/config/rs6000/rs6000-string.c
> +++ b/gcc/config/rs6000/rs6000-string.c
> @@ -2708,6 +2708,36 @@ gen_lvx_v4si_move (rtx dest, rtx src)
>    return gen_altivec_lvx_v4si_internal (dest, src);
>  }
>
> +static rtx
> +gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
> +{
> +  gcc_assert (MEM_P (dest) ^ MEM_P (src));
> +  gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
> +  gcc_assert (length <= 16);
> +
> +  bool is_store = MEM_P (dest);
> +
> +  /* If the address form is not a simple register, make it so.  */

Possibly just cosmetic - would
'  /* Force dest and src to be simple registers if necessary.  */'
make more sense?

> +  if (is_store)
> +    {
> +      dest = XEXP (dest, 0);
> +      if (!REG_P (dest))
> +        dest = force_reg (Pmode, dest);
> +    }
> +  else
> +    {
> +      src = XEXP (src, 0);
> +      if (!REG_P (src))
> +        src = force_reg (Pmode, src);
> +    }
> +
> +  rtx len = force_reg (DImode, gen_int_mode (length, DImode));
> +  if (is_store)
> +    return  gen_stxvl (src, dest, len);
> +  else
> +    return  gen_lxvl (dest, src, len);
> +}
> +
>  /* Expand a block move operation, and return 1 if successful.  Return 0
>     if we should let the compiler generate normal code.
>

ok

> @@ -2750,18 +2780,57 @@ expand_block_move (rtx operands[], bool might_overlap)
>    if (bytes > rs6000_block_move_inline_limit)
>      return 0;
>
> +  int orig_bytes = bytes;
>    for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
>      {
>        union {
> -        rtx (*movmemsi) (rtx, rtx, rtx, rtx);
>          rtx (*mov) (rtx, rtx);
> +        rtx (*movlen) (rtx, rtx, int);
>        } gen_func;
>        machine_mode mode = BLKmode;
>        rtx src, dest;
> -
> -      /* Altivec first, since it will be faster than a string move
> -         when it applies, and usually not significantly larger.  */
> -      if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
> +      bool move_with_length = false;
> +
> +      /* Use POImode for paired vsx load/store.  Use V2DI for single
> +         unaligned vsx load/store, for consistency with what other
> +         expansions (compare) already do, and so we can use lxvd2x on
> +         p8.  Order is VSX pair unaligned, VSX unaligned, Altivec, vsx
> +         with length < 16 (if allowed), then smaller gpr
> +         load/store.  */

s/vsx/VSX/
s/smaller// ?

> +
> +      if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX
> +          && TARGET_BLOCK_OPS_VECTOR_PAIR
> +          && bytes >= 32
> +          && (align >= 256 || !STRICT_ALIGNMENT))
> +        {
> +          move_bytes = 32;
> +          mode = POImode;
> +          gen_func.mov = gen_movpoi;
> +        }
> +      else if (TARGET_POWERPC64 && TARGET_BLOCK_OPS_UNALIGNED_VSX
> +               && VECTOR_MEM_VSX_P (V2DImode)
> +               && bytes >= 16 && (align >= 128 || !STRICT_ALIGNMENT))
> +        {
> +          move_bytes = 16;
> +          mode = V2DImode;
> +          gen_func.mov = gen_vsx_movv2di_64bit;
> +        }
> +      else if (TARGET_BLOCK_OPS_UNALIGNED_VSX
> +               && TARGET_POWER10 && bytes < 16
> +               && orig_bytes > 16
> +               && !(bytes == 1 || bytes == 2
> +                    || bytes == 4 || bytes == 8)
> +               && (align >= 128 || !STRICT_ALIGNMENT))
> +        {
> +          /* Only use lxvl/stxvl if it could replace multiple ordinary
> +             loads+stores.  Also don't use it unless we likely already
> +             did one vsx copy so we aren't mixing gpr and vsx.  */
> +          move_bytes = bytes;
> +          mode = V16QImode;
> +          gen_func.movlen = gen_lxvl_stxvl_move;
> +          move_with_length = true;
> +        }
> +      else if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
>          {
>            move_bytes = 16;
>            mode = V4SImode;
> @@ -2818,7 +2887,13 @@ expand_block_move (rtx operands[], bool might_overlap)
>            gen_func.mov = gen_movqi;
>          }
>
> -      /* Mode is always set to something other than BLKmode by one of the
> +      /* If we can't succeed in doing it in one pass, we can't do it in the
> +         might_overlap case.  Bail out and return failure.  */

Looks like this comment is an existing one, since we touch it, I'd suggest ...
s/it/the move/

> +      if (might_overlap && (num_reg+1) >= MAX_MOVE_REG
> +          && bytes > move_bytes)
> +        return 0;
> +
> +      /* Mode is always set to something other than BLKmode by one of the
>          cases of the if statement above.  */
>        gcc_assert (mode != BLKmode);
>
> @@ -2826,15 +2901,17 @@ expand_block_move (rtx operands[], bool might_overlap)
>        dest = adjust_address (orig_dest, mode, offset);
>
>        rtx tmp_reg = gen_reg_rtx (mode);
> -
> -      loads[num_reg]    = (*gen_func.mov) (tmp_reg, src);
> -      stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
>
> -      /* If we didn't succeed in doing it in one pass, we can't do it in the
> -         might_overlap case.  Bail out and return failure.  */
> -      if (might_overlap && num_reg >= MAX_MOVE_REG
> -          && bytes > move_bytes)
> -        return 0;
> +      if (move_with_length)
> +        {
> +          loads[num_reg] = (*gen_func.movlen) (tmp_reg, src, move_bytes);
> +          stores[num_reg++] = (*gen_func.movlen) (dest, tmp_reg, move_bytes);
> +        }
> +      else
> +        {
> +          loads[num_reg] = (*gen_func.mov) (tmp_reg, src);
> +          stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
> +        }
>
>        /* Emit loads and stores saved up.  */
>        if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)

ok

> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index fe93cf6ff2b..1c1caa90ede 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -4018,6 +4018,14 @@ rs6000_option_override_internal (bool global_init_p)
>          rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX;
>      }
>
> +  if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_VECTOR_PAIR))
> +    {
> +      if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX)
> +        rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
> +      else
> +        rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
> +    }
> +

Is TARGET_MMA correct in there?
otherwise ok

>    /* Use long double size to select the appropriate long double.  We use
>       TYPE_PRECISION to differentiate the 3 different long double types.  We map
>       128 into the precision used for TFmode.  */
> @@ -23222,8 +23230,10 @@ struct rs6000_opt_mask {
>  static struct rs6000_opt_mask const rs6000_opt_masks[] =
>  {
>    { "altivec",                 OPTION_MASK_ALTIVEC,            false, true  },
> -  { "block-ops-unaligned-vsx", OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX,
> -                                                               false, true  },
> +  { "block-ops-unaligned-vsx", OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX,
> +                                                               false, true  },
> +  { "block-ops-vector-pair",   OPTION_MASK_BLOCK_OPS_VECTOR_PAIR,
> +                                                               false, true  },
>    { "cmpb",                    OPTION_MASK_CMPB,               false, true  },
>    { "crypto",                  OPTION_MASK_CRYPTO,             false, true  },
>    { "direct-move",             OPTION_MASK_DIRECT_MOVE,        false, true  },

ok.

> diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
> index 9d3e740e930..4354cf1b898 100644
> --- a/gcc/config/rs6000/rs6000.opt
> +++ b/gcc/config/rs6000/rs6000.opt
> @@ -328,6 +328,10 @@ mblock-ops-unaligned-vsx
>  Target Report Mask(BLOCK_OPS_UNALIGNED_VSX) Var(rs6000_isa_flags)
>  Generate unaligned VSX load/store for inline expansion of memcpy/memmove.
>
> +mblock-ops-vector-pair
> +Target Undocumented Mask(BLOCK_OPS_VECTOR_PAIR) Var(rs6000_isa_flags)
> +Generate unaligned VSX load/store for inline expansion of memcpy/memmove.
> +

ok

Thanks,
-Will

> mblock-compare-inline-limit=
> Target Report Var(rs6000_block_compare_inline_limit) Init(63) RejectNegative Joined UInteger Save
> Max number of bytes to compare without loops.
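[To make the selection order quoted above concrete, here is a hypothetical example, not taken from the thread: how expand_block_move with this patch would carve up a fixed 45-byte copy, assuming -mcpu=power10, the new options left at their defaults, and the size below the block-move inline limit.  The function name copy45 is illustrative only.]

/* Per the conditions added to expand_block_move, the 45-byte copy below
   would be split roughly as:
     45 bytes -> one 32-byte unaligned VSX pair load/store (POImode path);
     13 bytes -> one lxvl/stxvl with an explicit length of 13 (V16QImode
                 path), since 13 is not 1/2/4/8 and the copy has already
                 done a VSX move.
   A remainder of exactly 1/2/4/8 bytes would instead use lb/lh/lw/ld,
   as the cover letter notes.  */
void
copy45 (void *dst, const void *src)
{
  __builtin_memcpy (dst, src, 45);
}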
Hi!

On Fri, Aug 14, 2020 at 05:59:05PM -0500, Aaron Sawdey via Gcc-patches wrote:
> +static rtx
> +gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
> +{
> +  gcc_assert (MEM_P (dest) ^ MEM_P (src));

Maybe just "!="?

> +  gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
> +  gcc_assert (length <= 16);
> +
> +  bool is_store = MEM_P (dest);
> +
> +  /* If the address form is not a simple register, make it so.  */
> +  if (is_store)
> +    {
> +      dest = XEXP (dest, 0);
> +      if (!REG_P (dest))
> +        dest = force_reg (Pmode, dest);

So this changes what "dest" means.  Maybe it is clearer if you have a
separate variable "addr"?  That you can use for dest and src as well,
whichever is memory.

> +  if (is_store)
> +    return  gen_stxvl (src, dest, len);
> +  else
> +    return  gen_lxvl (dest, src, len);

(doubled space -- well I guess you wanted to align the code)

> +      /* If we can't succeed in doing it in one pass, we can't do it in the
> +         might_overlap case.  Bail out and return failure.  */
> +      if (might_overlap && (num_reg+1) >= MAX_MOVE_REG
> +          && bytes > move_bytes)
> +        return 0;

The "num_reg+1" isn't obvious, and the comment doesn't say (we usually
write it as "num_reg + 1" fwiw, and the parens are superfluous).

Looks good, thanks!  Okay for trunk with or without such changes.


Segher
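[For illustration only, a sketch of what Segher's two suggestions above (using "!=" and a separate "addr" variable for whichever operand is the MEM) might look like when applied to the helper from the patch.  This is not the version that was posted; the surrounding GCC interfaces are those already used in the patch.]

static rtx
gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
{
  gcc_assert (MEM_P (dest) != MEM_P (src));
  gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
  gcc_assert (length <= 16);

  bool is_store = MEM_P (dest);

  /* Take the address of whichever operand is the MEM, and force it into
     a simple register if it is not one already; dest and src keep their
     original meaning.  */
  rtx addr = XEXP (is_store ? dest : src, 0);
  if (!REG_P (addr))
    addr = force_reg (Pmode, addr);

  rtx len = force_reg (DImode, gen_int_mode (length, DImode));
  if (is_store)
    return gen_stxvl (src, addr, len);
  else
    return gen_lxvl (dest, addr, len);
}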
diff --git a/gcc/config/rs6000/rs6000-string.c b/gcc/config/rs6000/rs6000-string.c
index c35d93180ca..ce6db2ba14d 100644
--- a/gcc/config/rs6000/rs6000-string.c
+++ b/gcc/config/rs6000/rs6000-string.c
@@ -2708,6 +2708,36 @@ gen_lvx_v4si_move (rtx dest, rtx src)
   return gen_altivec_lvx_v4si_internal (dest, src);
 }
 
+static rtx
+gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
+{
+  gcc_assert (MEM_P (dest) ^ MEM_P (src));
+  gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
+  gcc_assert (length <= 16);
+
+  bool is_store = MEM_P (dest);
+
+  /* If the address form is not a simple register, make it so.  */
+  if (is_store)
+    {
+      dest = XEXP (dest, 0);
+      if (!REG_P (dest))
+        dest = force_reg (Pmode, dest);
+    }
+  else
+    {
+      src = XEXP (src, 0);
+      if (!REG_P (src))
+        src = force_reg (Pmode, src);
+    }
+
+  rtx len = force_reg (DImode, gen_int_mode (length, DImode));
+  if (is_store)
+    return  gen_stxvl (src, dest, len);
+  else
+    return  gen_lxvl (dest, src, len);
+}
+
 /* Expand a block move operation, and return 1 if successful.  Return 0
    if we should let the compiler generate normal code.
 
@@ -2750,18 +2780,57 @@ expand_block_move (rtx operands[], bool might_overlap)
   if (bytes > rs6000_block_move_inline_limit)
     return 0;
 
+  int orig_bytes = bytes;
   for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
     {
       union {
-        rtx (*movmemsi) (rtx, rtx, rtx, rtx);
         rtx (*mov) (rtx, rtx);
+        rtx (*movlen) (rtx, rtx, int);
       } gen_func;
       machine_mode mode = BLKmode;
       rtx src, dest;
-
-      /* Altivec first, since it will be faster than a string move
-         when it applies, and usually not significantly larger.  */
-      if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
+      bool move_with_length = false;
+
+      /* Use POImode for paired vsx load/store.  Use V2DI for single
+         unaligned vsx load/store, for consistency with what other
+         expansions (compare) already do, and so we can use lxvd2x on
+         p8.  Order is VSX pair unaligned, VSX unaligned, Altivec, vsx
+         with length < 16 (if allowed), then smaller gpr
+         load/store.  */
+
+      if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX
+          && TARGET_BLOCK_OPS_VECTOR_PAIR
+          && bytes >= 32
+          && (align >= 256 || !STRICT_ALIGNMENT))
+        {
+          move_bytes = 32;
+          mode = POImode;
+          gen_func.mov = gen_movpoi;
+        }
+      else if (TARGET_POWERPC64 && TARGET_BLOCK_OPS_UNALIGNED_VSX
+               && VECTOR_MEM_VSX_P (V2DImode)
+               && bytes >= 16 && (align >= 128 || !STRICT_ALIGNMENT))
+        {
+          move_bytes = 16;
+          mode = V2DImode;
+          gen_func.mov = gen_vsx_movv2di_64bit;
+        }
+      else if (TARGET_BLOCK_OPS_UNALIGNED_VSX
+               && TARGET_POWER10 && bytes < 16
+               && orig_bytes > 16
+               && !(bytes == 1 || bytes == 2
+                    || bytes == 4 || bytes == 8)
+               && (align >= 128 || !STRICT_ALIGNMENT))
+        {
+          /* Only use lxvl/stxvl if it could replace multiple ordinary
+             loads+stores.  Also don't use it unless we likely already
+             did one vsx copy so we aren't mixing gpr and vsx.  */
+          move_bytes = bytes;
+          mode = V16QImode;
+          gen_func.movlen = gen_lxvl_stxvl_move;
+          move_with_length = true;
+        }
+      else if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
        {
          move_bytes = 16;
          mode = V4SImode;
@@ -2818,7 +2887,13 @@ expand_block_move (rtx operands[], bool might_overlap)
          gen_func.mov = gen_movqi;
        }
 
-      /* Mode is always set to something other than BLKmode by one of the
+      /* If we can't succeed in doing it in one pass, we can't do it in the
+         might_overlap case.  Bail out and return failure.  */
+      if (might_overlap && (num_reg+1) >= MAX_MOVE_REG
+          && bytes > move_bytes)
+        return 0;
+
+      /* Mode is always set to something other than BLKmode by one of the
         cases of the if statement above.  */
       gcc_assert (mode != BLKmode);
 
@@ -2826,15 +2901,17 @@ expand_block_move (rtx operands[], bool might_overlap)
       dest = adjust_address (orig_dest, mode, offset);
 
       rtx tmp_reg = gen_reg_rtx (mode);
-
-      loads[num_reg]    = (*gen_func.mov) (tmp_reg, src);
-      stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
 
-      /* If we didn't succeed in doing it in one pass, we can't do it in the
-         might_overlap case.  Bail out and return failure.  */
-      if (might_overlap && num_reg >= MAX_MOVE_REG
-          && bytes > move_bytes)
-        return 0;
+      if (move_with_length)
+        {
+          loads[num_reg] = (*gen_func.movlen) (tmp_reg, src, move_bytes);
+          stores[num_reg++] = (*gen_func.movlen) (dest, tmp_reg, move_bytes);
+        }
+      else
+        {
+          loads[num_reg] = (*gen_func.mov) (tmp_reg, src);
+          stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
+        }
 
       /* Emit loads and stores saved up.  */
       if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index fe93cf6ff2b..1c1caa90ede 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -4018,6 +4018,14 @@ rs6000_option_override_internal (bool global_init_p)
        rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX;
     }
 
+  if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_VECTOR_PAIR))
+    {
+      if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX)
+        rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
+      else
+        rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
+    }
+
   /* Use long double size to select the appropriate long double.  We use
      TYPE_PRECISION to differentiate the 3 different long double types.  We map
      128 into the precision used for TFmode.  */
@@ -23222,8 +23230,10 @@ struct rs6000_opt_mask {
 static struct rs6000_opt_mask const rs6000_opt_masks[] =
 {
   { "altivec",                 OPTION_MASK_ALTIVEC,            false, true  },
-  { "block-ops-unaligned-vsx", OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX,
-                                                               false, true  },
+  { "block-ops-unaligned-vsx", OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX,
+                                                               false, true  },
+  { "block-ops-vector-pair",   OPTION_MASK_BLOCK_OPS_VECTOR_PAIR,
+                                                               false, true  },
   { "cmpb",                    OPTION_MASK_CMPB,               false, true  },
   { "crypto",                  OPTION_MASK_CRYPTO,             false, true  },
   { "direct-move",             OPTION_MASK_DIRECT_MOVE,        false, true  },
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 9d3e740e930..4354cf1b898 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -328,6 +328,10 @@ mblock-ops-unaligned-vsx
 Target Report Mask(BLOCK_OPS_UNALIGNED_VSX) Var(rs6000_isa_flags)
 Generate unaligned VSX load/store for inline expansion of memcpy/memmove.
 
+mblock-ops-vector-pair
+Target Undocumented Mask(BLOCK_OPS_VECTOR_PAIR) Var(rs6000_isa_flags)
+Generate unaligned VSX load/store for inline expansion of memcpy/memmove.
+
 mblock-compare-inline-limit=
 Target Report Var(rs6000_block_compare_inline_limit) Init(63) RejectNegative Joined UInteger Save
 Max number of bytes to compare without loops.
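[As a rough sketch of the testing approach the cover letter describes, and not the actual gcc.dg/memcmp-1.c: each memcmp check is preceded by constant-size memcpy calls, so a wrong inline memcpy expansion surfaces as an unexpected memcmp result.  The function check_33 below is hypothetical.]

#include <string.h>
#include <stdlib.h>

/* Copy both inputs with a constant size (which expand_block_move can
   inline) and then verify that memcmp on the copies agrees with the
   expected ordering.  A miscompiled copy changes the comparison result.  */
void
check_33 (const char *a, const char *b, int expect)
{
  char bufa[64], bufb[64];
  memcpy (bufa, a, 33);
  memcpy (bufb, b, 33);
  int r = memcmp (bufa, bufb, 33);
  if ((r < 0) != (expect < 0) || (r > 0) != (expect > 0))
    abort ();
}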