
rs6000: unaligned VSX in memcpy/memmove expansion

Message ID 20200814225905.40150-1-acsawdey@linux.ibm.com
State New

Commit Message

Aaron Sawdey Aug. 14, 2020, 10:59 p.m. UTC
This patch adds a few new instructions to inline expansion of
memcpy/memmove. Generation of all these is controlled by
the option -mblock-ops-unaligned-vsx which is set on by default if the
target has TARGET_EFFICIENT_UNALIGNED_VSX.
 * unaligned vsx load/store (V2DImode)
 * unaligned vsx pair load/store (POImode) which is also controlled
   by -mblock-ops-vector-pair in case it is not wanted at some point.
   The default for this option is also for it to be on if the target has
   TARGET_EFFICIENT_UNALIGNED_VSX.
 * unaligned vsx lxvl/stxvl but generally only to do the remainder
   of a copy/move we started with some vsx loads/stores, and also prefer
   to use lb/lh/lw/ld if the remainder is 1/2/4/8 bytes.
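
As an illustration of the selection order above: a hypothetical caller
(not taken from the patch or testsuite), with comments describing how
the new expansion would plausibly split the copy on a power10-class
target with unaligned VSX and vector pairs enabled, assuming the size
is below -mblock-move-inline-limit.

/* Hypothetical example: a fixed-size 37-byte copy.  */
#include <string.h>

struct blob { char bytes[37]; };

void
copy_blob (struct blob *dst, const struct blob *src)
{
  /* Bytes 0..31: one unaligned VSX vector pair load/store (POImode).
     Remaining 5 bytes: lxvl/stxvl with length 5, since the remainder
     is not 1/2/4/8 bytes and a VSX copy was already emitted.  */
  memcpy (dst->bytes, src->bytes, sizeof src->bytes);
}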

Testing of this is actually accomplished by gcc.dg/memcmp-1.c which does
two memcpy() for each memcmp(). If the memcpy() calls don't do the right
thing then the memcmp() will fail unexpectedly.
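
A minimal sketch of that pattern (hypothetical, not the actual
gcc.dg/memcmp-1.c source, which covers many lengths and alignments):
copy both operands with memcpy() first, so a miscompiled inline copy
shows up as an unexpected memcmp() result.

#include <string.h>
#include <stdlib.h>

#define N 37

static char a[N], b[N], ca[N], cb[N];

int
main (void)
{
  for (int i = 0; i < N; i++)
    {
      a[i] = (char) i;
      b[i] = (char) i;
    }
  b[N - 1] ^= 1;  /* Make the second buffer differ in the last byte.  */

  /* Two memcpy() calls per memcmp(), as described above; both have a
     constant length so they go through the inline expansion.  */
  memcpy (ca, a, N);
  memcpy (cb, b, N);

  if (memcmp (ca, a, N) != 0)   /* Copies must match their sources...  */
    abort ();
  if (memcmp (ca, cb, N) == 0)  /* ...and the known difference must survive.  */
    abort ();
  return 0;
}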

Regstrap passed on ppc64le power9 and the memcmp-1.c test passes on
power10 simulator, ok for trunk?

Thanks!
    Aaron

gcc/ChangeLog:

	* config/rs6000/rs6000-string.c (gen_lxvl_stxvl_move):
	Helper function.
	(expand_block_move): Add lxvl/stxvl, vector pair, and
	unaligned VSX.
	* config/rs6000/rs6000.c (rs6000_option_override_internal):
	Default value for -mblock-ops-vector-pair.
	* config/rs6000/rs6000.opt: Add -mblock-ops-vector-pair.
---
 gcc/config/rs6000/rs6000-string.c | 105 ++++++++++++++++++++++++++----
 gcc/config/rs6000/rs6000.c        |  14 +++-
 gcc/config/rs6000/rs6000.opt      |   4 ++
 3 files changed, 107 insertions(+), 16 deletions(-)

Comments

will schmidt Aug. 17, 2020, 3:23 p.m. UTC | #1
On Fri, 2020-08-14 at 17:59 -0500, Aaron Sawdey via Gcc-patches wrote:

Hi,

> This patch adds a few new instructions to inline expansion of
> memcpy/memmove. Generation of all these is controlled by

s/is/are/ ?

> the option -mblock-ops-unaligned-vsx which is set on by default if the
> target has TARGET_EFFICIENT_UNALIGNED_VSX.
>  * unaligned vsx load/store (V2DImode)
>  * unaligned vsx pair load/store (POImode) which is also controlled
>    by -mblock-ops-vector-pair in case it is not wanted at some point.
>    The default for this option is also for it to be on if the target has
>    TARGET_EFFICIENT_UNALIGNED_VSX.

'this option' meaning the -mblock-ops-vector-pair option?


>  * unaligned vsx lxvl/stxvl but generally only to do the remainder
>    of a copy/move we started with some vsx loads/stores, and also prefer
>    to use lb/lh/lw/ld if the remainder is 1/2/4/8 bytes.
> 
> Testing of this is actually accomplished by gcc.dg/memcmp-1.c which does
> two memcpy() for each memcmp(). If the memcpy() calls don't do the right
> thing then the memcmp() will fail unexpectedly.



> 
> Regstrap passed on ppc64le power9 and the memcmp-1.c test passes on
> power10 simulator, ok for trunk?
> 
> Thanks!
>     Aaron
> 
> gcc/ChangeLog:
> 
> 	* config/rs6000/rs6000-string.c (gen_lxvl_stxvl_move):
> 	Helper function.
> 	(expand_block_move): Add lxvl/stxvl, vector pair, and
> 	unaligned VSX.
> 	* config/rs6000/rs6000.c (rs6000_option_override_internal):
> 	Default value for -mblock-ops-vector-pair.
> 	* config/rs6000/rs6000.opt: Add -mblock-ops-vector-pair.
> ---
>  gcc/config/rs6000/rs6000-string.c | 105 ++++++++++++++++++++++++++----
>  gcc/config/rs6000/rs6000.c        |  14 +++-
>  gcc/config/rs6000/rs6000.opt      |   4 ++
>  3 files changed, 107 insertions(+), 16 deletions(-)
> 
> diff --git a/gcc/config/rs6000/rs6000-string.c b/gcc/config/rs6000/rs6000-string.c
> index c35d93180ca..ce6db2ba14d 100644
> --- a/gcc/config/rs6000/rs6000-string.c
> +++ b/gcc/config/rs6000/rs6000-string.c
> @@ -2708,6 +2708,36 @@ gen_lvx_v4si_move (rtx dest, rtx src)
>      return gen_altivec_lvx_v4si_internal (dest, src);
>  }
> 
> +static rtx
> +gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
> +{
> +  gcc_assert (MEM_P (dest) ^ MEM_P (src));
> +  gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
> +  gcc_assert (length <= 16);
> +
> +  bool is_store = MEM_P (dest);
> +
> +  /* If the address form is not a simple register, make it so.  */

Possibly just cosmetic - Would ' /*  Force dest and src to be simple
registers if necessary.  */' make more sense?

> +  if (is_store)
> +    {
> +      dest = XEXP (dest, 0);
> +      if (!REG_P (dest))
> +	dest = force_reg (Pmode, dest);
> +    }
> +  else
> +    {
> +      src = XEXP (src, 0);
> +      if (!REG_P (src))
> +	src = force_reg (Pmode, src);
> +    }



> +
> +  rtx len = force_reg (DImode, gen_int_mode (length, DImode));
> +  if (is_store)
> +    return gen_stxvl (src, dest, len);
> +  else
> +    return  gen_lxvl (dest, src, len);
> +}
> +
>  /* Expand a block move operation, and return 1 if successful.  Return 0
>     if we should let the compiler generate normal code.
> 

ok

> @@ -2750,18 +2780,57 @@ expand_block_move (rtx operands[], bool might_overlap)
>    if (bytes > rs6000_block_move_inline_limit)
>      return 0;
> 
> +  int orig_bytes = bytes;
>    for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
>      {
>        union {
> -	rtx (*movmemsi) (rtx, rtx, rtx, rtx);
>  	rtx (*mov) (rtx, rtx);
> +	rtx (*movlen) (rtx, rtx, int);
>        } gen_func;
>        machine_mode mode = BLKmode;
>        rtx src, dest;
> -
> -      /* Altivec first, since it will be faster than a string move
> -	 when it applies, and usually not significantly larger.  */
> -      if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
> +      bool move_with_length = false;
> +
> +      /* Use POImode for paired vsx load/store.  Use V2DI for single
> +	 unaligned vsx load/store, for consistency with what other
> +	 expansions (compare) already do, and so we can use lxvd2x on
> +	 p8.  Order is VSX pair unaligned, VSX unaligned, Altivec, vsx
> +	 with length < 16 (if allowed), then smaller gpr
> +	 load/store.  */

s/vsx/VSX/
s/smaller// ?


> +
> +      if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX
> +	  && TARGET_BLOCK_OPS_VECTOR_PAIR
> +	  && bytes >= 32
> +	  && (align >= 256 || !STRICT_ALIGNMENT))
> +	{
> +	  move_bytes = 32;
> +	  mode = POImode;
> +	  gen_func.mov = gen_movpoi;
> +	}
> +      else if (TARGET_POWERPC64 && TARGET_BLOCK_OPS_UNALIGNED_VSX
> +	       && VECTOR_MEM_VSX_P (V2DImode)
> +	       && bytes >= 16 && (align >= 128 || !STRICT_ALIGNMENT))
> +	{
> +	  move_bytes = 16;
> +	  mode = V2DImode;
> +	  gen_func.mov = gen_vsx_movv2di_64bit;
> +	}
> +      else if (TARGET_BLOCK_OPS_UNALIGNED_VSX
> +	       && TARGET_POWER10 && bytes < 16
> +	       && orig_bytes > 16
> +	       && !(bytes == 1 || bytes == 2
> +		    || bytes == 4 || bytes == 8)
> +	       && (align >= 128 || !STRICT_ALIGNMENT))
> +	{
> +	  /* Only use lxvl/stxvl if it could replace multiple ordinary
> +	     loads+stores.  Also don't use it unless we likely already
> +	     did one vsx copy so we aren't mixing gpr and vsx.  */
> +	  move_bytes = bytes;
> +	  mode = V16QImode;
> +	  gen_func.movlen = gen_lxvl_stxvl_move;
> +	  move_with_length = true;
> +	}
> +      else if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
>  	{
>  	  move_bytes = 16;
>  	  mode = V4SImode;
> @@ -2818,7 +2887,13 @@ expand_block_move (rtx operands[], bool might_overlap)
>  	  gen_func.mov = gen_movqi;
>  	}
> 
> -      /* Mode is always set to something other than BLKmode by one of the 
> +      /* If we can't succeed in doing it in one pass, we can't do it in the
> +	 might_overlap case.  Bail out and return failure.  */

Looks like this comment is an existing one; since we touch it, I'd
suggest:
s/it/the move/

> +      if (might_overlap && (num_reg+1) >= MAX_MOVE_REG
> +	  && bytes > move_bytes)
> +	return 0;
> +
> +      /* Mode is always set to something other than BLKmode by one of the
>  	 cases of the if statement above.  */
>        gcc_assert (mode != BLKmode);
> 
> @@ -2826,15 +2901,17 @@ expand_block_move (rtx operands[], bool might_overlap)
>        dest = adjust_address (orig_dest, mode, offset);
> 
>        rtx tmp_reg = gen_reg_rtx (mode);
> -      
> -      loads[num_reg]    = (*gen_func.mov) (tmp_reg, src);
> -      stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
> 
> -      /* If we didn't succeed in doing it in one pass, we can't do it in the 
> -	 might_overlap case.  Bail out and return failure.  */
> -      if (might_overlap && num_reg >= MAX_MOVE_REG
> -	  && bytes > move_bytes)
> -	return 0;
> +      if (move_with_length)
> +	{
> +	  loads[num_reg]    = (*gen_func.movlen) (tmp_reg, src, move_bytes);
> +	  stores[num_reg++] = (*gen_func.movlen) (dest, tmp_reg, move_bytes);
> +	}
> +      else
> +	{
> +	  loads[num_reg]    = (*gen_func.mov) (tmp_reg, src);
> +	  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
> +	}
> 
>        /* Emit loads and stores saved up.  */
>        if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)

ok

> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index fe93cf6ff2b..1c1caa90ede 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -4018,6 +4018,14 @@ rs6000_option_override_internal (bool global_init_p)
>  	rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX;
>      }
> 
> +  if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_VECTOR_PAIR))
> +    {
> +      if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX)
> +	rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
> +      else
> +	rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
> +    }
> +

Is TARGET_MMA correct in there?  

otherwise ok


>    /* Use long double size to select the appropriate long double.  We use
>       TYPE_PRECISION to differentiate the 3 different long double types.  We map
>       128 into the precision used for TFmode.  */
> @@ -23222,8 +23230,10 @@ struct rs6000_opt_mask {
>  static struct rs6000_opt_mask const rs6000_opt_masks[] =
>  {
>    { "altivec",			OPTION_MASK_ALTIVEC,		false, true  },
> -  { "block-ops-unaligned-vsx",  OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX,
> -                                                                false, true  },
> +  { "block-ops-unaligned-vsx",	OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX,
> +								false, true  },
> +  { "block-ops-vector-pair",	OPTION_MASK_BLOCK_OPS_VECTOR_PAIR,
> +								false, true  },
>    { "cmpb",			OPTION_MASK_CMPB,		false, true  },
>    { "crypto",			OPTION_MASK_CRYPTO,		false, true  },
>    { "direct-move",		OPTION_MASK_DIRECT_MOVE,	false, true  },

ok.

> diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
> index 9d3e740e930..4354cf1b898 100644
> --- a/gcc/config/rs6000/rs6000.opt
> +++ b/gcc/config/rs6000/rs6000.opt
> @@ -328,6 +328,10 @@ mblock-ops-unaligned-vsx
>  Target Report Mask(BLOCK_OPS_UNALIGNED_VSX) Var(rs6000_isa_flags)
>  Generate unaligned VSX load/store for inline expansion of memcpy/memmove.
> 
> +mblock-ops-vector-pair
> +Target Undocumented Mask(BLOCK_OPS_VECTOR_PAIR) Var(rs6000_isa_flags)
> +Generate unaligned VSX load/store for inline expansion of memcpy/memmove.
> +
ok


Thanks,
-Will


>  mblock-compare-inline-limit=
>  Target Report Var(rs6000_block_compare_inline_limit) Init(63) RejectNegative Joined UInteger Save
>  Max number of bytes to compare without loops.
Segher Boessenkool Aug. 17, 2020, 10:59 p.m. UTC | #2
Hi!

On Fri, Aug 14, 2020 at 05:59:05PM -0500, Aaron Sawdey via Gcc-patches wrote:
> +static rtx
> +gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
> +{
> +  gcc_assert (MEM_P (dest) ^ MEM_P (src));

Maybe just "!="?

> +  gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
> +  gcc_assert (length <= 16);
> +
> +  bool is_store = MEM_P (dest);
> +
> +  /* If the address form is not a simple register, make it so.  */
> +  if (is_store)
> +    {
> +      dest = XEXP (dest, 0);
> +      if (!REG_P (dest))
> +	dest = force_reg (Pmode, dest);

So this changes what "dest" means.

Maybe it is clearer if you have a separate variable "addr"?  That you
can use for dest and src as well, whichever is memory.
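
A rough sketch of how the helper might read with a separate "addr"
(and with "!=" in the first assert), assuming everything else in the
patch stays the same:

static rtx
gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
{
  gcc_assert (MEM_P (dest) != MEM_P (src));
  gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
  gcc_assert (length <= 16);

  bool is_store = MEM_P (dest);

  /* Take the address of whichever operand is memory and force it into
     a register; dest and src keep their original meaning.  */
  rtx addr = XEXP (is_store ? dest : src, 0);
  if (!REG_P (addr))
    addr = force_reg (Pmode, addr);

  rtx len = force_reg (DImode, gen_int_mode (length, DImode));
  if (is_store)
    return gen_stxvl (src, addr, len);
  else
    return gen_lxvl (dest, addr, len);
}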

> +  if (is_store)
> +    return gen_stxvl (src, dest, len);
> +  else
> +    return  gen_lxvl (dest, src, len);

(doubled space -- well I guess you wanted to align the code)

> +      /* If we can't succeed in doing it in one pass, we can't do it in the
> +	 might_overlap case.  Bail out and return failure.  */
> +      if (might_overlap && (num_reg+1) >= MAX_MOVE_REG
> +	  && bytes > move_bytes)
> +	return 0;

The "num_reg+1" isn't obvious, and the comment doesn't say (we usually
write is as "num_reg + 1" fwiw, and the parens are superfluous).


Looks good, thanks!  Okay for trunk with or without such changes.


Segher

Patch

diff --git a/gcc/config/rs6000/rs6000-string.c b/gcc/config/rs6000/rs6000-string.c
index c35d93180ca..ce6db2ba14d 100644
--- a/gcc/config/rs6000/rs6000-string.c
+++ b/gcc/config/rs6000/rs6000-string.c
@@ -2708,6 +2708,36 @@  gen_lvx_v4si_move (rtx dest, rtx src)
     return gen_altivec_lvx_v4si_internal (dest, src);
 }
 
+static rtx
+gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
+{
+  gcc_assert (MEM_P (dest) ^ MEM_P (src));
+  gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
+  gcc_assert (length <= 16);
+
+  bool is_store = MEM_P (dest);
+
+  /* If the address form is not a simple register, make it so.  */
+  if (is_store)
+    {
+      dest = XEXP (dest, 0);
+      if (!REG_P (dest))
+	dest = force_reg (Pmode, dest);
+    }
+  else
+    {
+      src = XEXP (src, 0);
+      if (!REG_P (src))
+	src = force_reg (Pmode, src);
+    }
+
+  rtx len = force_reg (DImode, gen_int_mode (length, DImode));
+  if (is_store)
+    return gen_stxvl (src, dest, len);
+  else
+    return  gen_lxvl (dest, src, len);
+}
+
 /* Expand a block move operation, and return 1 if successful.  Return 0
    if we should let the compiler generate normal code.
 
@@ -2750,18 +2780,57 @@  expand_block_move (rtx operands[], bool might_overlap)
   if (bytes > rs6000_block_move_inline_limit)
     return 0;
 
+  int orig_bytes = bytes;
   for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
     {
       union {
-	rtx (*movmemsi) (rtx, rtx, rtx, rtx);
 	rtx (*mov) (rtx, rtx);
+	rtx (*movlen) (rtx, rtx, int);
       } gen_func;
       machine_mode mode = BLKmode;
       rtx src, dest;
-
-      /* Altivec first, since it will be faster than a string move
-	 when it applies, and usually not significantly larger.  */
-      if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
+      bool move_with_length = false;
+
+      /* Use POImode for paired vsx load/store.  Use V2DI for single
+	 unaligned vsx load/store, for consistency with what other
+	 expansions (compare) already do, and so we can use lxvd2x on
+	 p8.  Order is VSX pair unaligned, VSX unaligned, Altivec, vsx
+	 with length < 16 (if allowed), then smaller gpr
+	 load/store.  */
+
+      if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX
+	  && TARGET_BLOCK_OPS_VECTOR_PAIR
+	  && bytes >= 32
+	  && (align >= 256 || !STRICT_ALIGNMENT))
+	{
+	  move_bytes = 32;
+	  mode = POImode;
+	  gen_func.mov = gen_movpoi;
+	}
+      else if (TARGET_POWERPC64 && TARGET_BLOCK_OPS_UNALIGNED_VSX
+	       && VECTOR_MEM_VSX_P (V2DImode)
+	       && bytes >= 16 && (align >= 128 || !STRICT_ALIGNMENT))
+	{
+	  move_bytes = 16;
+	  mode = V2DImode;
+	  gen_func.mov = gen_vsx_movv2di_64bit;
+	}
+      else if (TARGET_BLOCK_OPS_UNALIGNED_VSX
+	       && TARGET_POWER10 && bytes < 16
+	       && orig_bytes > 16
+	       && !(bytes == 1 || bytes == 2
+		    || bytes == 4 || bytes == 8)
+	       && (align >= 128 || !STRICT_ALIGNMENT))
+	{
+	  /* Only use lxvl/stxvl if it could replace multiple ordinary
+	     loads+stores.  Also don't use it unless we likely already
+	     did one vsx copy so we aren't mixing gpr and vsx.  */
+	  move_bytes = bytes;
+	  mode = V16QImode;
+	  gen_func.movlen = gen_lxvl_stxvl_move;
+	  move_with_length = true;
+	}
+      else if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
 	{
 	  move_bytes = 16;
 	  mode = V4SImode;
@@ -2818,7 +2887,13 @@  expand_block_move (rtx operands[], bool might_overlap)
 	  gen_func.mov = gen_movqi;
 	}
 
-      /* Mode is always set to something other than BLKmode by one of the 
+      /* If we can't succeed in doing it in one pass, we can't do it in the
+	 might_overlap case.  Bail out and return failure.  */
+      if (might_overlap && (num_reg+1) >= MAX_MOVE_REG
+	  && bytes > move_bytes)
+	return 0;
+
+      /* Mode is always set to something other than BLKmode by one of the
 	 cases of the if statement above.  */
       gcc_assert (mode != BLKmode);
 
@@ -2826,15 +2901,17 @@  expand_block_move (rtx operands[], bool might_overlap)
       dest = adjust_address (orig_dest, mode, offset);
 
       rtx tmp_reg = gen_reg_rtx (mode);
-      
-      loads[num_reg]    = (*gen_func.mov) (tmp_reg, src);
-      stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
 
-      /* If we didn't succeed in doing it in one pass, we can't do it in the 
-	 might_overlap case.  Bail out and return failure.  */
-      if (might_overlap && num_reg >= MAX_MOVE_REG
-	  && bytes > move_bytes)
-	return 0;
+      if (move_with_length)
+	{
+	  loads[num_reg]    = (*gen_func.movlen) (tmp_reg, src, move_bytes);
+	  stores[num_reg++] = (*gen_func.movlen) (dest, tmp_reg, move_bytes);
+	}
+      else
+	{
+	  loads[num_reg]    = (*gen_func.mov) (tmp_reg, src);
+	  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
+	}
 
       /* Emit loads and stores saved up.  */
       if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index fe93cf6ff2b..1c1caa90ede 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -4018,6 +4018,14 @@  rs6000_option_override_internal (bool global_init_p)
 	rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX;
     }
 
+  if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_VECTOR_PAIR))
+    {
+      if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX)
+	rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
+      else
+	rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
+    }
+
   /* Use long double size to select the appropriate long double.  We use
      TYPE_PRECISION to differentiate the 3 different long double types.  We map
      128 into the precision used for TFmode.  */
@@ -23222,8 +23230,10 @@  struct rs6000_opt_mask {
 static struct rs6000_opt_mask const rs6000_opt_masks[] =
 {
   { "altivec",			OPTION_MASK_ALTIVEC,		false, true  },
-  { "block-ops-unaligned-vsx",  OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX,
-                                                                false, true  },
+  { "block-ops-unaligned-vsx",	OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX,
+								false, true  },
+  { "block-ops-vector-pair",	OPTION_MASK_BLOCK_OPS_VECTOR_PAIR,
+								false, true  },
   { "cmpb",			OPTION_MASK_CMPB,		false, true  },
   { "crypto",			OPTION_MASK_CRYPTO,		false, true  },
   { "direct-move",		OPTION_MASK_DIRECT_MOVE,	false, true  },
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 9d3e740e930..4354cf1b898 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -328,6 +328,10 @@  mblock-ops-unaligned-vsx
 Target Report Mask(BLOCK_OPS_UNALIGNED_VSX) Var(rs6000_isa_flags)
 Generate unaligned VSX load/store for inline expansion of memcpy/memmove.
 
+mblock-ops-vector-pair
+Target Undocumented Mask(BLOCK_OPS_VECTOR_PAIR) Var(rs6000_isa_flags)
+Generate unaligned VSX load/store for inline expansion of memcpy/memmove.
+
 mblock-compare-inline-limit=
 Target Report Var(rs6000_block_compare_inline_limit) Init(63) RejectNegative Joined UInteger Save
 Max number of bytes to compare without loops.