diff mbox

[rs6000] Avoid vectorizing versioned copy loops with vectorization factor 2

Message ID f4ee0d29-6cdd-6b4f-167a-3fec1b38358f@linux.vnet.ibm.com
State New
Headers show

Commit Message

Bill Schmidt May 3, 2017, 7:43 p.m. UTC
Hi,

We recently became aware of some poor code generation as a result of
unprofitable (for POWER) loop vectorization.  When a loop is simply copying
data with 64-bit loads and stores, vectorizing with 128-bit loads and stores
generally does not provide any benefit on modern POWER processors.
Furthermore, if there is a requirement to version the loop for aliasing,
alignment, etc., the cost of the versioning test is almost certainly a
performance loss for such loops.  The user code example included such a copy
loop, executed only a few times on average, within an outer loop that was
executed many times on average, causing a tremendous slowdown.

This patch very specifically targets these kinds of loops and no others,
and artificially inflates the vectorization cost to ensure vectorization
does not appear profitable.  This is done within the target model cost
hooks to avoid affecting other targets.  A new test case is included that
demonstrates the refusal to vectorize.

We've done SPEC performance testing to verify that the patch does not
degrade such workloads.  Results were all in the noise range.  The
customer code performance loss was verified to have been reversed.

Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no regressions.
Is this ok for trunk?

Thanks,
Bill


[gcc]

2017-05-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* config/rs6000/rs6000.c (rs6000_vect_nonmem): New static var.
	(rs6000_init_cost): Initialize rs6000_vect_nonmem.
	(rs6000_add_stmt_cost): Update rs6000_vect_nonmem.
	(rs6000_finish_cost): Avoid vectorizing simple copy loops with
	VF=2 that require versioning.

[gcc/testsuite]

2017-05-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* gcc.target/powerpc/veresioned-copy-loop.c: New file.

Comments

Bill Schmidt May 4, 2017, 5:26 p.m. UTC | #1
...only without the typo in the ChangeLog below...

> On May 3, 2017, at 2:43 PM, Bill Schmidt <wschmidt@linux.vnet.ibm.com> wrote:
> 
> Hi,
> 
> We recently became aware of some poor code generation as a result of
> unprofitable (for POWER) loop vectorization.  When a loop is simply copying
> data with 64-bit loads and stores, vectorizing with 128-bit loads and stores
> generally does not provide any benefit on modern POWER processors.
> Furthermore, if there is a requirement to version the loop for aliasing,
> alignment, etc., the cost of the versioning test is almost certainly a
> performance loss for such loops.  The user code example included such a copy
> loop, executed only a few times on average, within an outer loop that was
> executed many times on average, causing a tremendous slowdown.
> 
> This patch very specifically targets these kinds of loops and no others,
> and artificially inflates the vectorization cost to ensure vectorization
> does not appear profitable.  This is done within the target model cost
> hooks to avoid affecting other targets.  A new test case is included that
> demonstrates the refusal to vectorize.
> 
> We've done SPEC performance testing to verify that the patch does not
> degrade such workloads.  Results were all in the noise range.  The
> customer code performance loss was verified to have been reversed.
> 
> Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no regressions.
> Is this ok for trunk?
> 
> Thanks,
> Bill
> 
> 
> [gcc]
> 
> 2017-05-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> 
> 	* config/rs6000/rs6000.c (rs6000_vect_nonmem): New static var.
> 	(rs6000_init_cost): Initialize rs6000_vect_nonmem.
> 	(rs6000_add_stmt_cost): Update rs6000_vect_nonmem.
> 	(rs6000_finish_cost): Avoid vectorizing simple copy loops with
> 	VF=2 that require versioning.
> 
> [gcc/testsuite]
> 
> 2017-05-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> 
> 	* gcc.target/powerpc/veresioned-copy-loop.c: New file.

^^ fixed to "versioned".

Bill

> 
> 
> Index: gcc/config/rs6000/rs6000.c
> ===================================================================
> --- gcc/config/rs6000/rs6000.c	(revision 247560)
> +++ gcc/config/rs6000/rs6000.c	(working copy)
> @@ -5873,6 +5873,8 @@ rs6000_density_test (rs6000_cost_data *data)
> 
> /* Implement targetm.vectorize.init_cost.  */
> 
> +static bool rs6000_vect_nonmem;
> +
> static void *
> rs6000_init_cost (struct loop *loop_info)
> {
> @@ -5881,6 +5883,7 @@ rs6000_init_cost (struct loop *loop_info)
>   data->cost[vect_prologue] = 0;
>   data->cost[vect_body]     = 0;
>   data->cost[vect_epilogue] = 0;
> +  rs6000_vect_nonmem = false;
>   return data;
> }
> 
> @@ -5907,6 +5910,19 @@ rs6000_add_stmt_cost (void *data, int count, enum
> 
>       retval = (unsigned) (count * stmt_cost);
>       cost_data->cost[where] += retval;
> +
> +      /* Check whether we're doing something other than just a copy loop.
> +	 Not all such loops may be profitably vectorized; see
> +	 rs6000_finish_cost.  */
> +      if ((where == vect_body
> +	   && (kind == vector_stmt || kind == vec_to_scalar || kind == vec_perm
> +	       || kind == vec_promote_demote || kind == vec_construct
> +	       || kind == scalar_to_vec))
> +	  || (where != vect_body
> +	      && (kind == vec_to_scalar || kind == vec_perm
> +		  || kind == vec_promote_demote || kind == vec_construct
> +		  || kind == scalar_to_vec)))
> +	rs6000_vect_nonmem = true;
>     }
> 
>   return retval;
> @@ -5923,6 +5939,19 @@ rs6000_finish_cost (void *data, unsigned *prologue
>   if (cost_data->loop_info)
>     rs6000_density_test (cost_data);
> 
> +  /* Don't vectorize minimum-vectorization-factor, simple copy loops
> +     that require versioning for any reason.  The vectorization is at
> +     best a wash inside the loop, and the versioning checks make
> +     profitability highly unlikely and potentially quite harmful.  */
> +  if (cost_data->loop_info)
> +    {
> +      loop_vec_info vec_info = loop_vec_info_for_loop (cost_data->loop_info);
> +      if (!rs6000_vect_nonmem
> +	  && LOOP_VINFO_VECT_FACTOR (vec_info) == 2
> +	  && LOOP_REQUIRES_VERSIONING (vec_info))
> +	cost_data->cost[vect_body] += 10000;
> +    }
> +
>   *prologue_cost = cost_data->cost[vect_prologue];
>   *body_cost     = cost_data->cost[vect_body];
>   *epilogue_cost = cost_data->cost[vect_epilogue];
> Index: gcc/testsuite/gcc.target/powerpc/versioned-copy-loop.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/versioned-copy-loop.c	(nonexistent)
> +++ gcc/testsuite/gcc.target/powerpc/versioned-copy-loop.c	(working copy)
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target powerpc_p8vector_ok } */
> +/* { dg-options "-O3 -fdump-tree-vect-details" } */
> +
> +/* Verify that a pure copy loop with a vectorization factor of two
> +   that requires alignment will not be vectorized.  See the cost
> +   model hooks in rs6000.c.  */
> +
> +typedef long unsigned int size_t;
> +typedef unsigned char uint8_t;
> +
> +extern void *memcpy (void *__restrict __dest, const void *__restrict __src,
> +       size_t __n) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2)));
> +
> +void foo (void *dstPtr, const void *srcPtr, void *dstEnd)
> +{
> +    uint8_t *d = (uint8_t*)dstPtr;
> +    const uint8_t *s = (const uint8_t*)srcPtr;
> +    uint8_t* const e = (uint8_t*)dstEnd;
> +
> +    do
> +      {
> +	memcpy (d, s, 8);
> +	d += 8;
> +	s += 8;
> +      }
> +    while (d < e);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
>
Segher Boessenkool May 5, 2017, 3:30 p.m. UTC | #2
Hi Bill,

On Wed, May 03, 2017 at 02:43:09PM -0500, Bill Schmidt wrote:
> We recently became aware of some poor code generation as a result of
> unprofitable (for POWER) loop vectorization.  When a loop is simply copying
> data with 64-bit loads and stores, vectorizing with 128-bit loads and stores
> generally does not provide any benefit on modern POWER processors.
> Furthermore, if there is a requirement to version the loop for aliasing,
> alignment, etc., the cost of the versioning test is almost certainly a
> performance loss for such loops.  The user code example included such a copy
> loop, executed only a few times on average, within an outer loop that was
> executed many times on average, causing a tremendous slowdown.
> 
> This patch very specifically targets these kinds of loops and no others,
> and artificially inflates the vectorization cost to ensure vectorization
> does not appear profitable.  This is done within the target model cost
> hooks to avoid affecting other targets.  A new test case is included that
> demonstrates the refusal to vectorize.
> 
> We've done SPEC performance testing to verify that the patch does not
> degrade such workloads.  Results were all in the noise range.  The
> customer code performance loss was verified to have been reversed.
> 
> Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no regressions.
> Is this ok for trunk?

> 2017-05-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> 
> 	* config/rs6000/rs6000.c (rs6000_vect_nonmem): New static var.
> 	(rs6000_init_cost): Initialize rs6000_vect_nonmem.
> 	(rs6000_add_stmt_cost): Update rs6000_vect_nonmem.
> 	(rs6000_finish_cost): Avoid vectorizing simple copy loops with
> 	VF=2 that require versioning.
> 
> [gcc/testsuite]
> 
> 2017-05-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> 
> 	* gcc.target/powerpc/veresioned-copy-loop.c: New file.

> --- gcc/config/rs6000/rs6000.c	(revision 247560)
> +++ gcc/config/rs6000/rs6000.c	(working copy)
> @@ -5873,6 +5873,8 @@ rs6000_density_test (rs6000_cost_data *data)
>  
>  /* Implement targetm.vectorize.init_cost.  */
>  
> +static bool rs6000_vect_nonmem;

Please put a comment on this, saying what it is for.

> +      /* Check whether we're doing something other than just a copy loop.
> +	 Not all such loops may be profitably vectorized; see
> +	 rs6000_finish_cost.  */
> +      if ((where == vect_body
> +	   && (kind == vector_stmt || kind == vec_to_scalar || kind == vec_perm
> +	       || kind == vec_promote_demote || kind == vec_construct
> +	       || kind == scalar_to_vec))
> +	  || (where != vect_body
> +	      && (kind == vec_to_scalar || kind == vec_perm
> +		  || kind == vec_promote_demote || kind == vec_construct
> +		  || kind == scalar_to_vec)))
> +	rs6000_vect_nonmem = true;

Perhaps

+      if ((kind == vec_to_scalar || kind == vec_perm
+	       || kind == vec_promote_demote || kind == vec_construct
+	       || kind == scalar_to_vec)
+	   || (where == vect_body && kind == vector_stmt))
> +	rs6000_vect_nonmem = true;

if you agree that is clearer.

Okay for trunk with the comment added, and the condition either or not
simplified.  Thanks,


Segher
Bill Schmidt May 13, 2017, 9:44 p.m. UTC | #3
Thanks!  This was committed to trunk last week as r247671.  As we discussed
offline, I've also backported to GCC 7 (r248010) and GCC 6 (r248011).

Bill

> On May 5, 2017, at 10:30 AM, Segher Boessenkool <segher@kernel.crashing.org> wrote:
> 
> Hi Bill,
> 
> On Wed, May 03, 2017 at 02:43:09PM -0500, Bill Schmidt wrote:
>> We recently became aware of some poor code generation as a result of
>> unprofitable (for POWER) loop vectorization.  When a loop is simply copying
>> data with 64-bit loads and stores, vectorizing with 128-bit loads and stores
>> generally does not provide any benefit on modern POWER processors.
>> Furthermore, if there is a requirement to version the loop for aliasing,
>> alignment, etc., the cost of the versioning test is almost certainly a
>> performance loss for such loops.  The user code example included such a copy
>> loop, executed only a few times on average, within an outer loop that was
>> executed many times on average, causing a tremendous slowdown.
>> 
>> This patch very specifically targets these kinds of loops and no others,
>> and artificially inflates the vectorization cost to ensure vectorization
>> does not appear profitable.  This is done within the target model cost
>> hooks to avoid affecting other targets.  A new test case is included that
>> demonstrates the refusal to vectorize.
>> 
>> We've done SPEC performance testing to verify that the patch does not
>> degrade such workloads.  Results were all in the noise range.  The
>> customer code performance loss was verified to have been reversed.
>> 
>> Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no regressions.
>> Is this ok for trunk?
> 
>> 2017-05-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
>> 
>> 	* config/rs6000/rs6000.c (rs6000_vect_nonmem): New static var.
>> 	(rs6000_init_cost): Initialize rs6000_vect_nonmem.
>> 	(rs6000_add_stmt_cost): Update rs6000_vect_nonmem.
>> 	(rs6000_finish_cost): Avoid vectorizing simple copy loops with
>> 	VF=2 that require versioning.
>> 
>> [gcc/testsuite]
>> 
>> 2017-05-03  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
>> 
>> 	* gcc.target/powerpc/veresioned-copy-loop.c: New file.
> 
>> --- gcc/config/rs6000/rs6000.c	(revision 247560)
>> +++ gcc/config/rs6000/rs6000.c	(working copy)
>> @@ -5873,6 +5873,8 @@ rs6000_density_test (rs6000_cost_data *data)
>> 
>> /* Implement targetm.vectorize.init_cost.  */
>> 
>> +static bool rs6000_vect_nonmem;
> 
> Please put a comment on this, saying what it is for.
> 
>> +      /* Check whether we're doing something other than just a copy loop.
>> +	 Not all such loops may be profitably vectorized; see
>> +	 rs6000_finish_cost.  */
>> +      if ((where == vect_body
>> +	   && (kind == vector_stmt || kind == vec_to_scalar || kind == vec_perm
>> +	       || kind == vec_promote_demote || kind == vec_construct
>> +	       || kind == scalar_to_vec))
>> +	  || (where != vect_body
>> +	      && (kind == vec_to_scalar || kind == vec_perm
>> +		  || kind == vec_promote_demote || kind == vec_construct
>> +		  || kind == scalar_to_vec)))
>> +	rs6000_vect_nonmem = true;
> 
> Perhaps
> 
> +      if ((kind == vec_to_scalar || kind == vec_perm
> +	       || kind == vec_promote_demote || kind == vec_construct
> +	       || kind == scalar_to_vec)
> +	   || (where == vect_body && kind == vector_stmt))
>> +	rs6000_vect_nonmem = true;
> 
> if you agree that is clearer.
> 
> Okay for trunk with the comment added, and the condition either or not
> simplified.  Thanks,
> 
> 
> Segher
>
diff mbox

Patch

Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 247560)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -5873,6 +5873,8 @@  rs6000_density_test (rs6000_cost_data *data)
 
 /* Implement targetm.vectorize.init_cost.  */
 
+static bool rs6000_vect_nonmem;
+
 static void *
 rs6000_init_cost (struct loop *loop_info)
 {
@@ -5881,6 +5883,7 @@  rs6000_init_cost (struct loop *loop_info)
   data->cost[vect_prologue] = 0;
   data->cost[vect_body]     = 0;
   data->cost[vect_epilogue] = 0;
+  rs6000_vect_nonmem = false;
   return data;
 }
 
@@ -5907,6 +5910,19 @@  rs6000_add_stmt_cost (void *data, int count, enum
 
       retval = (unsigned) (count * stmt_cost);
       cost_data->cost[where] += retval;
+
+      /* Check whether we're doing something other than just a copy loop.
+	 Not all such loops may be profitably vectorized; see
+	 rs6000_finish_cost.  */
+      if ((where == vect_body
+	   && (kind == vector_stmt || kind == vec_to_scalar || kind == vec_perm
+	       || kind == vec_promote_demote || kind == vec_construct
+	       || kind == scalar_to_vec))
+	  || (where != vect_body
+	      && (kind == vec_to_scalar || kind == vec_perm
+		  || kind == vec_promote_demote || kind == vec_construct
+		  || kind == scalar_to_vec)))
+	rs6000_vect_nonmem = true;
     }
 
   return retval;
@@ -5923,6 +5939,19 @@  rs6000_finish_cost (void *data, unsigned *prologue
   if (cost_data->loop_info)
     rs6000_density_test (cost_data);
 
+  /* Don't vectorize minimum-vectorization-factor, simple copy loops
+     that require versioning for any reason.  The vectorization is at
+     best a wash inside the loop, and the versioning checks make
+     profitability highly unlikely and potentially quite harmful.  */
+  if (cost_data->loop_info)
+    {
+      loop_vec_info vec_info = loop_vec_info_for_loop (cost_data->loop_info);
+      if (!rs6000_vect_nonmem
+	  && LOOP_VINFO_VECT_FACTOR (vec_info) == 2
+	  && LOOP_REQUIRES_VERSIONING (vec_info))
+	cost_data->cost[vect_body] += 10000;
+    }
+
   *prologue_cost = cost_data->cost[vect_prologue];
   *body_cost     = cost_data->cost[vect_body];
   *epilogue_cost = cost_data->cost[vect_epilogue];
Index: gcc/testsuite/gcc.target/powerpc/versioned-copy-loop.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/versioned-copy-loop.c	(nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/versioned-copy-loop.c	(working copy)
@@ -0,0 +1,30 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-O3 -fdump-tree-vect-details" } */
+
+/* Verify that a pure copy loop with a vectorization factor of two
+   that requires alignment will not be vectorized.  See the cost
+   model hooks in rs6000.c.  */
+
+typedef long unsigned int size_t;
+typedef unsigned char uint8_t;
+
+extern void *memcpy (void *__restrict __dest, const void *__restrict __src,
+       size_t __n) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2)));
+
+void foo (void *dstPtr, const void *srcPtr, void *dstEnd)
+{
+    uint8_t *d = (uint8_t*)dstPtr;
+    const uint8_t *s = (const uint8_t*)srcPtr;
+    uint8_t* const e = (uint8_t*)dstEnd;
+
+    do
+      {
+	memcpy (d, s, 8);
+	d += 8;
+	s += 8;
+      }
+    while (d < e);
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */