diff mbox

Handle unpropagated assignments in SLP

Message ID 87h900yznu.fsf@linaro.org
State New
Headers show

Commit Message

Richard Sandiford June 1, 2017, 6:45 a.m. UTC
Some of the SVE patches extend SLP to predicated operations created by
ifcvt.  However, ifcvt currently forces the mask into a temporary:

		mask = ifc_temp_var (TREE_TYPE (mask), mask, &gsi);

and at the moment SLP doesn't handle simple assignments like:

   SSA_NAME = SSA_NAME
   SSA_NAME = <constant>

(It does of course handle:

   SSA_NAME = SSA_NAME op SSA_NAME
   SSA_NAME = SSA_NAME op <constant>)

I realise copy propagation should usually ensure that these simple
assignments don't occur, but normal loop vectorisation handles them
just fine, and SLP does too once we get over the initial validity check.
I thought this patch might be useful even if we decide that we don't want
ifcvt to create a temporary mask in such cases.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Thanks,
Richard


2017-06-01  Richard Sandiford  <richard.sandiford@linaro.org>

gcc/
	* tree-vect-slp.c (vect_build_slp_tree_1): Allow mixtures of SSA
	names and constants, without treating them as separate operations.
	Explicitly reject stores.

gcc/testsuite/
	* gcc.dg/vect/slp-temp-1.c: New test.
	* gcc.dg/vect/slp-temp-2.c: Likewise.
	* gcc.dg/vect/slp-temp-3.c: Likewise.

Comments

Richard Biener June 1, 2017, 8:03 a.m. UTC | #1
On Thu, Jun 1, 2017 at 8:45 AM, Richard Sandiford
<richard.sandiford@linaro.org> wrote:
> Some of the SVE patches extend SLP to predicated operations created by
> ifcvt.  However, ifcvt currently forces the mask into a temporary:
>
>                 mask = ifc_temp_var (TREE_TYPE (mask), mask, &gsi);
>
> and at the moment SLP doesn't handle simple assignments like:
>
>    SSA_NAME = SSA_NAME
>    SSA_NAME = <constant>
>
> (It does of course handle:
>
>    SSA_NAME = SSA_NAME op SSA_NAME
>    SSA_NAME = SSA_NAME op <constant>)
>
> I realise copy propagation should usually ensure that these simple
> assignments don't occur, but normal loop vectorisation handles them
> just fine, and SLP does too once we get over the initial validity check.
> I thought this patch might be useful even if we decide that we don't want
> ifcvt to create a temporary mask in such cases.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
>
> Thanks,
> Richard
>
>
> 2017-06-01  Richard Sandiford  <richard.sandiford@linaro.org>
>
> gcc/
>         * tree-vect-slp.c (vect_build_slp_tree_1): Allow mixtures of SSA
>         names and constants, without treating them as separate operations.
>         Explicitly reject stores.
>
> gcc/testsuite/
>         * gcc.dg/vect/slp-temp-1.c: New test.
>         * gcc.dg/vect/slp-temp-2.c: Likewise.
>         * gcc.dg/vect/slp-temp-3.c: Likewise.
>
> Index: gcc/tree-vect-slp.c
> ===================================================================
> --- gcc/tree-vect-slp.c 2017-05-18 07:51:12.387750673 +0100
> +++ gcc/tree-vect-slp.c 2017-06-01 07:21:44.094320070 +0100
> @@ -671,6 +671,13 @@ vect_build_slp_tree_1 (vec_info *vinfo,
>                first_op1 = gimple_assign_rhs2 (stmt);
>              }
>         }
> +      else if ((TREE_CODE_CLASS (rhs_code) == tcc_constant
> +               || rhs_code == SSA_NAME)
> +              && (TREE_CODE_CLASS (first_stmt_code) == tcc_constant
> +                  || first_stmt_code == SSA_NAME))
> +       /* Merging two simple rvalues is OK and doesn't count as two
> +          operations.  */
> +       ;


But this doesn't help in the case one stmt has the copy/constant
propagated and one not ...

>        else
>         {
>           if (first_stmt_code != rhs_code
> @@ -800,11 +807,14 @@ vect_build_slp_tree_1 (vec_info *vinfo,
>             }
>
>           /* Not memory operation.  */
> -         if (TREE_CODE_CLASS (rhs_code) != tcc_binary
> -             && TREE_CODE_CLASS (rhs_code) != tcc_unary
> -             && TREE_CODE_CLASS (rhs_code) != tcc_expression
> -             && TREE_CODE_CLASS (rhs_code) != tcc_comparison
> -             && rhs_code != CALL_EXPR)
> +         if (REFERENCE_CLASS_P (lhs)
> +             || (TREE_CODE_CLASS (rhs_code) != tcc_binary
> +                 && TREE_CODE_CLASS (rhs_code) != tcc_unary
> +                 && TREE_CODE_CLASS (rhs_code) != tcc_expression
> +                 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
> +                 && TREE_CODE_CLASS (rhs_code) != tcc_constant
> +                 && rhs_code != CALL_EXPR
> +                 && rhs_code != SSA_NAME))

I think this whole block is dead code ... we can't ever visit stores and the
only case the rhs_code == tcc_reference check above would miss is
plain decls (but non-indexed decls should be rejected by dataref analysis
already).

I think a better fix is to ensure we do not have non-copy/constant propagated
IL fed into the vectorizer.

Richard.

>             {
>               if (dump_enabled_p ())
>                 {
> Index: gcc/testsuite/gcc.dg/vect/slp-temp-1.c
> ===================================================================
> --- /dev/null   2017-06-01 07:09:35.344016119 +0100
> +++ gcc/testsuite/gcc.dg/vect/slp-temp-1.c      2017-06-01 07:39:24.406603119 +0100
> @@ -0,0 +1,71 @@
> +/* { dg-do run { target { lp64 || ilp32 } } } */
> +/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */
> +
> +void __GIMPLE (startwith ("loop"))
> +f (int *x, int n)
> +{
> +  int i_in;
> +  int i_out;
> +  int double_i;
> +
> +  long unsigned int index_0;
> +  long unsigned int offset_0;
> +  int *addr_0;
> +  int temp_0;
> +
> +  long unsigned int index_1;
> +  long unsigned int offset_1;
> +  int *addr_1;
> +  int temp_1;
> +
> + entry:
> +  goto loop;
> +
> + loop:
> +  i_in = __PHI (entry: 0, latch: i_out);
> +  double_i = i_in * 2;
> +
> +  index_0 = (long unsigned int) double_i;
> +  offset_0 = index_0 * 4ul;
> +  addr_0 = x_1(D) + offset_0;
> +  temp_0 = 1;
> +  *addr_0 = temp_0;
> +
> +  index_1 = index_0 + 1ul;
> +  offset_1 = index_1 * 4ul;
> +  addr_1 = x_1(D) + offset_1;
> +  temp_1 = 3;
> +  *addr_1 = temp_1;
> +
> +  i_out = i_in + 1;
> +  if (n_2(D) > i_out)
> +    goto latch;
> +  else
> +    goto exit;
> +
> + latch:
> +  goto loop;
> +
> + exit:
> +  return;
> +}
> +
> +#define N 1024
> +
> +int
> +main (void)
> +{
> +  int a[N * 2];
> +  f (a, N);
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (a[i * 2] != 1
> +         || a[i * 2 + 1] != 3)
> +       __builtin_abort ();
> +      asm volatile ("" ::: "memory");
> +    }
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */
> Index: gcc/testsuite/gcc.dg/vect/slp-temp-2.c
> ===================================================================
> --- /dev/null   2017-06-01 07:09:35.344016119 +0100
> +++ gcc/testsuite/gcc.dg/vect/slp-temp-2.c      2017-06-01 07:39:30.914219838 +0100
> @@ -0,0 +1,71 @@
> +/* { dg-do run { target { lp64 || ilp32 } } } */
> +/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */
> +
> +void __GIMPLE (startwith ("loop"))
> +f (int *x, int n, int foo)
> +{
> +  int i_in;
> +  int i_out;
> +  int double_i;
> +
> +  long unsigned int index_0;
> +  long unsigned int offset_0;
> +  int *addr_0;
> +  int temp_0;
> +
> +  long unsigned int index_1;
> +  long unsigned int offset_1;
> +  int *addr_1;
> +  int temp_1;
> +
> + entry:
> +  goto loop;
> +
> + loop:
> +  i_in = __PHI (entry: 0, latch: i_out);
> +  double_i = i_in * 2;
> +
> +  index_0 = (long unsigned int) double_i;
> +  offset_0 = index_0 * 4ul;
> +  addr_0 = x_1(D) + offset_0;
> +  temp_0 = 1;
> +  *addr_0 = temp_0;
> +
> +  index_1 = index_0 + 1ul;
> +  offset_1 = index_1 * 4ul;
> +  addr_1 = x_1(D) + offset_1;
> +  temp_1 = foo_2(D);
> +  *addr_1 = temp_1;
> +
> +  i_out = i_in + 1;
> +  if (n_3(D) > i_out)
> +    goto latch;
> +  else
> +    goto exit;
> +
> + latch:
> +  goto loop;
> +
> + exit:
> +  return;
> +}
> +
> +#define N 1024
> +
> +int
> +main (void)
> +{
> +  int a[N * 2];
> +  f (a, N, 11);
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (a[i * 2] != 1
> +         || a[i * 2 + 1] != 11)
> +       __builtin_abort ();
> +      asm volatile ("" ::: "memory");
> +    }
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */
> Index: gcc/testsuite/gcc.dg/vect/slp-temp-3.c
> ===================================================================
> --- /dev/null   2017-06-01 07:09:35.344016119 +0100
> +++ gcc/testsuite/gcc.dg/vect/slp-temp-3.c      2017-06-01 07:39:40.925589561 +0100
> @@ -0,0 +1,84 @@
> +/* { dg-do run { target { lp64 || ilp32 } } } */
> +/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */
> +
> +void __GIMPLE (startwith ("loop")) __attribute__ ((noinline, noclone))
> +f (int *x, int n)
> +{
> +  int i_in;
> +  int i_out;
> +  int double_i;
> +
> +  long unsigned int index_0;
> +  long unsigned int offset_0;
> +  int *addr_0;
> +  int old_0;
> +  int new_0;
> +  int temp_0;
> +
> +  long unsigned int index_1;
> +  long unsigned int offset_1;
> +  int *addr_1;
> +  int old_1;
> +  int new_1;
> +  int temp_1;
> +
> + entry:
> +  goto loop;
> +
> + loop:
> +  i_in = __PHI (entry: 0, latch: i_out);
> +  double_i = i_in * 2;
> +
> +  index_0 = (long unsigned int) double_i;
> +  offset_0 = index_0 * 4ul;
> +  addr_0 = x_1(D) + offset_0;
> +  old_0 = *addr_0;
> +  new_0 = old_0 + 1;
> +  temp_0 = new_0;
> +  *addr_0 = temp_0;
> +
> +  index_1 = index_0 + 1ul;
> +  offset_1 = index_1 * 4ul;
> +  addr_1 = x_1(D) + offset_1;
> +  old_1 = *addr_1;
> +  new_1 = old_1 + 3;
> +  temp_1 = new_1;
> +  *addr_1 = temp_1;
> +
> +  i_out = i_in + 1;
> +  if (n_2(D) > i_out)
> +    goto latch;
> +  else
> +    goto exit;
> +
> + latch:
> +  goto loop;
> +
> + exit:
> +  return;
> +}
> +
> +#define N 1024
> +
> +int
> +main (void)
> +{
> +  int a[N * 2];
> +  for (int i = 0; i < N * 2; ++i)
> +    {
> +      a[i] = i * 4;
> +      asm volatile ("" ::: "memory");
> +    }
> +  f (a, N);
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (a[i * 2] != i * 8 + 1
> +         || a[i * 2 + 1] != i * 8 + 7)
> +       __builtin_abort ();
> +      asm volatile ("" ::: "memory");
> +    }
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */
diff mbox

Patch

Index: gcc/tree-vect-slp.c
===================================================================
--- gcc/tree-vect-slp.c	2017-05-18 07:51:12.387750673 +0100
+++ gcc/tree-vect-slp.c	2017-06-01 07:21:44.094320070 +0100
@@ -671,6 +671,13 @@  vect_build_slp_tree_1 (vec_info *vinfo,
               first_op1 = gimple_assign_rhs2 (stmt);
             }
 	}
+      else if ((TREE_CODE_CLASS (rhs_code) == tcc_constant
+		|| rhs_code == SSA_NAME)
+	       && (TREE_CODE_CLASS (first_stmt_code) == tcc_constant
+		   || first_stmt_code == SSA_NAME))
+	/* Merging two simple rvalues is OK and doesn't count as two
+	   operations.  */
+	;
       else
 	{
 	  if (first_stmt_code != rhs_code
@@ -800,11 +807,14 @@  vect_build_slp_tree_1 (vec_info *vinfo,
 	    }
 
 	  /* Not memory operation.  */
-	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
-	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
-	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
-	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
-	      && rhs_code != CALL_EXPR)
+	  if (REFERENCE_CLASS_P (lhs)
+	      || (TREE_CODE_CLASS (rhs_code) != tcc_binary
+		  && TREE_CODE_CLASS (rhs_code) != tcc_unary
+		  && TREE_CODE_CLASS (rhs_code) != tcc_expression
+		  && TREE_CODE_CLASS (rhs_code) != tcc_comparison
+		  && TREE_CODE_CLASS (rhs_code) != tcc_constant
+		  && rhs_code != CALL_EXPR
+		  && rhs_code != SSA_NAME))
 	    {
 	      if (dump_enabled_p ())
 		{
Index: gcc/testsuite/gcc.dg/vect/slp-temp-1.c
===================================================================
--- /dev/null	2017-06-01 07:09:35.344016119 +0100
+++ gcc/testsuite/gcc.dg/vect/slp-temp-1.c	2017-06-01 07:39:24.406603119 +0100
@@ -0,0 +1,71 @@ 
+/* { dg-do run { target { lp64 || ilp32 } } } */
+/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */
+
+void __GIMPLE (startwith ("loop"))
+f (int *x, int n)
+{
+  int i_in;
+  int i_out;
+  int double_i;
+
+  long unsigned int index_0;
+  long unsigned int offset_0;
+  int *addr_0;
+  int temp_0;
+
+  long unsigned int index_1;
+  long unsigned int offset_1;
+  int *addr_1;
+  int temp_1;
+
+ entry:
+  goto loop;
+
+ loop:
+  i_in = __PHI (entry: 0, latch: i_out);
+  double_i = i_in * 2;
+
+  index_0 = (long unsigned int) double_i;
+  offset_0 = index_0 * 4ul;
+  addr_0 = x_1(D) + offset_0;
+  temp_0 = 1;
+  *addr_0 = temp_0;
+
+  index_1 = index_0 + 1ul;
+  offset_1 = index_1 * 4ul;
+  addr_1 = x_1(D) + offset_1;
+  temp_1 = 3;
+  *addr_1 = temp_1;
+
+  i_out = i_in + 1;
+  if (n_2(D) > i_out)
+    goto latch;
+  else
+    goto exit;
+
+ latch:
+  goto loop;
+
+ exit:
+  return;
+}
+
+#define N 1024
+
+int
+main (void)
+{
+  int a[N * 2];
+  f (a, N);
+  for (int i = 0; i < N; ++i)
+    {
+      if (a[i * 2] != 1
+	  || a[i * 2 + 1] != 3)
+	__builtin_abort ();
+      asm volatile ("" ::: "memory");
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */
Index: gcc/testsuite/gcc.dg/vect/slp-temp-2.c
===================================================================
--- /dev/null	2017-06-01 07:09:35.344016119 +0100
+++ gcc/testsuite/gcc.dg/vect/slp-temp-2.c	2017-06-01 07:39:30.914219838 +0100
@@ -0,0 +1,71 @@ 
+/* { dg-do run { target { lp64 || ilp32 } } } */
+/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */
+
+void __GIMPLE (startwith ("loop"))
+f (int *x, int n, int foo)
+{
+  int i_in;
+  int i_out;
+  int double_i;
+
+  long unsigned int index_0;
+  long unsigned int offset_0;
+  int *addr_0;
+  int temp_0;
+
+  long unsigned int index_1;
+  long unsigned int offset_1;
+  int *addr_1;
+  int temp_1;
+
+ entry:
+  goto loop;
+
+ loop:
+  i_in = __PHI (entry: 0, latch: i_out);
+  double_i = i_in * 2;
+
+  index_0 = (long unsigned int) double_i;
+  offset_0 = index_0 * 4ul;
+  addr_0 = x_1(D) + offset_0;
+  temp_0 = 1;
+  *addr_0 = temp_0;
+
+  index_1 = index_0 + 1ul;
+  offset_1 = index_1 * 4ul;
+  addr_1 = x_1(D) + offset_1;
+  temp_1 = foo_2(D);
+  *addr_1 = temp_1;
+
+  i_out = i_in + 1;
+  if (n_3(D) > i_out)
+    goto latch;
+  else
+    goto exit;
+
+ latch:
+  goto loop;
+
+ exit:
+  return;
+}
+
+#define N 1024
+
+int
+main (void)
+{
+  int a[N * 2];
+  f (a, N, 11);
+  for (int i = 0; i < N; ++i)
+    {
+      if (a[i * 2] != 1
+	  || a[i * 2 + 1] != 11)
+	__builtin_abort ();
+      asm volatile ("" ::: "memory");
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */
Index: gcc/testsuite/gcc.dg/vect/slp-temp-3.c
===================================================================
--- /dev/null	2017-06-01 07:09:35.344016119 +0100
+++ gcc/testsuite/gcc.dg/vect/slp-temp-3.c	2017-06-01 07:39:40.925589561 +0100
@@ -0,0 +1,84 @@ 
+/* { dg-do run { target { lp64 || ilp32 } } } */
+/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */
+
+void __GIMPLE (startwith ("loop")) __attribute__ ((noinline, noclone))
+f (int *x, int n)
+{
+  int i_in;
+  int i_out;
+  int double_i;
+
+  long unsigned int index_0;
+  long unsigned int offset_0;
+  int *addr_0;
+  int old_0;
+  int new_0;
+  int temp_0;
+
+  long unsigned int index_1;
+  long unsigned int offset_1;
+  int *addr_1;
+  int old_1;
+  int new_1;
+  int temp_1;
+
+ entry:
+  goto loop;
+
+ loop:
+  i_in = __PHI (entry: 0, latch: i_out);
+  double_i = i_in * 2;
+
+  index_0 = (long unsigned int) double_i;
+  offset_0 = index_0 * 4ul;
+  addr_0 = x_1(D) + offset_0;
+  old_0 = *addr_0;
+  new_0 = old_0 + 1;
+  temp_0 = new_0;
+  *addr_0 = temp_0;
+
+  index_1 = index_0 + 1ul;
+  offset_1 = index_1 * 4ul;
+  addr_1 = x_1(D) + offset_1;
+  old_1 = *addr_1;
+  new_1 = old_1 + 3;
+  temp_1 = new_1;
+  *addr_1 = temp_1;
+
+  i_out = i_in + 1;
+  if (n_2(D) > i_out)
+    goto latch;
+  else
+    goto exit;
+
+ latch:
+  goto loop;
+
+ exit:
+  return;
+}
+
+#define N 1024
+
+int
+main (void)
+{
+  int a[N * 2];
+  for (int i = 0; i < N * 2; ++i)
+    {
+      a[i] = i * 4;
+      asm volatile ("" ::: "memory");
+    }
+  f (a, N);
+  for (int i = 0; i < N; ++i)
+    {
+      if (a[i * 2] != i * 8 + 1
+	  || a[i * 2 + 1] != i * 8 + 7)
+	__builtin_abort ();
+      asm volatile ("" ::: "memory");
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */