
Fix PR66251 (wrong code with strided group stores)

Message ID alpine.LSU.2.20.1505221709220.27315@wotan.suse.de
State New

Commit Message

Michael Matz May 22, 2015, 3:13 p.m. UTC
Hi,

between Richi's improvements to grouped accesses and mine to strided
stores there is an interaction that now leads to ICEs and wrong code after
both are in, for instance PR66251.  The added testcase reflects this
situation and uses both narrowing and widening (narrowing would still
ICE, widening right now produces only wrong code).  The patch fixes the
testcase(s).
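
Concretely (this is test1 from the new testcase below), the widening case
is a strided copy between _Complex arrays, so each iteration stores a
two-element group (real and imaginary parts) at a strided location:

/* Minimal sketch of the problematic pattern, taken from the new
   testcase.  */
void
test1 (_Complex double *a, _Complex int *b, int stride, int n)
{
  int i;
  for (i = 0; i < n; i++)
    a[i*stride] = b[i*stride];	/* int -> double widening store */
}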

It's currently regstrapping on x86_64-linux, okay for trunk if that 
passes?


Ciao,
Michael.

	PR middle-end/66251

	* tree-vect-stmts.c (vect_model_store_cost): Handle strided group
	stores.
	(vect_create_vectorized_demotion_stmts): Always set
	STMT_VINFO_VEC_STMT, also with SLP.
	(vectorizable_store): Handle strided group stores.

testsuite/:
	PR middle-end/66251
	* gcc.dg/vect/pr66251.c: New test.

Comments

Richard Biener May 22, 2015, 4:31 p.m. UTC | #1
On May 22, 2015 5:13:16 PM GMT+02:00, Michael Matz <matz@suse.de> wrote:
>Hi,
>
>between Richi's improvements to grouped accesses and mine to strided
>stores there is an interaction that now leads to ICEs and wrong code
>after both are in, for instance PR66251.  The added testcase reflects
>this situation and uses both narrowing and widening (narrowing would
>still ICE, widening right now produces only wrong code).  The patch
>fixes the testcase(s).
>
>It's currently regstrapping on x86_64-linux, okay for trunk if that 
>passes?

OK.

Thanks,
Richard.

Michael Matz May 26, 2015, 4:02 p.m. UTC | #2
Hi,

On Fri, 22 May 2015, Richard Biener wrote:

> >It's currently regstrapping on x86_64-linux, okay for trunk if that 
> >passes?
> 
> OK.

Committed as r223704 now.  I've also tried to add the Fortran runtime
testcase that now exists in the PR, but failed.  The gfortran.dg/vect
testsuite is strange and contains no existing dg-do run tests, and I
failed to make it work in the end (or rather, I failed to make the
testcase fail without the patch).  Outside the testsuite it does fail
before and work after the patch.


Ciao,
Michael.
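
For orientation before reading the patch: the new SLP path in
vectorizable_store splits each vector of nunits elements into
nunits / group_size sub-vectors of group_size elements (ltype) and emits
one strided store per group, instead of one store per scalar element.
A stand-alone sketch of that lowering (illustrative only, with assumed
names, not GCC internals):

#include <string.h>

#define NUNITS 4
#define GROUP_SIZE 2
#define NSTORES (NUNITS / GROUP_SIZE)	/* stores per vector */

static void
store_groups (int *dst, const int vec[NUNITS], int stride)
{
  int i;
  for (i = 0; i < NSTORES; i++)
    /* One store of GROUP_SIZE elements per group; consecutive groups
       are stride group-elements apart in dst.  */
    memcpy (dst + (long) i * stride * GROUP_SIZE,
	    vec + i * GROUP_SIZE, GROUP_SIZE * sizeof (int));
}

int
main (void)
{
  int a[16] = { 0 };
  int v[NUNITS] = { 1, 2, 3, 4 };
  store_groups (a, v, 3);	/* groups land at a[0..1] and a[6..7] */
  return !(a[0] == 1 && a[1] == 2 && a[6] == 3 && a[7] == 4);
}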

Patch

Index: tree-vect-stmts.c
===================================================================
--- tree-vect-stmts.c	(revision 223577)
+++ tree-vect-stmts.c	(working copy)
@@ -1000,7 +1000,8 @@  vect_model_store_cost (stmt_vec_info stm
      equivalent to the cost of GROUP_SIZE separate stores.  If a grouped
      access is instead being provided by a permute-and-store operation,
      include the cost of the permutes.  */
-  if (!store_lanes_p && group_size > 1)
+  if (!store_lanes_p && group_size > 1
+      && !STMT_VINFO_STRIDED_P (stmt_info))
     {
       /* Uses a high and low interleave or shuffle operations for each
 	 needed permute.  */
@@ -1014,21 +1015,24 @@  vect_model_store_cost (stmt_vec_info stm
                          group_size);
     }
 
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   /* Costs of the stores.  */
-  if (STMT_VINFO_STRIDED_P (stmt_info))
+  if (STMT_VINFO_STRIDED_P (stmt_info)
+      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
       /* N scalar stores plus extracting the elements.  */
-      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
       inside_cost += record_stmt_cost (body_cost_vec,
 				       ncopies * TYPE_VECTOR_SUBPARTS (vectype),
 				       scalar_store, stmt_info, 0, vect_body);
-      inside_cost += record_stmt_cost (body_cost_vec,
-				       ncopies * TYPE_VECTOR_SUBPARTS (vectype),
-				       vec_to_scalar, stmt_info, 0, vect_body);
     }
   else
     vect_get_store_cost (first_dr, ncopies, &inside_cost, body_cost_vec);
 
+  if (STMT_VINFO_STRIDED_P (stmt_info))
+    inside_cost += record_stmt_cost (body_cost_vec,
+				     ncopies * TYPE_VECTOR_SUBPARTS (vectype),
+				     vec_to_scalar, stmt_info, 0, vect_body);
+
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
                      "vect_model_store_cost: inside_cost = %d, "
@@ -3377,15 +3381,13 @@  vect_create_vectorized_demotion_stmts (v
 	     (or in STMT_VINFO_RELATED_STMT chain).  */
 	  if (slp_node)
 	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+
+	  if (!*prev_stmt_info)
+	    STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
 	  else
-	    {
-	      if (!*prev_stmt_info)
-		STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
-	      else
-		STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt;
+	    STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt;
 
-	      *prev_stmt_info = vinfo_for_stmt (new_stmt);
-	    }
+	  *prev_stmt_info = vinfo_for_stmt (new_stmt);
 	}
     }
 
@@ -5155,15 +5157,27 @@  vectorizable_store (gimple stmt, gimple_
     {
       grouped_store = true;
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
-      if (!slp && !PURE_SLP_STMT (stmt_info))
+      group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
+      if (!slp
+	  && !PURE_SLP_STMT (stmt_info)
+	  && !STMT_VINFO_STRIDED_P (stmt_info))
 	{
-	  group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
 	  if (vect_store_lanes_supported (vectype, group_size))
 	    store_lanes_p = true;
 	  else if (!vect_grouped_store_supported (vectype, group_size))
 	    return false;
 	}
 
+      if (STMT_VINFO_STRIDED_P (stmt_info)
+	  && (slp || PURE_SLP_STMT (stmt_info))
+	  && (group_size > nunits
+	      || nunits % group_size != 0))
+	{
+	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			   "unhandled strided group store\n");
+	  return false;
+	}
+
       if (first_stmt == stmt)
 	{
           /* STMT is the leader of the group. Check the operands of all the
@@ -5286,10 +5300,23 @@  vectorizable_store (gimple stmt, gimple_
 	     ...
          */
 
+      unsigned nstores = nunits;
+      tree ltype = elem_type;
+      if (slp)
+	{
+	  nstores = nunits / group_size;
+	  if (group_size < nunits)
+	    ltype = build_vector_type (elem_type, group_size);
+	  else
+	    ltype = vectype;
+	  ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
+	  ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+	}
+
       ivstep = stride_step;
       ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
 			    build_int_cst (TREE_TYPE (ivstep),
-					   ncopies * nunits));
+					   ncopies * nstores));
 
       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 
@@ -5315,22 +5342,22 @@  vectorizable_store (gimple stmt, gimple_
 	  else
 	    vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
 
-	  for (i = 0; i < nunits; i++)
+	  for (i = 0; i < nstores; i++)
 	    {
 	      tree newref, newoff;
 	      gimple incr, assign;
-	      tree size = TYPE_SIZE (elem_type);
+	      tree size = TYPE_SIZE (ltype);
 	      /* Extract the i'th component.  */
 	      tree pos = fold_build2 (MULT_EXPR, bitsizetype, bitsize_int (i),
 				      size);
-	      tree elem = fold_build3 (BIT_FIELD_REF, elem_type, vec_oprnd,
+	      tree elem = fold_build3 (BIT_FIELD_REF, ltype, vec_oprnd,
 				       size, pos);
 
 	      elem = force_gimple_operand_gsi (gsi, elem, true,
 					       NULL_TREE, true,
 					       GSI_SAME_STMT);
 
-	      newref = build2 (MEM_REF, TREE_TYPE (vectype),
+	      newref = build2 (MEM_REF, ltype,
 			       running_off, alias_off);
 
 	      /* And store it to *running_off.  */
Index: testsuite/gcc.dg/vect/pr66251.c
===================================================================
--- testsuite/gcc.dg/vect/pr66251.c	(revision 0)
+++ testsuite/gcc.dg/vect/pr66251.c	(working copy)
@@ -0,0 +1,79 @@ 
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-require-effective-target vect_floatint_cvt } */
+/* { dg-require-effective-target vect_intfloat_cvt } */
+/* { dg-require-effective-target vect_pack_trunc } */
+/* { dg-require-effective-target vect_unpack } */
+/* { dg-require-effective-target vect_hw_misalign } */
+
+#include "tree-vect.h"
+
+void __attribute__((noinline,noclone))
+test1(_Complex double *a, _Complex int *b, int stride, int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    {
+      a[i*stride] = b[i*stride];
+    }
+}
+
+void __attribute__((noinline,noclone))
+test2(_Complex int *a, _Complex double *b, int stride, int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    {
+      a[i*stride] = b[i*stride];
+    }
+}
+
+_Complex int ia[256];
+_Complex double da[256];
+
+extern void abort (void);
+
+int main ()
+{
+  int i;
+  int stride;
+
+  check_vect ();
+
+  for (stride = 1; stride < 15; stride++)
+    {
+      for (i = 0; i < 256; i++)
+	{
+	  __real__ ia[i] = (i + stride) % 19;
+	  __imag__ ia[i] = (i + stride) % 23;
+	  __asm__ volatile ("");
+	}
+
+      test1(da, ia, stride, 256/stride);
+
+      for (i = 0; i < 256/stride; i++)
+	{
+	  if (da[i*stride] != ia[i*stride])
+	    abort ();
+	}
+
+      for (i = 0; i < 256; i++)
+	{
+	  __real__ da[i] = (i + stride + 1) % 29;
+	  __imag__ da[i] = (i + stride + 1) % 31;
+	  __asm__ volatile ("");
+	}
+
+      test2(ia, da, stride, 256/stride);
+
+      for (i = 0; i < 256/stride; i++)
+	{
+	  if (da[i*stride] != ia[i*stride])
+	    abort ();
+	}
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */