tree-optimization/114107 - avoid peeling for gaps in more cases

Message ID 20240607115857.4B39B133F3@imap1.dmz-prg2.suse.org

Commit Message

Richard Biener June 7, 2024, 11:58 a.m. UTC
The following refactors the code that detects necessary peeling for
gaps, in particular the PR103116 case where there is no gap but
the group size is smaller than the vector size.  The testcase in
PR114107 shows we fail to SLP

  for (int i=0; i<n; i++)
    for (int k=0; k<4; k++)
      data[4*i+k] *= factor[i];

because peeling one scalar iteration isn't enough to cover a gap
of 3 elements of factor[i].  But the code detecting this is placed
after the logic that detects cases we already handle properly, since
for V4DFmode vectorization we would code-generate
{ factor[i], 0., 0., 0. }.  In fact the check for whether peeling a
single iteration is enough seems improperly guarded; it should apply
to all cases.
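
For concreteness, below is a minimal stand-alone sketch of that
sufficiency test as the patch states it further down,
(group_size * cvf) % cnunits + group_size - gap < cnunits meaning a
single peeled iteration is NOT enough.  The function name and the
plain unsigned parameters are hypothetical stand-ins for GCC's
poly_uint64 quantities, and the plugged-in numbers are my reading of
the testcase above rather than vectorizer dump output:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical restatement of the guard in get_group_load_store_type:
   peeling one scalar iteration suffices iff the elements it frees
   cover the overrun of the last vector access.  */
static bool
single_peel_suffices (unsigned group_size, unsigned vf,
                      unsigned nunits, unsigned gap)
{
  return (group_size * vf) % nunits + group_size - gap >= nunits;
}

int
main (void)
{
  /* The factor[i] load: a single-element group (group_size 1, gap 0)
     loaded as V4DF (nunits 4) at an assumed VF of 1; one peeled
     scalar iteration cannot cover the 3 excess lanes.  */
  printf ("%d\n", single_peel_suffices (1, 1, 4, 0));  /* prints 0 */
  /* The data[4*i+k] group fills the vector exactly, so a single
     peeled iteration is trivially sufficient.  */
  printf ("%d\n", single_peel_suffices (4, 1, 4, 0));  /* prints 1 */
  return 0;
}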

I'm not sure we correctly handle VMAT_CONTIGUOUS_REVERSE, but I
checked that VMAT_STRIDED_SLP and VMAT_ELEMENTWISE correctly avoid
touching excess elements under the same constraints as
VMAT_CONTIGUOUS.  The PR103116 check, however, only covered
VMAT_CONTIGUOUS, which I think is too optimistic.

With this change we can use SLP for the above testcase, and the
PR103116 testcases no longer require an epilogue on x86-64.  This
might differ on other targets, so I made those testcases FAIL at
runtime only, instead of relying on dump scanning, which there is
currently no easy way to properly constrain.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

	PR tree-optimization/114107
	PR tree-optimization/110445
	* tree-vect-stmts.cc (get_group_load_store_type): Refactor
	contiguous access case.  Make sure peeling for gap constraints
	are always tested and consistently relax when we know we can
	avoid touching excess elements during code generation.

	* gcc.dg/vect/pr114107.c: New testcase.
	* gcc.dg/vect/pr103116-1.c: Adjust.
	* gcc.dg/vect/pr103116-2.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/pr103116-1.c |   4 +-
 gcc/testsuite/gcc.dg/vect/pr103116-2.c |   3 +-
 gcc/testsuite/gcc.dg/vect/pr114107.c   |  31 +++++++
 gcc/tree-vect-stmts.cc                 | 110 ++++++++++++-------------
 4 files changed, 90 insertions(+), 58 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr114107.c

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/pr103116-1.c b/gcc/testsuite/gcc.dg/vect/pr103116-1.c
index d3639fc8cfd..280ec57eb4b 100644
--- a/gcc/testsuite/gcc.dg/vect/pr103116-1.c
+++ b/gcc/testsuite/gcc.dg/vect/pr103116-1.c
@@ -47,4 +47,6 @@  main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump "Data access with gaps requires scalar epilogue loop" "vect" { target { vect_perm && vect_int } } } } */
+/* When the target can compose a vector from its half we do not require
+   a scalar epilogue, but there's no effective target for this.  */
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target { vect_perm && vect_int } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr103116-2.c b/gcc/testsuite/gcc.dg/vect/pr103116-2.c
index aa9797a9407..ee5b82922f9 100644
--- a/gcc/testsuite/gcc.dg/vect/pr103116-2.c
+++ b/gcc/testsuite/gcc.dg/vect/pr103116-2.c
@@ -56,4 +56,5 @@  main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump "peeling for gaps insufficient for access" "vect" { target { vect_perm_short } } } } */
+/* Whether or not peeling for gaps is required depends on the ability of
+   the target to compose a vector from two-element bits.  */
diff --git a/gcc/testsuite/gcc.dg/vect/pr114107.c b/gcc/testsuite/gcc.dg/vect/pr114107.c
new file mode 100644
index 00000000000..65175d9a680
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr114107.c
@@ -0,0 +1,31 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+
+void rescale_x4 (double* __restrict data,
+		 const double* __restrict factor, int n)
+{
+  for (int i=0; i<n; i++)
+    {
+      data[4*i] *= factor[i];
+      data[4*i+1] *= factor[i];
+      data[4*i+2] *= factor[i];
+      data[4*i+3] *= factor[i];
+    }
+}
+
+void rescale_x4_s (double* __restrict data,
+		   const double* __restrict factor, int n, int s)
+{
+  for (int i=0; i<n; i++)
+    {
+      data[s*i] *= factor[s*i];
+      data[s*i+1] *= factor[s*i];
+      data[s*i+2] *= factor[s*i];
+      data[s*i+3] *= factor[s*i];
+    }
+}
+
+/* All targets should be able to compose a vector from scalar elements, but
+   depending on vector size different permutes might be necessary.  */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_perm } } } */
+/* { dg-final { scan-tree-dump-not "Data access with gaps requires scalar epilogue loop" "vect" { target vect_perm } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 5098b7fab6a..69ba906a33f 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2047,6 +2047,36 @@  get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 	}
       else
 	{
+	  int cmp = compare_step_with_zero (vinfo, stmt_info);
+	  if (cmp < 0)
+	    {
+	      if (single_element_p)
+		/* ???  The VMAT_CONTIGUOUS_REVERSE code generation is
+		   only correct for single element "interleaving" SLP.  */
+		*memory_access_type = get_negative_load_store_type
+			     (vinfo, stmt_info, vectype, vls_type, 1, poffset);
+	      else
+		{
+		  /* Try to use consecutive accesses of DR_GROUP_SIZE elements,
+		     separated by the stride, until we have a complete vector.
+		     Fall back to scalar accesses if that isn't possible.  */
+		  if (multiple_p (nunits, group_size))
+		    *memory_access_type = VMAT_STRIDED_SLP;
+		  else
+		    *memory_access_type = VMAT_ELEMENTWISE;
+		}
+	    }
+	  else if (cmp == 0 && loop_vinfo)
+	    {
+	      gcc_assert (vls_type == VLS_LOAD);
+	      *memory_access_type = VMAT_INVARIANT;
+	      /* Invariant accesses perform only component accesses, alignment
+		 is irrelevant for them.  */
+	      *alignment_support_scheme = dr_unaligned_supported;
+	    }
+	  else
+	    *memory_access_type = VMAT_CONTIGUOUS;
+
 	  overrun_p = loop_vinfo && gap != 0;
 	  if (overrun_p && vls_type != VLS_LOAD)
 	    {
@@ -2065,6 +2095,20 @@  get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 			/ vect_get_scalar_dr_size (first_dr_info)))
 	    overrun_p = false;
 
+	  /* When we have a contiguous access across loop iterations
+	     but the access in the loop doesn't cover the full vector
+	     we can end up with no gap recorded but still excess
+	     elements accessed, see PR103116.  Make sure we peel for
+	     gaps if necessary and sufficient and give up if not.
+
+	     If there is a combination of the access not covering the full
+	     vector and a gap recorded then we may need to peel twice.  */
+	  if (loop_vinfo
+	      && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
+	      && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+			      nunits))
+	    overrun_p = true;
+
 	  /* If the gap splits the vector in half and the target
 	     can do half-vector operations avoid the epilogue peeling
 	     by simply loading half of the vector only.  Usually
@@ -2097,68 +2141,22 @@  get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 				 "Peeling for outer loop is not supported\n");
 	      return false;
 	    }
-	  int cmp = compare_step_with_zero (vinfo, stmt_info);
-	  if (cmp < 0)
-	    {
-	      if (single_element_p)
-		/* ???  The VMAT_CONTIGUOUS_REVERSE code generation is
-		   only correct for single element "interleaving" SLP.  */
-		*memory_access_type = get_negative_load_store_type
-			     (vinfo, stmt_info, vectype, vls_type, 1, poffset);
-	      else
-		{
-		  /* Try to use consecutive accesses of DR_GROUP_SIZE elements,
-		     separated by the stride, until we have a complete vector.
-		     Fall back to scalar accesses if that isn't possible.  */
-		  if (multiple_p (nunits, group_size))
-		    *memory_access_type = VMAT_STRIDED_SLP;
-		  else
-		    *memory_access_type = VMAT_ELEMENTWISE;
-		}
-	    }
-	  else if (cmp == 0 && loop_vinfo)
-	    {
-	      gcc_assert (vls_type == VLS_LOAD);
-	      *memory_access_type = VMAT_INVARIANT;
-	      /* Invariant accesses perform only component accesses, alignment
-		 is irrelevant for them.  */
-	      *alignment_support_scheme = dr_unaligned_supported;
-	    }
-	  else
-	    *memory_access_type = VMAT_CONTIGUOUS;
-
-	  /* When we have a contiguous access across loop iterations
-	     but the access in the loop doesn't cover the full vector
-	     we can end up with no gap recorded but still excess
-	     elements accessed, see PR103116.  Make sure we peel for
-	     gaps if necessary and sufficient and give up if not.
-
-	     If there is a combination of the access not covering the full
-	     vector and a gap recorded then we may need to peel twice.  */
-	  if (loop_vinfo
-	      && *memory_access_type == VMAT_CONTIGUOUS
-	      && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
-	      && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo),
-			      nunits))
-	    {
-	      unsigned HOST_WIDE_INT cnunits, cvf;
-	      if (!can_overrun_p
-		  || !nunits.is_constant (&cnunits)
+	  unsigned HOST_WIDE_INT cnunits, cvf;
+	  if (overrun_p
+	      && (!nunits.is_constant (&cnunits)
 		  || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
 		  /* Peeling for gaps assumes that a single scalar iteration
 		     is enough to make sure the last vector iteration doesn't
 		     access excess elements.
 		     ???  Enhancements include peeling multiple iterations
 		     or using masked loads with a static mask.  */
-		  || (group_size * cvf) % cnunits + group_size - gap < cnunits)
-		{
-		  if (dump_enabled_p ())
-		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				     "peeling for gaps insufficient for "
-				     "access\n");
-		  return false;
-		}
-	      overrun_p = true;
+		  || (group_size * cvf) % cnunits + group_size - gap < cnunits))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "peeling for gaps insufficient for "
+				 "access\n");
+	      return false;
 	    }
 
 	  /* If this is single-element interleaving with an element