
tree-optimization/115385 - handle more gaps with peeling of a single iteration

Message ID 20240611090227.C045C385DDDB@sourceware.org
State New
Series tree-optimization/115385 - handle more gaps with peeling of a single iteration

Commit Message

Richard Biener June 11, 2024, 9:02 a.m. UTC
The following makes peeling of a single scalar iteration handle more
gaps, including non-power-of-two cases.  This can be done by rounding
up the remaining access to the next power-of-two, which ensures that
the next scalar iteration will pick up at least the number of excess
elements we access.
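
As a rough sketch of the arithmetic involved (the concrete numbers
below are made up for illustration and not taken from the testcases;
remain and part_size correspond to cremain and cpart_size in the
patch): with a constant number of vector lanes nunits, vectorization
factor vf, group size group_size and gap gap, the last access and its
rounded-up size are

  remain    = (group_size * vf - gap) % nunits;  /* elements needed by the last access */
  part_size = 1 << ceil_log2 (remain);           /* rounded up to a power of two */

For example nunits = 16 and remain = 5 give part_size = 8, so the last
load can be done as a narrower 8-lane access instead of a full 16-lane
one, provided the target can compose the full vector type from such
pieces (vector_vector_composition_type).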

I've added a correctness testcase and an x86-specific one scanning
for the optimization.

Bootstrapped and tested on x86_64-unknown-linux-gnu; I plan to
push this tomorrow after eyeing the CI.  Checking SPEC CPU2017
on x86 shows we have no case left (from 33 before) where peeling
for gaps is insufficient.  This of course relies on sufficient
vec_init optab coverage.

Richard.

	PR tree-optimization/115385
	* tree-vect-stmts.cc (get_group_load_store_type): Peeling
	of a single scalar iteration is sufficient if we can narrow
	the access to the next power of two of the bits in the last
	access.
	(vectorizable_load): Ensure that the last access is narrowed.

	* gcc.dg/vect/pr115385.c: New testcase.
	* gcc.target/i386/vect-pr115385.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/pr115385.c          | 88 +++++++++++++++++++
 gcc/testsuite/gcc.target/i386/vect-pr115385.c | 53 +++++++++++
 gcc/tree-vect-stmts.cc                        | 34 ++++++-
 3 files changed, 173 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr115385.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-pr115385.c

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/pr115385.c b/gcc/testsuite/gcc.dg/vect/pr115385.c
new file mode 100644
index 00000000000..a18cd665d7d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr115385.c
@@ -0,0 +1,88 @@ 
+/* { dg-require-effective-target mmap } */
+
+#include <sys/mman.h>
+#include <stdio.h>
+
+#define COUNT 511
+#define MMAP_SIZE 0x20000
+#define ADDRESS 0x1122000000
+#define TYPE unsigned char
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+void __attribute__((noipa)) foo(TYPE * __restrict x,
+                                TYPE *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[3*i+0];
+      x[16*i+1] = y[3*i+1];
+      x[16*i+2] = y[3*i+2];
+      x[16*i+3] = y[3*i+0];
+      x[16*i+4] = y[3*i+1];
+      x[16*i+5] = y[3*i+2];
+      x[16*i+6] = y[3*i+0];
+      x[16*i+7] = y[3*i+1];
+      x[16*i+8] = y[3*i+2];
+      x[16*i+9] = y[3*i+0];
+      x[16*i+10] = y[3*i+1];
+      x[16*i+11] = y[3*i+2];
+      x[16*i+12] = y[3*i+0];
+      x[16*i+13] = y[3*i+1];
+      x[16*i+14] = y[3*i+2];
+      x[16*i+15] = y[3*i+0];
+    }
+}
+
+void __attribute__((noipa)) bar(TYPE * __restrict x,
+                                TYPE *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[5*i+0];
+      x[16*i+1] = y[5*i+1];
+      x[16*i+2] = y[5*i+2];
+      x[16*i+3] = y[5*i+3];
+      x[16*i+4] = y[5*i+4];
+      x[16*i+5] = y[5*i+0];
+      x[16*i+6] = y[5*i+1];
+      x[16*i+7] = y[5*i+2];
+      x[16*i+8] = y[5*i+3];
+      x[16*i+9] = y[5*i+4];
+      x[16*i+10] = y[5*i+0];
+      x[16*i+11] = y[5*i+1];
+      x[16*i+12] = y[5*i+2];
+      x[16*i+13] = y[5*i+3];
+      x[16*i+14] = y[5*i+4];
+      x[16*i+15] = y[5*i+0];
+    }
+}
+
+TYPE x[COUNT * 16];
+
+int
+main (void)
+{
+  void *y;
+  TYPE *end_y;
+
+  y = mmap ((void *) ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE,
+            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (y == MAP_FAILED)
+    {
+      perror ("mmap");
+      return 1;
+    }
+
+  end_y = (TYPE *) ((char *) y + MMAP_SIZE);
+
+  foo (x, end_y - COUNT * 3, COUNT);
+  bar (x, end_y - COUNT * 5, COUNT);
+
+  return 0;
+}
+
+/* We always require a scalar epilogue here but we don't know which
+   targets support vector composition this way.  */
diff --git a/gcc/testsuite/gcc.target/i386/vect-pr115385.c b/gcc/testsuite/gcc.target/i386/vect-pr115385.c
new file mode 100644
index 00000000000..a6be9ce4e54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-pr115385.c
@@ -0,0 +1,53 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -msse4.1 -mno-avx -fdump-tree-vect-details" } */
+
+void __attribute__((noipa)) foo(unsigned char * __restrict x,
+                                unsigned char *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[3*i+0];
+      x[16*i+1] = y[3*i+1];
+      x[16*i+2] = y[3*i+2];
+      x[16*i+3] = y[3*i+0];
+      x[16*i+4] = y[3*i+1];
+      x[16*i+5] = y[3*i+2];
+      x[16*i+6] = y[3*i+0];
+      x[16*i+7] = y[3*i+1];
+      x[16*i+8] = y[3*i+2];
+      x[16*i+9] = y[3*i+0];
+      x[16*i+10] = y[3*i+1];
+      x[16*i+11] = y[3*i+2];
+      x[16*i+12] = y[3*i+0];
+      x[16*i+13] = y[3*i+1];
+      x[16*i+14] = y[3*i+2];
+      x[16*i+15] = y[3*i+0];
+    }
+}
+
+void __attribute__((noipa)) bar(unsigned char * __restrict x,
+                                unsigned char *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[5*i+0];
+      x[16*i+1] = y[5*i+1];
+      x[16*i+2] = y[5*i+2];
+      x[16*i+3] = y[5*i+3];
+      x[16*i+4] = y[5*i+4];
+      x[16*i+5] = y[5*i+0];
+      x[16*i+6] = y[5*i+1];
+      x[16*i+7] = y[5*i+2];
+      x[16*i+8] = y[5*i+3];
+      x[16*i+9] = y[5*i+4];
+      x[16*i+10] = y[5*i+0];
+      x[16*i+11] = y[5*i+1];
+      x[16*i+12] = y[5*i+2];
+      x[16*i+13] = y[5*i+3];
+      x[16*i+14] = y[5*i+4];
+      x[16*i+15] = y[5*i+0];
+    }
+}
+
+/* { dg-final { scan-tree-dump "Data access with gaps requires scalar epilogue loop" "vect"} } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect"} } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 54106a2be37..8aa41833433 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2142,7 +2142,7 @@  get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 				 "Peeling for outer loop is not supported\n");
 	      return false;
 	    }
-	  unsigned HOST_WIDE_INT cnunits, cvf;
+	  unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
 	  if (overrun_p
 	      && (!nunits.is_constant (&cnunits)
 		  || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
@@ -2151,7 +2151,16 @@  get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 		     access excess elements.
 		     ???  Enhancements include peeling multiple iterations
 		     or using masked loads with a static mask.  */
-		  || (group_size * cvf) % cnunits + group_size - gap < cnunits))
+		  || ((group_size * cvf) % cnunits + group_size - gap < cnunits
+		      /* But peeling a single scalar iteration is enough if
+			 we can use the next power-of-two sized partial
+			 access.  */
+		      && ((cremain = (group_size * cvf - gap) % cnunits), true
+			  && ((cpart_size = (1 << ceil_log2 (cremain)))
+			      != cnunits)
+			  && vector_vector_composition_type
+			       (vectype, cnunits / cpart_size,
+				&half_vtype) == NULL_TREE))))
 	    {
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -11599,6 +11608,27 @@  vectorizable_load (vec_info *vinfo,
 			      gcc_assert (new_vtype
 					  || LOOP_VINFO_PEELING_FOR_GAPS
 					       (loop_vinfo));
+			    /* But still reduce the access size to the next
+			       required power-of-two so peeling a single
+			       scalar iteration is sufficient.  */
+			    unsigned HOST_WIDE_INT cremain;
+			    if (remain.is_constant (&cremain))
+			      {
+				unsigned HOST_WIDE_INT cpart_size
+				  = 1 << ceil_log2 (cremain);
+				if (known_gt (nunits, cpart_size)
+				    && constant_multiple_p (nunits, cpart_size,
+							    &num))
+				  {
+				    tree ptype;
+				    new_vtype
+				      = vector_vector_composition_type (vectype,
+									num,
+									&ptype);
+				    if (new_vtype)
+				      ltype = ptype;
+				  }
+			      }
 			  }
 		      }
 		    tree offset