diff mbox

[1/7] Remove unnecessary peeling for gaps check

Message ID 878ty6n8gi.fsf@e105548-lin.cambridge.arm.com
State New
Headers show

Commit Message

Richard Sandiford June 15, 2016, 8:48 a.m. UTC
I recently relaxed the peeling-for-gaps conditions for LD3 but
kept them as-is for load-and-permute.  I don't think the conditions
are needed for load-and-permute either though.  No current load-and-
permute should load outside the group, so if there is no gap at the end,
the final vector element loaded will correspond to an element loaded
by the original scalar loop.

The patch for PR68559 (a missed optimisation PR) increased the peeled
cases from "exact_log2 (groupsize) == -1" to "vf % group_size == 0", so
before that fix, we didn't peel for gaps if there was no gap at the end
of the group and if the group size was a power of 2.

The only current non-power-of-2 load-and-permute size is 3, which
doesn't require loading more than 3 vectors.

The testcase is based on gcc.dg/vect/pr49038.c.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Thanks,
Richard


gcc/
	* tree-vect-stmts.c (vectorizable_load): Remove unnecessary
	peeling-for-gaps condition.

gcc/testsuite/
	* gcc.dg/vect/group-no-gaps-1.c: New test.

Comments

Richard Biener June 15, 2016, 1:18 p.m. UTC | #1
On Wed, Jun 15, 2016 at 10:48 AM, Richard Sandiford
<richard.sandiford@arm.com> wrote:
> I recently relaxed the peeling-for-gaps conditions for LD3 but
> kept them as-is for load-and-permute.  I don't think the conditions
> are needed for load-and-permute either though.  No current load-and-
> permute should load outside the group, so if there is no gap at the end,
> the final vector element loaded will correspond to an element loaded
> by the original scalar loop.
>
> The patch for PR68559 (a missed optimisation PR) increased the peeled
> cases from "exact_log2 (groupsize) == -1" to "vf % group_size == 0", so
> before that fix, we didn't peel for gaps if there was no gap at the end
> of the group and if the group size was a power of 2.
>
> The only current non-power-of-2 load-and-permute size is 3, which
> doesn't require loading more than 3 vectors.
>
> The testcase is based on gcc.dg/vect/pr49038.c.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Ok.

Thanks,
Richard.

> Thanks,
> Richard
>
>
> gcc/
>         * tree-vect-stmts.c (vectorizable_load): Remove unnecessary
>         peeling-for-gaps condition.
>
> gcc/testsuite/
>         * gcc.dg/vect/group-no-gaps-1.c: New test.
>
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c
> +++ gcc/tree-vect-stmts.c
> @@ -6356,13 +6356,11 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>           gcc_assert (GROUP_GAP (stmt_info));
>         }
>
> -      /* If there is a gap in the end of the group or the group size cannot
> -         be made a multiple of the vector element count then we access excess
> +      /* If there is a gap in the end of the group then we access excess
>          elements in the last iteration and thus need to peel that off.  */
>        if (loop_vinfo
>           && ! STMT_VINFO_STRIDED_P (stmt_info)
> -         && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
> -             || (!slp && !load_lanes_p && vf % group_size != 0)))
> +         && GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0)
>         {
>           if (dump_enabled_p ())
>             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> Index: gcc/testsuite/gcc.dg/vect/group-no-gaps-1.c
> ===================================================================
> --- /dev/null
> +++ gcc/testsuite/gcc.dg/vect/group-no-gaps-1.c
> @@ -0,0 +1,108 @@
> +/* { dg-require-effective-target mmap } */
> +
> +#include <sys/mman.h>
> +#include <stdio.h>
> +
> +#define COUNT 320
> +#define MMAP_SIZE 0x20000
> +#define ADDRESS1 0x1122000000
> +#define ADDRESS2 (ADDRESS1 + MMAP_SIZE * 16)
> +#define TYPE unsigned int
> +
> +#ifndef MAP_ANONYMOUS
> +#define MAP_ANONYMOUS MAP_ANON
> +#endif
> +
> +#define RHS0(B) b[B]
> +#define RHS1(B) RHS0(B) + b[(B) + 1]
> +#define RHS2(B) RHS1(B) + b[(B) + 2]
> +#define RHS3(B) RHS2(B) + b[(B) + 3]
> +#define RHS4(B) RHS3(B) + b[(B) + 4]
> +#define RHS5(B) RHS4(B) + b[(B) + 5]
> +#define RHS6(B) RHS5(B) + b[(B) + 6]
> +#define RHS7(B) RHS6(B) + b[(B) + 7]
> +
> +#define LHS0(B) a[B]
> +#define LHS1(B) LHS0(B) = a[(B) + 1]
> +#define LHS2(B) LHS1(B) = a[(B) + 2]
> +#define LHS3(B) LHS2(B) = a[(B) + 3]
> +#define LHS4(B) LHS3(B) = a[(B) + 4]
> +#define LHS5(B) LHS4(B) = a[(B) + 5]
> +#define LHS6(B) LHS5(B) = a[(B) + 6]
> +#define LHS7(B) LHS6(B) = a[(B) + 7]
> +
> +#define DEF_GROUP_SIZE(MULT, GAP, NO_GAP)                      \
> +  void __attribute__((noinline, noclone))                      \
> +  gap_load_##MULT (TYPE *__restrict a, TYPE *__restrict b)     \
> +  {                                                            \
> +    for (int i = 0; i < COUNT; i++)                            \
> +      a[i] = RHS##GAP (i * MULT);                              \
> +  }                                                            \
> +  void __attribute__((noinline, noclone))                      \
> +  no_gap_load_##MULT (TYPE *__restrict a, TYPE *__restrict b)  \
> +  {                                                            \
> +    for (int i = 0; i < COUNT; i++)                            \
> +      a[i] = RHS##NO_GAP (i * MULT);                           \
> +  }                                                            \
> +  void __attribute__((noinline, noclone))                      \
> +  gap_store_##MULT (TYPE *__restrict a, TYPE *__restrict b)    \
> +  {                                                            \
> +    for (int i = 0; i < COUNT; i++)                            \
> +      LHS##GAP (i * MULT) = b[i];                              \
> +  }                                                            \
> +  void __attribute__((noinline, noclone))                      \
> +  no_gap_store_##MULT (TYPE *__restrict a, TYPE *__restrict b) \
> +  {                                                            \
> +    for (int i = 0; i < COUNT; i++)                            \
> +      LHS##NO_GAP (i * MULT) = b[i];                           \
> +  }
> +
> +#define USE_GROUP_SIZE(MULT)                                   \
> +  gap_load_##MULT (end_x - COUNT, end_y - COUNT * MULT + 1);   \
> +  no_gap_load_##MULT (end_x - COUNT, end_y - COUNT * MULT);    \
> +  gap_store_##MULT (end_x - COUNT * MULT + 1, end_y - COUNT);  \
> +  no_gap_store_##MULT (end_x - COUNT * MULT, end_y - COUNT)
> +
> +DEF_GROUP_SIZE (2, 0, 1)
> +DEF_GROUP_SIZE (3, 1, 2)
> +DEF_GROUP_SIZE (4, 2, 3)
> +DEF_GROUP_SIZE (5, 3, 4)
> +DEF_GROUP_SIZE (6, 4, 5)
> +DEF_GROUP_SIZE (7, 5, 6)
> +DEF_GROUP_SIZE (8, 6, 7)
> +
> +int
> +main (void)
> +{
> +  void *x, *y;
> +  TYPE *end_x, *end_y;
> +
> +  x = mmap ((void *) ADDRESS1, MMAP_SIZE, PROT_READ | PROT_WRITE,
> +           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +  if (x == MAP_FAILED)
> +    {
> +      perror ("mmap");
> +      return 1;
> +    }
> +
> +  y = mmap ((void *) ADDRESS2, MMAP_SIZE, PROT_READ | PROT_WRITE,
> +           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +  if (y == MAP_FAILED)
> +    {
> +      perror ("mmap");
> +      return 1;
> +    }
> +
> +  end_x = (TYPE *) ((char *) x + MMAP_SIZE);
> +  end_y = (TYPE *) ((char *) y + MMAP_SIZE);
> +
> +  USE_GROUP_SIZE (2);
> +  USE_GROUP_SIZE (3);
> +  USE_GROUP_SIZE (4);
> +  USE_GROUP_SIZE (5);
> +  USE_GROUP_SIZE (6);
> +  USE_GROUP_SIZE (7);
> +  USE_GROUP_SIZE (8);
> +
> +  return 0;
> +}
diff mbox

Patch

Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c
+++ gcc/tree-vect-stmts.c
@@ -6356,13 +6356,11 @@  vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 	  gcc_assert (GROUP_GAP (stmt_info));
 	}
 
-      /* If there is a gap in the end of the group or the group size cannot
-         be made a multiple of the vector element count then we access excess
+      /* If there is a gap in the end of the group then we access excess
 	 elements in the last iteration and thus need to peel that off.  */
       if (loop_vinfo
 	  && ! STMT_VINFO_STRIDED_P (stmt_info)
-	  && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
-	      || (!slp && !load_lanes_p && vf % group_size != 0)))
+	  && GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0)
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
Index: gcc/testsuite/gcc.dg/vect/group-no-gaps-1.c
===================================================================
--- /dev/null
+++ gcc/testsuite/gcc.dg/vect/group-no-gaps-1.c
@@ -0,0 +1,108 @@ 
+/* { dg-require-effective-target mmap } */
+
+#include <sys/mman.h>
+#include <stdio.h>
+
+#define COUNT 320
+#define MMAP_SIZE 0x20000
+#define ADDRESS1 0x1122000000
+#define ADDRESS2 (ADDRESS1 + MMAP_SIZE * 16)
+#define TYPE unsigned int
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+#define RHS0(B) b[B]
+#define RHS1(B) RHS0(B) + b[(B) + 1]
+#define RHS2(B) RHS1(B) + b[(B) + 2]
+#define RHS3(B) RHS2(B) + b[(B) + 3]
+#define RHS4(B) RHS3(B) + b[(B) + 4]
+#define RHS5(B) RHS4(B) + b[(B) + 5]
+#define RHS6(B) RHS5(B) + b[(B) + 6]
+#define RHS7(B) RHS6(B) + b[(B) + 7]
+
+#define LHS0(B) a[B]
+#define LHS1(B) LHS0(B) = a[(B) + 1]
+#define LHS2(B) LHS1(B) = a[(B) + 2]
+#define LHS3(B) LHS2(B) = a[(B) + 3]
+#define LHS4(B) LHS3(B) = a[(B) + 4]
+#define LHS5(B) LHS4(B) = a[(B) + 5]
+#define LHS6(B) LHS5(B) = a[(B) + 6]
+#define LHS7(B) LHS6(B) = a[(B) + 7]
+
+#define DEF_GROUP_SIZE(MULT, GAP, NO_GAP)			\
+  void __attribute__((noinline, noclone))			\
+  gap_load_##MULT (TYPE *__restrict a, TYPE *__restrict b)	\
+  {								\
+    for (int i = 0; i < COUNT; i++)				\
+      a[i] = RHS##GAP (i * MULT);				\
+  }								\
+  void __attribute__((noinline, noclone))			\
+  no_gap_load_##MULT (TYPE *__restrict a, TYPE *__restrict b)	\
+  {								\
+    for (int i = 0; i < COUNT; i++)				\
+      a[i] = RHS##NO_GAP (i * MULT);				\
+  }								\
+  void __attribute__((noinline, noclone))			\
+  gap_store_##MULT (TYPE *__restrict a, TYPE *__restrict b)	\
+  {								\
+    for (int i = 0; i < COUNT; i++)				\
+      LHS##GAP (i * MULT) = b[i];				\
+  }								\
+  void __attribute__((noinline, noclone))			\
+  no_gap_store_##MULT (TYPE *__restrict a, TYPE *__restrict b)	\
+  {								\
+    for (int i = 0; i < COUNT; i++)				\
+      LHS##NO_GAP (i * MULT) = b[i];				\
+  }
+
+#define USE_GROUP_SIZE(MULT)					\
+  gap_load_##MULT (end_x - COUNT, end_y - COUNT * MULT + 1);	\
+  no_gap_load_##MULT (end_x - COUNT, end_y - COUNT * MULT);	\
+  gap_store_##MULT (end_x - COUNT * MULT + 1, end_y - COUNT);	\
+  no_gap_store_##MULT (end_x - COUNT * MULT, end_y - COUNT)
+
+DEF_GROUP_SIZE (2, 0, 1)
+DEF_GROUP_SIZE (3, 1, 2)
+DEF_GROUP_SIZE (4, 2, 3)
+DEF_GROUP_SIZE (5, 3, 4)
+DEF_GROUP_SIZE (6, 4, 5)
+DEF_GROUP_SIZE (7, 5, 6)
+DEF_GROUP_SIZE (8, 6, 7)
+
+int
+main (void)
+{
+  void *x, *y;
+  TYPE *end_x, *end_y;
+
+  x = mmap ((void *) ADDRESS1, MMAP_SIZE, PROT_READ | PROT_WRITE,
+	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (x == MAP_FAILED)
+    {
+      perror ("mmap");
+      return 1;
+    }
+
+  y = mmap ((void *) ADDRESS2, MMAP_SIZE, PROT_READ | PROT_WRITE,
+	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (y == MAP_FAILED)
+    {
+      perror ("mmap");
+      return 1;
+    }
+
+  end_x = (TYPE *) ((char *) x + MMAP_SIZE);
+  end_y = (TYPE *) ((char *) y + MMAP_SIZE);
+
+  USE_GROUP_SIZE (2);
+  USE_GROUP_SIZE (3);
+  USE_GROUP_SIZE (4);
+  USE_GROUP_SIZE (5);
+  USE_GROUP_SIZE (6);
+  USE_GROUP_SIZE (7);
+  USE_GROUP_SIZE (8);
+
+  return 0;
+}