diff mbox

Fix PR68559

Message ID alpine.LSU.2.11.1511271511510.4884@t29.fhfr.qr
State New
Headers show

Commit Message

Richard Biener Nov. 27, 2015, 2:13 p.m. UTC
The following fixes the excessive peeling for gaps we do when doing
SLP now that I removed most of the restrictions on having gaps in
the first place.

This should make low-trip vectorized loops more efficient (something
the combine-epilogue-with-vectorized-body-by-masking patches also
claim to do).

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Richard.

2015-11-27  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/68559
	* tree-vect-data-refs.c (vect_analyze_group_access_1): Move
	peeling for gap checks ...
	* tree-vect-stmts.c (vectorizable_load): ... here and relax
	for SLP.
	* tree-vect-loop.c (vect_analyze_loop_2): Re-set
	LOOP_VINFO_PEELING_FOR_GAPS before re-trying without SLP.

	* gcc.dg/vect/slp-perm-4.c: Adjust again.
	* gcc.dg/vect/pr45752.c: Likewise.

Index: gcc/tree-vect-stmts.c
===================================================================
*** gcc/tree-vect-stmts.c	(revision 230998)
--- gcc/tree-vect-stmts.c	(working copy)
*************** vectorizable_load (gimple *stmt, gimple_
*** 6246,6260 ****
           that leaves unused vector loads around punt - we at least create
  	 very sub-optimal code in that case (and blow up memory,
  	 see PR65518).  */
        if (first_stmt == stmt
! 	  && !GROUP_NEXT_ELEMENT (stmt_info)
! 	  && GROUP_SIZE (stmt_info) > TYPE_VECTOR_SUBPARTS (vectype))
  	{
  	  if (dump_enabled_p ())
  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
! 			     "single-element interleaving not supported "
! 			     "for not adjacent vector loads\n");
! 	  return false;
  	}
  
        if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
--- 6250,6294 ----
           that leaves unused vector loads around punt - we at least create
  	 very sub-optimal code in that case (and blow up memory,
  	 see PR65518).  */
+       bool force_peeling = false;
        if (first_stmt == stmt
! 	  && !GROUP_NEXT_ELEMENT (stmt_info))
! 	{
! 	  if (GROUP_SIZE (stmt_info) > TYPE_VECTOR_SUBPARTS (vectype))
! 	    {
! 	      if (dump_enabled_p ())
! 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
! 				 "single-element interleaving not supported "
! 				 "for not adjacent vector loads\n");
! 	      return false;
! 	    }
! 
! 	  /* Single-element interleaving requires peeling for gaps.  */
! 	  force_peeling = true;
! 	}
! 
!       /* If there is a gap in the end of the group or the group size cannot
!          be made a multiple of the vector element count then we access excess
! 	 elements in the last iteration and thus need to peel that off.  */
!       if (loop_vinfo
! 	  && ! STMT_VINFO_STRIDED_P (stmt_info)
! 	  && (force_peeling
! 	      || GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
! 	      || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
  	{
  	  if (dump_enabled_p ())
  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
! 			     "Data access with gaps requires scalar "
! 			     "epilogue loop\n");
! 	  if (loop->inner)
! 	    {
! 	      if (dump_enabled_p ())
! 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
! 				 "Peeling for outer loop is not supported\n");
! 	      return false;
! 	    }
! 
! 	  LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
  	}
  
        if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
Index: gcc/testsuite/gcc.dg/vect/slp-perm-4.c
===================================================================
*** gcc/testsuite/gcc.dg/vect/slp-perm-4.c	(revision 230998)
--- gcc/testsuite/gcc.dg/vect/slp-perm-4.c	(working copy)
***************
*** 33,39 ****
  #define M34 7716
  #define M44 16
  
! #define N 40
  
  void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
  {
--- 33,39 ----
  #define M34 7716
  #define M44 16
  
! #define N 20
  
  void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
  {
*************** int main (int argc, const char* argv[])
*** 60,68 ****
    unsigned int input[N], output[N], i;
    unsigned int check_results[N]
      = {3208, 1334, 28764, 35679, 2789, 13028, 4754, 168364, 91254, 12399, 
!     22848, 8174, 307964, 146829, 22009, 32668, 11594, 447564, 202404, 31619, 
!     42488, 15014, 587164, 257979, 41229, 52308, 18434, 726764, 313554, 50839, 
!     62128, 21854, 866364, 369129, 60449, 71948, 25274, 1005964, 424704, 70059};
  
    check_vect ();
  
--- 60,66 ----
    unsigned int input[N], output[N], i;
    unsigned int check_results[N]
      = {3208, 1334, 28764, 35679, 2789, 13028, 4754, 168364, 91254, 12399, 
!     22848, 8174, 307964, 146829, 22009, 32668, 11594, 447564, 202404, 31619};
  
    check_vect ();
  
*************** int main (int argc, const char* argv[])
*** 85,89 ****
  }
  
  /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
  /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
- 
--- 83,87 ----
  }
  
  /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+ /* { dg-final { scan-tree-dump-times "gaps requires scalar epilogue loop" 0 "vect" } } */
  /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
Index: gcc/testsuite/gcc.dg/vect/pr45752.c
===================================================================
*** gcc/testsuite/gcc.dg/vect/pr45752.c	(revision 230998)
--- gcc/testsuite/gcc.dg/vect/pr45752.c	(working copy)
***************
*** 33,39 ****
  #define M34 7716
  #define M44 16
  
! #define N 40
  
  void foo (unsigned int *__restrict__ pInput,
            unsigned int *__restrict__ pOutput,
--- 33,39 ----
  #define M34 7716
  #define M44 16
  
! #define N 20
  
  void foo (unsigned int *__restrict__ pInput,
            unsigned int *__restrict__ pOutput,
*************** int main (int argc, const char* argv[])
*** 77,90 ****
    unsigned int input[N], output[N], i, input2[N], output2[N];
    unsigned int check_results[N]
      = {3208, 1334, 28764, 35679, 2789, 13028, 4754, 168364, 91254, 12399, 
!     22848, 8174, 307964, 146829, 22009, 32668, 11594, 447564, 202404, 31619, 
!     42488, 15014, 587164, 257979, 41229, 52308, 18434, 726764, 313554, 50839, 
!     62128, 21854, 866364, 369129, 60449, 71948, 25274, 1005964, 424704, 70059};
    unsigned int check_results2[N]
      = {7136, 2702, 84604, 57909, 6633, 16956, 6122, 224204, 113484, 16243, 
!     26776, 9542, 363804, 169059, 25853, 36596, 12962, 503404, 224634, 35463, 
!     46416, 16382, 643004, 280209, 45073, 56236, 19802, 782604, 335784, 54683, 
!     66056, 23222, 922204, 391359, 64293, 75876, 26642, 1061804, 446934, 73903};
  
    check_vect ();
  
--- 77,86 ----
    unsigned int input[N], output[N], i, input2[N], output2[N];
    unsigned int check_results[N]
      = {3208, 1334, 28764, 35679, 2789, 13028, 4754, 168364, 91254, 12399, 
!     22848, 8174, 307964, 146829, 22009, 32668, 11594, 447564, 202404, 31619 };
    unsigned int check_results2[N]
      = {7136, 2702, 84604, 57909, 6633, 16956, 6122, 224204, 113484, 16243, 
!     26776, 9542, 363804, 169059, 25853, 36596, 12962, 503404, 224634, 35463 };
  
    check_vect ();
  
*************** int main (int argc, const char* argv[])
*** 108,111 ****
--- 104,108 ----
  }
  
  /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  } } */
+ /* { dg-final { scan-tree-dump-times "gaps requires scalar epilogue loop" 0 "vect" } } */
  /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */

Comments

Alan Lawrence Dec. 2, 2015, 1:27 p.m. UTC | #1
On 27/11/15 14:13, Richard Biener wrote:
>
> The following fixes the excessive peeling for gaps we do when doing
> SLP now that I removed most of the restrictions on having gaps in
> the first place.
>
> This should make low-trip vectorized loops more efficient (sth
> also the combine-epilogue-with-vectorized-body-by-masking patches
> claim to do).
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
>
> Richard.
>
> 2015-11-27  Richard Biener  <rguenther@suse.de>
>
> 	PR tree-optimization/68559
> 	* tree-vect-data-refs.c (vect_analyze_group_access_1): Move
> 	peeling for gap checks ...
> 	* tree-vect-stmts.c (vectorizable_load): ... here and relax
> 	for SLP.
> 	* tree-vect-loop.c (vect_analyze_loop_2): Re-set
> 	LOOP_VINFO_PEELING_FOR_GAPS before re-trying without SLP.
>
> 	* gcc.dg/vect/slp-perm-4.c: Adjust again.
> 	* gcc.dg/vect/pr45752.c: Likewise.

Since this, we have

FAIL: gcc.dg/vect/pr45752.c -flto -ffat-lto-objects  scan-tree-dump-times vect "gaps requires scalar epilogue loop" 0
FAIL: gcc.dg/vect/pr45752.c scan-tree-dump-times vect "gaps requires scalar epilogue loop" 0

on aarch64 platforms (aarch64-none-linux-gnu, aarch64-none-elf, 
aarch64_be-none-elf).


Thanks, Alan
Richard Biener Dec. 2, 2015, 1:30 p.m. UTC | #2
On Wed, 2 Dec 2015, Alan Lawrence wrote:

> On 27/11/15 14:13, Richard Biener wrote:
> > 
> > The following fixes the excessive peeling for gaps we do when doing
> > SLP now that I removed most of the restrictions on having gaps in
> > the first place.
> > 
> > This should make low-trip vectorized loops more efficient (sth
> > also the combine-epilogue-with-vectorized-body-by-masking patches
> > claim to do).
> > 
> > Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
> > 
> > Richard.
> > 
> > 2015-11-27  Richard Biener  <rguenther@suse.de>
> > 
> > 	PR tree-optimization/68559
> > 	* tree-vect-data-refs.c (vect_analyze_group_access_1): Move
> > 	peeling for gap checks ...
> > 	* tree-vect-stmts.c (vectorizable_load): ... here and relax
> > 	for SLP.
> > 	* tree-vect-loop.c (vect_analyze_loop_2): Re-set
> > 	LOOP_VINFO_PEELING_FOR_GAPS before re-trying without SLP.
> > 
> > 	* gcc.dg/vect/slp-perm-4.c: Adjust again.
> > 	* gcc.dg/vect/pr45752.c: Likewise.
> 
> Since this, we have
> 
> FAIL: gcc.dg/vect/pr45752.c -flto -ffat-lto-objects  scan-tree-dump-times vect
> "gaps requires scalar epilogue loop" 0
> FAIL: gcc.dg/vect/pr45752.c scan-tree-dump-times vect "gaps requires scalar
> epilogue loop" 0
> 
> on aarch64 platforms (aarch64-none-linux-gnu, aarch64-none-elf,
> aarch64_be-none-elf).

Can you open a bug and attach -details vectorizer dumps?

Richard.
diff mbox

Patch

Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c	(revision 230998)
+++ gcc/tree-vect-loop.c	(working copy)
@@ -2190,6 +2190,7 @@  again:
     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
   /* Reset assorted flags.  */
   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+  LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
 
   goto start_over;
Index: gcc/tree-vect-data-refs.c
===================================================================
--- gcc/tree-vect-data-refs.c	(revision 231005)
+++ gcc/tree-vect-data-refs.c	(working copy)
@@ -2166,10 +2166,6 @@  vect_analyze_group_access_1 (struct data
   HOST_WIDE_INT dr_step = -1;
   HOST_WIDE_INT groupsize, last_accessed_element = 1;
   bool slp_impossible = false;
-  struct loop *loop = NULL;
-
-  if (loop_vinfo)
-    loop = LOOP_VINFO_LOOP (loop_vinfo);
 
   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
      size of the interleaving group (including gaps).  */
@@ -2227,24 +2223,6 @@  vect_analyze_group_access_1 (struct data
 	      dump_printf (MSG_NOTE, "\n");
 	    }
 
-	  if (loop_vinfo)
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_NOTE, vect_location,
-		                 "Data access with gaps requires scalar "
-		                 "epilogue loop\n");
-              if (loop->inner)
-                {
-                  if (dump_enabled_p ())
-                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                     "Peeling for outer loop is not"
-                                     " supported\n");
-                  return false;
-                }
-
-              LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
-	    }
-
 	  return true;
 	}
 
@@ -2399,29 +2377,6 @@  vect_analyze_group_access_1 (struct data
           if (bb_vinfo)
             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
         }
-
-      /* If there is a gap in the end of the group or the group size cannot
-         be made a multiple of the vector element count then we access excess
-	 elements in the last iteration and thus need to peel that off.  */
-      if (loop_vinfo
-	  && (groupsize - last_accessed_element > 0
-	      || exact_log2 (groupsize) == -1))
-
-	{
-	  if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-	                     "Data access with gaps requires scalar "
-	                     "epilogue loop\n");
-          if (loop->inner)
-            {
-              if (dump_enabled_p ())
-                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                 "Peeling for outer loop is not supported\n");
-              return false;
-            }
-
-          LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
-	}
     }
 
   return true;