Message ID | 878t3w9yx2.fsf@arm.com |
---|---|
State | New |
Headers | show |
Series | Fix PEELING_FOR_NITERS calculation (PR 87288) | expand |
On Thu, Sep 20, 2018 at 1:44 PM Richard Sandiford <richard.sandiford@arm.com> wrote: > > PEELING_FOR_GAPS now means "peel one iteration for the epilogue", > in much the same way that PEELING_FOR_ALIGNMENT > 0 means > "peel that number of iterations for the prologue". We weren't > taking this into account when deciding whether we needed to peel > further scalar iterations beyond the iterations for "gaps" and > "alignment". > > Only the first test failed before the patch. The other two > are just for completeness. > > Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf > and x86_64-linux-gnu. OK to install? OK. Richard. > Richard > > > 2018-09-20 Richard Sandiford <richard.sandiford@arm.com> > > gcc/ > PR tree-optimization/87288 > * tree-vect-loop.c (vect_analyze_loop_2): Take PEELING_FOR_GAPS > into account when determining PEELING_FOR_NITERS. > > gcc/testsuite/ > PR tree-optimization/87288 > * gcc.dg/vect/pr87288-1.c: New test. > * gcc.dg/vect/pr87288-2.c: Likewise. > * gcc.dg/vect/pr87288-3.c: Likewise. > > Index: gcc/tree-vect-loop.c > =================================================================== > --- gcc/tree-vect-loop.c 2018-09-20 12:39:14.541555902 +0100 > +++ gcc/tree-vect-loop.c 2018-09-20 12:39:19.013518199 +0100 > @@ -2074,14 +2074,22 @@ vect_analyze_loop_2 (loop_vec_info loop_ > /* The main loop handles all iterations. */ > LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; > else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) > - && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) > + && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) > { > - if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) > - - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo), > + /* Work out the (constant) number of iterations that need to be > + peeled for reasons other than niters. 
*/ > + unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); > + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) > + peel_niter += 1; > + if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter, > LOOP_VINFO_VECT_FACTOR (loop_vinfo))) > LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; > } > else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > + /* ??? When peeling for gaps but not alignment, we could > + try to check whether the (variable) niters is known to be > + VF * N + 1. That's something of a niche case though. */ > + || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) > || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf) > || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) > < (unsigned) exact_log2 (const_vf)) > Index: gcc/testsuite/gcc.dg/vect/pr87288-1.c > =================================================================== > --- /dev/null 2018-09-14 11:16:31.122530289 +0100 > +++ gcc/testsuite/gcc.dg/vect/pr87288-1.c 2018-09-20 12:39:19.009518233 +0100 > @@ -0,0 +1,49 @@ > +#include "tree-vect.h" > + > +#define N (VECTOR_BITS / 32) > +#define MAX_COUNT 4 > + > +void __attribute__ ((noipa)) > +run (int *restrict a, int *restrict b, int count) > +{ > + for (int i = 0; i < count * N; ++i) > + { > + a[i * 2] = b[i * 2] + count; > + a[i * 2 + 1] = count; > + } > +} > + > +void __attribute__ ((noipa)) > +check (int *restrict a, int count) > +{ > + for (int i = 0; i < count * N; ++i) > + if (a[i * 2] != i * 41 + count || a[i * 2 + 1] != count) > + __builtin_abort (); > + if (a[count * 2 * N] != 999) > + __builtin_abort (); > +} > + > +int a[N * MAX_COUNT * 2 + 1], b[N * MAX_COUNT * 2]; > + > +int > +main (void) > +{ > + check_vect (); > + > + for (int i = 0; i < N * MAX_COUNT; ++i) > + { > + b[i * 2] = i * 41; > + asm volatile ("" ::: "memory"); > + } > + > + for (int i = 0; i <= MAX_COUNT; ++i) > + { > + a[i * 2 * N] = 999; > + run (a, b, i); > + check (a, i); > + } > + > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times 
{LOOP VECTORIZED} 1 "vect" { target { { vect_int && vect_perm } && vect_element_align } } } } */ > Index: gcc/testsuite/gcc.dg/vect/pr87288-2.c > =================================================================== > --- /dev/null 2018-09-14 11:16:31.122530289 +0100 > +++ gcc/testsuite/gcc.dg/vect/pr87288-2.c 2018-09-20 12:39:19.009518233 +0100 > @@ -0,0 +1,64 @@ > +#include "tree-vect.h" > + > +#define N (VECTOR_BITS / 32) > +#define MAX_COUNT 4 > + > +#define RUN_COUNT(COUNT) \ > + void __attribute__ ((noipa)) \ > + run_##COUNT (int *restrict a, int *restrict b) \ > + { \ > + for (int i = 0; i < N * COUNT; ++i) \ > + { \ > + a[i * 2] = b[i * 2] + COUNT; \ > + a[i * 2 + 1] = COUNT; \ > + } \ > + } > + > +RUN_COUNT (1) > +RUN_COUNT (2) > +RUN_COUNT (3) > +RUN_COUNT (4) > + > +void __attribute__ ((noipa)) > +check (int *restrict a, int count) > +{ > + for (int i = 0; i < count * N; ++i) > + if (a[i * 2] != i * 41 + count || a[i * 2 + 1] != count) > + __builtin_abort (); > + if (a[count * 2 * N] != 999) > + __builtin_abort (); > +} > + > +int a[N * MAX_COUNT * 2 + 1], b[N * MAX_COUNT * 2]; > + > +int > +main (void) > +{ > + check_vect (); > + > + for (int i = 0; i < N * MAX_COUNT; ++i) > + { > + b[i * 2] = i * 41; > + asm volatile ("" ::: "memory"); > + } > + > + a[N * 2] = 999; > + run_1 (a, b); > + check (a, 1); > + > + a[N * 4] = 999; > + run_2 (a, b); > + check (a, 2); > + > + a[N * 6] = 999; > + run_3 (a, b); > + check (a, 3); > + > + a[N * 8] = 999; > + run_4 (a, b); > + check (a, 4); > + > + return 0; > +} > + > +/* { dg-final { scan-tree-dump {LOOP VECTORIZED} "vect" { target { { vect_int && vect_perm } && vect_element_align } } } } */ > Index: gcc/testsuite/gcc.dg/vect/pr87288-3.c > =================================================================== > --- /dev/null 2018-09-14 11:16:31.122530289 +0100 > +++ gcc/testsuite/gcc.dg/vect/pr87288-3.c 2018-09-20 12:39:19.009518233 +0100 > @@ -0,0 +1,64 @@ > +#include "tree-vect.h" > + > +#define N (VECTOR_BITS / 32) 
> +#define MAX_COUNT 4 > + > +#define RUN_COUNT(COUNT) \ > + void __attribute__ ((noipa)) \ > + run_##COUNT (int *restrict a, int *restrict b) \ > + { \ > + for (int i = 0; i < N * COUNT + 1; ++i) \ > + { \ > + a[i * 2] = b[i * 2] + COUNT; \ > + a[i * 2 + 1] = COUNT; \ > + } \ > + } > + > +RUN_COUNT (1) > +RUN_COUNT (2) > +RUN_COUNT (3) > +RUN_COUNT (4) > + > +void __attribute__ ((noipa)) > +check (int *restrict a, int count) > +{ > + for (int i = 0; i < count * N + 1; ++i) > + if (a[i * 2] != i * 41 + count || a[i * 2 + 1] != count) > + __builtin_abort (); > + if (a[count * 2 * N + 2] != 999) > + __builtin_abort (); > +} > + > +int a[N * MAX_COUNT * 2 + 3], b[N * MAX_COUNT * 2 + 2]; > + > +int > +main (void) > +{ > + check_vect (); > + > + for (int i = 0; i < N * MAX_COUNT + 1; ++i) > + { > + b[i * 2] = i * 41; > + asm volatile ("" ::: "memory"); > + } > + > + a[N * 2 + 2] = 999; > + run_1 (a, b); > + check (a, 1); > + > + a[N * 4 + 2] = 999; > + run_2 (a, b); > + check (a, 2); > + > + a[N * 6 + 2] = 999; > + run_3 (a, b); > + check (a, 3); > + > + a[N * 8 + 2] = 999; > + run_4 (a, b); > + check (a, 4); > + > + return 0; > +} > + > +/* { dg-final { scan-tree-dump {LOOP VECTORIZED} "vect" { target { { vect_int && vect_perm } && vect_element_align } } } } */
Index: gcc/tree-vect-loop.c =================================================================== --- gcc/tree-vect-loop.c 2018-09-20 12:39:14.541555902 +0100 +++ gcc/tree-vect-loop.c 2018-09-20 12:39:19.013518199 +0100 @@ -2074,14 +2074,22 @@ vect_analyze_loop_2 (loop_vec_info loop_ /* The main loop handles all iterations. */ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) + && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) { - if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo), + /* Work out the (constant) number of iterations that need to be + peeled for reasons other than niters. */ + unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) + peel_niter += 1; + if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter, LOOP_VINFO_VECT_FACTOR (loop_vinfo))) LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; } else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) + /* ??? When peeling for gaps but not alignment, we could + try to check whether the (variable) niters is known to be + VF * N + 1. That's something of a niche case though. 
*/ + || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf) || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) < (unsigned) exact_log2 (const_vf)) Index: gcc/testsuite/gcc.dg/vect/pr87288-1.c =================================================================== --- /dev/null 2018-09-14 11:16:31.122530289 +0100 +++ gcc/testsuite/gcc.dg/vect/pr87288-1.c 2018-09-20 12:39:19.009518233 +0100 @@ -0,0 +1,49 @@ +#include "tree-vect.h" + +#define N (VECTOR_BITS / 32) +#define MAX_COUNT 4 + +void __attribute__ ((noipa)) +run (int *restrict a, int *restrict b, int count) +{ + for (int i = 0; i < count * N; ++i) + { + a[i * 2] = b[i * 2] + count; + a[i * 2 + 1] = count; + } +} + +void __attribute__ ((noipa)) +check (int *restrict a, int count) +{ + for (int i = 0; i < count * N; ++i) + if (a[i * 2] != i * 41 + count || a[i * 2 + 1] != count) + __builtin_abort (); + if (a[count * 2 * N] != 999) + __builtin_abort (); +} + +int a[N * MAX_COUNT * 2 + 1], b[N * MAX_COUNT * 2]; + +int +main (void) +{ + check_vect (); + + for (int i = 0; i < N * MAX_COUNT; ++i) + { + b[i * 2] = i * 41; + asm volatile ("" ::: "memory"); + } + + for (int i = 0; i <= MAX_COUNT; ++i) + { + a[i * 2 * N] = 999; + run (a, b, i); + check (a, i); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times {LOOP VECTORIZED} 1 "vect" { target { { vect_int && vect_perm } && vect_element_align } } } } */ Index: gcc/testsuite/gcc.dg/vect/pr87288-2.c =================================================================== --- /dev/null 2018-09-14 11:16:31.122530289 +0100 +++ gcc/testsuite/gcc.dg/vect/pr87288-2.c 2018-09-20 12:39:19.009518233 +0100 @@ -0,0 +1,64 @@ +#include "tree-vect.h" + +#define N (VECTOR_BITS / 32) +#define MAX_COUNT 4 + +#define RUN_COUNT(COUNT) \ + void __attribute__ ((noipa)) \ + run_##COUNT (int *restrict a, int *restrict b) \ + { \ + for (int i = 0; i < N * COUNT; ++i) \ + { \ + a[i * 2] = b[i * 2] + COUNT; \ + a[i * 2 + 1] = COUNT; \ + } \ + } + 
+RUN_COUNT (1) +RUN_COUNT (2) +RUN_COUNT (3) +RUN_COUNT (4) + +void __attribute__ ((noipa)) +check (int *restrict a, int count) +{ + for (int i = 0; i < count * N; ++i) + if (a[i * 2] != i * 41 + count || a[i * 2 + 1] != count) + __builtin_abort (); + if (a[count * 2 * N] != 999) + __builtin_abort (); +} + +int a[N * MAX_COUNT * 2 + 1], b[N * MAX_COUNT * 2]; + +int +main (void) +{ + check_vect (); + + for (int i = 0; i < N * MAX_COUNT; ++i) + { + b[i * 2] = i * 41; + asm volatile ("" ::: "memory"); + } + + a[N * 2] = 999; + run_1 (a, b); + check (a, 1); + + a[N * 4] = 999; + run_2 (a, b); + check (a, 2); + + a[N * 6] = 999; + run_3 (a, b); + check (a, 3); + + a[N * 8] = 999; + run_4 (a, b); + check (a, 4); + + return 0; +} + +/* { dg-final { scan-tree-dump {LOOP VECTORIZED} "vect" { target { { vect_int && vect_perm } && vect_element_align } } } } */ Index: gcc/testsuite/gcc.dg/vect/pr87288-3.c =================================================================== --- /dev/null 2018-09-14 11:16:31.122530289 +0100 +++ gcc/testsuite/gcc.dg/vect/pr87288-3.c 2018-09-20 12:39:19.009518233 +0100 @@ -0,0 +1,64 @@ +#include "tree-vect.h" + +#define N (VECTOR_BITS / 32) +#define MAX_COUNT 4 + +#define RUN_COUNT(COUNT) \ + void __attribute__ ((noipa)) \ + run_##COUNT (int *restrict a, int *restrict b) \ + { \ + for (int i = 0; i < N * COUNT + 1; ++i) \ + { \ + a[i * 2] = b[i * 2] + COUNT; \ + a[i * 2 + 1] = COUNT; \ + } \ + } + +RUN_COUNT (1) +RUN_COUNT (2) +RUN_COUNT (3) +RUN_COUNT (4) + +void __attribute__ ((noipa)) +check (int *restrict a, int count) +{ + for (int i = 0; i < count * N + 1; ++i) + if (a[i * 2] != i * 41 + count || a[i * 2 + 1] != count) + __builtin_abort (); + if (a[count * 2 * N + 2] != 999) + __builtin_abort (); +} + +int a[N * MAX_COUNT * 2 + 3], b[N * MAX_COUNT * 2 + 2]; + +int +main (void) +{ + check_vect (); + + for (int i = 0; i < N * MAX_COUNT + 1; ++i) + { + b[i * 2] = i * 41; + asm volatile ("" ::: "memory"); + } + + a[N * 2 + 2] = 999; + run_1 (a, 
b); + check (a, 1); + + a[N * 4 + 2] = 999; + run_2 (a, b); + check (a, 2); + + a[N * 6 + 2] = 999; + run_3 (a, b); + check (a, 3); + + a[N * 8 + 2] = 999; + run_4 (a, b); + check (a, 4); + + return 0; +} + +/* { dg-final { scan-tree-dump {LOOP VECTORIZED} "vect" { target { { vect_int && vect_perm } && vect_element_align } } } } */