[nvptx, openacc] Don't emit barriers for empty loops
2018-04-21 Tom de Vries <tom@codesourcery.com>
PR target/85381
* config/nvptx/nvptx.c (nvptx_process_pars): Don't emit barriers for
empty loops.
* testsuite/libgomp.oacc-c-c++-common/pr85381-2.c: New test.
* testsuite/libgomp.oacc-c-c++-common/pr85381-3.c: New test.
* testsuite/libgomp.oacc-c-c++-common/pr85381-4.c: New test.
* testsuite/libgomp.oacc-c-c++-common/pr85381-5.c: New test.
* testsuite/libgomp.oacc-c-c++-common/pr85381.c: New test.
---
gcc/config/nvptx/nvptx.c | 15 +++++++---
.../libgomp.oacc-c-c++-common/pr85381-2.c | 35 ++++++++++++++++++++++
.../libgomp.oacc-c-c++-common/pr85381-3.c | 34 +++++++++++++++++++++
.../libgomp.oacc-c-c++-common/pr85381-4.c | 26 ++++++++++++++++
.../libgomp.oacc-c-c++-common/pr85381-5.c | 23 ++++++++++++++
.../testsuite/libgomp.oacc-c-c++-common/pr85381.c | 17 +++++++++++
6 files changed, 146 insertions(+), 4 deletions(-)
@@ -4467,9 +4467,12 @@ nvptx_process_pars (parallel *par)
{
nvptx_shared_propagate (false, is_call, par->forked_block,
par->forked_insn, !worker);
- bool empty = nvptx_shared_propagate (true, is_call,
- par->forked_block, par->fork_insn,
- !worker);
+ bool no_prop_p
+ = nvptx_shared_propagate (true, is_call, par->forked_block,
+ par->fork_insn, !worker);
+ bool empty_loop_p
+ = !is_call && (NEXT_INSN (par->forked_insn)
+ && NEXT_INSN (par->forked_insn) == par->joining_insn);
rtx barrier = GEN_INT (0);
int threads = 0;
@@ -4479,7 +4482,11 @@ nvptx_process_pars (parallel *par)
threads = nvptx_mach_vector_length ();
}
- if (!empty || !is_call)
+ if (no_prop_p && empty_loop_p)
+ ;
+ else if (no_prop_p && is_call)
+ ;
+ else
{
/* Insert begin and end synchronizations. */
emit_insn_before (nvptx_cta_sync (barrier, threads),
new file mode 100644
@@ -0,0 +1,35 @@
+/* { dg-additional-options "-save-temps" } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1 -O2" } } */
+
+int
+main (void)
+{
+ int v1;
+
+ #pragma acc parallel
+ #pragma acc loop worker
+ for (v1 = 0; v1 < 20; v1 += 2)
+ ;
+
+ return 0;
+}
+
+/* Todo: Boths bar.syncs can be removed.
+ Atm we generate this dead code inbetween forked and joining:
+
+ mov.u32 %r28, %ntid.y;
+ mov.u32 %r29, %tid.y;
+ add.u32 %r30, %r29, %r29;
+ setp.gt.s32 %r31, %r30, 19;
+ @%r31 bra $L2;
+ add.u32 %r25, %r28, %r28;
+ mov.u32 %r24, %r30;
+ $L3:
+ add.u32 %r24, %r24, %r25;
+ setp.le.s32 %r33, %r24, 19;
+ @%r33 bra $L3;
+ $L2:
+
+ so the loop is not recognized as empty loop (which we detect by seeing if
+ joining immediately follows forked). */
+/* { dg-final { scan-assembler-times "bar.sync" 2 } } */
new file mode 100644
@@ -0,0 +1,34 @@
+/* { dg-additional-options "-save-temps -w" } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1 -O2" } } */
+
+int a;
+#pragma acc declare create(a)
+
+#pragma acc routine vector
+void __attribute__((noinline, noclone))
+foo_v (void)
+{
+ a = 1;
+}
+
+#pragma acc routine worker
+void __attribute__((noinline, noclone))
+foo_w (void)
+{
+ a = 2;
+}
+
+int
+main (void)
+{
+
+ #pragma acc parallel
+ foo_v ();
+
+ #pragma acc parallel
+ foo_w ();
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-not "bar.sync" } } */
new file mode 100644
@@ -0,0 +1,26 @@
+/* { dg-additional-options "-save-temps -w" } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1 -O2" } } */
+
+#define n 1024
+
+int
+main (void)
+{
+ #pragma acc parallel
+ {
+ #pragma acc loop worker
+ for (int i = 0; i < n; i++)
+ ;
+
+ #pragma acc loop worker
+ for (int i = 0; i < n; i++)
+ ;
+ }
+
+ return 0;
+}
+
+/* Atm, %ntid.y is broadcast from one loop to the next, so there are 2 bar.syncs
+ for that (the other two are there for the same reason as in pr85381-2.c).
+ Todo: Recompute %ntid.y instead of broadcasting it. */
+/* { dg-final { scan-assembler-times "bar.sync" 4 } } */
new file mode 100644
@@ -0,0 +1,23 @@
+/* { dg-additional-options "-save-temps" } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1 -O2" } } */
+
+#define n 1024
+
+int
+main (void)
+{
+ #pragma acc parallel vector_length(128)
+ {
+ #pragma acc loop vector
+ for (int i = 0; i < n; i++)
+ ;
+
+ #pragma acc loop vector
+ for (int i = 0; i < n; i++)
+ ;
+ }
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-not "bar.sync" } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* { dg-additional-options "-save-temps" } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_DEVICE_TYPE_nvidia=1 -O2" } } */
+
+int
+main (void)
+{
+ int v1;
+
+ #pragma acc parallel vector_length (128)
+ #pragma acc loop vector
+ for (v1 = 0; v1 < 20; v1 += 2)
+ ;
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-not "bar.sync" } } */