diff mbox series

[nvptx,PR85204] Fix neutering of bb with only cond jump

Message ID fcfdc3be-bd2b-5484-8991-8aabbf23a7f3@mentor.com
State New
Headers show
Series [nvptx,PR85204] Fix neutering of bb with only cond jump | expand

Commit Message

Tom de Vries April 5, 2018, 8:41 a.m. UTC
Hi,

When compiling the test-case in the patch, the following ptx code is 
generated:
...
$L4:
   @ %r91 bra.uni $L24;
   selp.u32 %r95,1,0,%r80;
   st.shared.u32 [__worker_bcast],%r95;
  $L25:
  $L24:
   @ %r92 bra $L25;
...

There's an eternal loop starting at the last insn, and unsurprisingly 
the test-case hangs.

The last insn is a vector neutering branch, which should have been 
inserted after the worker neutering branch (the first insn).

In other words, we want:
...
  $L4:
    @ %r91 bra.uni $L24;
+  @ %r92 bra $L25;
    selp.u32 %r95,1,0,%r80;
    st.shared.u32 [__worker_bcast],%r95;
   $L25:
   $L24:
-  @ %r92 bra $L25;
...

This minimal stage4 patch fixes this problem. [ I filed a PR85223 
"[nvptx] nvptx_single needs rewrite" for a stage1 rewrite of nvptx_single. ]

Build x86_64 with nvptx accelerator, and tested libgomp.

Committed to stage4 trunk.

Thanks,
- Tom
diff mbox series

Patch

[nvptx] Fix neutering of bb with only cond jump

2018-04-05  Tom de Vries  <tom@codesourcery.com>

	PR target/85204
	* config/nvptx/nvptx.c (nvptx_single): Fix neutering of bb with only
	cond jump.

	* testsuite/libgomp.oacc-c-c++-common/broadcast-1.c: New test.

---
 gcc/config/nvptx/nvptx.c                           |  6 ++-
 .../libgomp.oacc-c-c++-common/broadcast-1.c        | 49 ++++++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index b2b150f..a9a3053 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4048,6 +4048,7 @@  nvptx_single (unsigned mask, basic_block from, basic_block to)
   /* Insert the vector test inside the worker test.  */
   unsigned mode;
   rtx_insn *before = tail;
+  rtx_insn *neuter_start = NULL;
   for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
     if (GOMP_DIM_MASK (mode) & skip_mask)
       {
@@ -4065,7 +4066,10 @@  nvptx_single (unsigned mask, basic_block from, basic_block to)
 	  br = gen_br_true (pred, label);
 	else
 	  br = gen_br_true_uni (pred, label);
-	emit_insn_before (br, head);
+	if (neuter_start)
+	  neuter_start = emit_insn_after (br, neuter_start);
+	else
+	  neuter_start = emit_insn_before (br, head);
 
 	LABEL_NUSES (label)++;
 	if (tail_branch)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-1.c
new file mode 100644
index 0000000..ca0d37b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-1.c
@@ -0,0 +1,49 @@ 
+/* Ensure that worker-vector state conditional expressions are
+   properly handled by the nvptx backend.  */
+
+#include <assert.h>
+#include <math.h>
+
+
+#define N 1024
+
+int A[N][N] ;
+
+void test(int x)
+{
+#pragma acc parallel  num_gangs(16) num_workers(4) vector_length(32) copyout(A)
+  {
+#pragma acc loop gang
+    for(int j=0;j<N;j++)
+      {
+	if (x==1)
+	  {
+#pragma acc loop worker vector
+	    for(int i=0;i<N;i++)
+	      A[i][j] = 1;
+	  }
+	else
+	  {
+#pragma acc loop worker vector
+	    for(int i=0;i<N;i++)
+	      A[i][j] = -1;
+	  }
+      }
+  }
+}
+
+
+int main(void)
+{
+  test (0);
+  for (int i = 0; i < N; i++)
+    for (int j = 0; j < N; j++)
+      assert (A[i][j] == -1);
+
+  test (1);
+  for (int i = 0; i < N; i++)
+    for (int j = 0; j < N; j++)
+      assert (A[i][j] == 1);
+
+  return 0;
+}