diff mbox series

[nvptx,PR85653,committed] Add workaround for subsequent bar.syncs

Message ID 179a62dd-b889-b81f-250a-d06afc15f334@mentor.com
State New
Headers show
Series [nvptx,PR85653,committed] Add workaround for subsequent bar.syncs | expand

Commit Message

Tom de Vries May 5, 2018, 8:11 a.m. UTC
Hi,

when compiling this testcase with the og7 branch without the recently 
committed "[nvptx, openacc] Don't emit barriers for empty loops":
...
int
main (void)
{
   long long v1;
#pragma acc parallel num_gangs (640) num_workers(1) vector_length (128)
#pragma acc loop
   for (v1 = 0; v1 < 20; v1 += 2)
     ;

   return 0;
}
...

this ptx is generated:
...
{

   bar.sync 0;
   bar.sync 0;
   ret;
}
...

This triggers some bug in the ptxas/JIT compiler that hangs the testcase 
on my quadro m1200.

We can work around this by adding two membar.ctas inbetween.

To the best of my knowledge, this is currently not triggering on trunk, 
but I'd rather have the workaround in place in case future changes will 
produce subsequent barsyncs again.

Build trunk with x86_64 with nvptx accelerator and tested libgomp.

Build og7 branch (both with the patch mentioned above in place, and 
reverted) with x86_64 with nvptx accelerator and tested libgomp.

Committed to trunk.

Thanks,
- Tom
diff mbox series

Patch

[nvptx] Add workaround for subsequent bar.syncs

2018-05-04  Tom de Vries  <tom@codesourcery.com>

	PR target/85653
	* config/nvptx/nvptx.c (WORKAROUND_PTXJIT_BUG_3): Define.
	(workaround_barsyncs): New function.
	(nvptx_reorg): Use workaround_barsyncs.
	* config/nvptx/nvptx.md (define_c_enum "unspecv"): Add UNSPECV_MEMBAR.
	(define_expand "nvptx_membar_cta"): New define_expand.
	(define_insn "*nvptx_membar_cta"): New insn.

---
 gcc/config/nvptx/nvptx.c  | 49 +++++++++++++++++++++++++++++++++++++++++++++++
 gcc/config/nvptx/nvptx.md | 17 ++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index a0c7bc1..5608bee 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -79,6 +79,7 @@ 
 
 #define WORKAROUND_PTXJIT_BUG 1
 #define WORKAROUND_PTXJIT_BUG_2 1
+#define WORKAROUND_PTXJIT_BUG_3 1
 
 /* The various PTX memory areas an object might reside in.  */
 enum nvptx_data_area
@@ -4647,6 +4648,50 @@  prevent_branch_around_nothing (void)
   }
 #endif
 
+#ifdef WORKAROUND_PTXJIT_BUG_3
+/* Insert two membar.cta insns inbetween two subsequent bar.sync insns.  This
+   works around a hang observed at driver version 390.48 for sm_50.  */
+
+static void
+workaround_barsyncs (void)
+{
+  bool seen_barsync = false;
+  for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
+    {
+      if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
+	{
+	  if (seen_barsync)
+	    {
+	      emit_insn_before (gen_nvptx_membar_cta (), insn);
+	      emit_insn_before (gen_nvptx_membar_cta (), insn);
+	    }
+
+	  seen_barsync = true;
+	  continue;
+	}
+
+      if (!seen_barsync)
+	continue;
+
+      if (NOTE_P (insn) || DEBUG_INSN_P (insn))
+	continue;
+      else if (INSN_P (insn))
+	switch (recog_memoized (insn))
+	  {
+	  case CODE_FOR_nvptx_fork:
+	  case CODE_FOR_nvptx_forked:
+	  case CODE_FOR_nvptx_joining:
+	  case CODE_FOR_nvptx_join:
+	    continue;
+	  default:
+	    break;
+	  }
+
+      seen_barsync = false;
+    }
+}
+#endif
+
 /* PTX-specific reorganization
    - Split blocks at fork and join instructions
    - Compute live registers
@@ -4730,6 +4775,10 @@  nvptx_reorg (void)
   prevent_branch_around_nothing ();
 #endif
 
+#ifdef WORKAROUND_PTXJIT_BUG_3
+  workaround_barsyncs ();
+#endif
+
   regstat_free_n_sets_and_refs ();
 
   df_finish_pass (true);
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 68bba36..9754219 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -56,6 +56,7 @@ 
    UNSPECV_XCHG
    UNSPECV_BARSYNC
    UNSPECV_MEMBAR
+   UNSPECV_MEMBAR_CTA
    UNSPECV_DIM_POS
 
    UNSPECV_FORK
@@ -1481,6 +1482,22 @@ 
   "\\tmembar.sys;"
   [(set_attr "predicable" "false")])
 
+(define_expand "nvptx_membar_cta"
+  [(set (match_dup 0)
+	(unspec_volatile:BLK [(match_dup 0)] UNSPECV_MEMBAR_CTA))]
+  ""
+{
+  operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+  MEM_VOLATILE_P (operands[0]) = 1;
+})
+
+(define_insn "*nvptx_membar_cta"
+  [(set (match_operand:BLK 0 "" "")
+	(unspec_volatile:BLK [(match_dup 0)] UNSPECV_MEMBAR_CTA))]
+  ""
+  "\\tmembar.cta;"
+  [(set_attr "predicable" "false")])
+
 (define_insn "nvptx_nounroll"
   [(unspec_volatile [(const_int 0)] UNSPECV_NOUNROLL)]
   ""