diff mbox

[gomp4] Remove some ptxness from middle end

Message ID 55883F43.2080604@codesourcery.com
State New
Headers show

Commit Message

Nathan Sidwell June 22, 2015, 5 p.m. UTC
I've committed this patch to the gomp4 branch, after testing.  It does a number 
of cleanups

1) removes the ptx-specific TID, NTID, CTAID & NCTAID builtins, replacing them 
with openacc-specific GOACC_id and GOACC_nid builtins, using gang/worker & 
vector level enumeration.  These are mapped by the PTX backend to PTX-specifc 
instructions.

2) Created a  oacc_loop_levels enumeration, and generate the loop nest masks 
from that.

3) Removed a bunch of duplicate calculations in omp-low related to determining 
number of threads and thread index. With #2 it becomes easier to use a loop.

nathan

Comments

Marek Polacek June 22, 2015, 5:04 p.m. UTC | #1
On Mon, Jun 22, 2015 at 01:00:51PM -0400, Nathan Sidwell wrote:
> +  if (GET_CODE (arg) != CONST_INT
> +      || (unsigned HOST_WIDE_INT)INTVAL (arg) >= OACC_HWM)

Don't we have UINTVAL for this?  So UINTVAL (arg).

	Marek
Nathan Sidwell June 22, 2015, 5:07 p.m. UTC | #2
On 06/22/15 13:04, Marek Polacek wrote:
> On Mon, Jun 22, 2015 at 01:00:51PM -0400, Nathan Sidwell wrote:
>> +  if (GET_CODE (arg) != CONST_INT
>> +      || (unsigned HOST_WIDE_INT)INTVAL (arg) >= OACC_HWM)
>
> Don't we have UINTVAL for this?  So UINTVAL (arg).

Oh, thanks! will fix

nathan
diff mbox

Patch

2015-06-20  Nathan Sidwell  <nathan@codesourcery.com>

	gcc/
	* omp-builtins.def (BUILT_IN_GOACC_NTID, BUILTIN_NCTAID): Replace
	with ...
	(BUILT_IN_GOACC_NID): ... this.
	(BUILT_IN_GOACC_TID, BUILTIN_CTAID): Replace with ...
	(BUILT_IN_GOACC_ID): ... this.
	* builtins.c: Include omp-low.h.
	(expand_oacc_buoltin): Replace with ...
	(expand_oacc_id): ... this.
	(expand_builtin, is_simple_builtin): Adjust.oo
	* omp-low.h (enum oacc_loop_levels): New.
	* omp-low.c (MASK_GANG, MASK_WORKER, MASK_VECTOR): Replace with ...
	(OACC_LOOP_MASK): ... this.
	(scan_omp_for, scan_omp_target): Adjust.
	(expand_oacc_get_num_threads): Adjust and use a loop.
	(expand_oacc_get_thread_num): Likewise.
	(oacc_loop_needs_thread_barrier_p, find_omp_for_region_gwv,
	find_omp_taarget_region_data, required_predication_mask,
	generate_vector_broadcast, generate_oacc_broadcast): Adjust.
	(make_predication_test): Adjust and use a loop.
	(predicate_bb, oacc_broadcast, oacc_init_count_vars): Adjust.
	* config/nvptx/nvptx.md (UNSPEC_NTID, UNSPEC_TID, UNSPEC_NCTAID,
	UNSPEC_CTAID): Replace with ...
	(UNSPEC_NID, UNSPEC_ID): ... these.
	(*oacc_ntid_insn, oacc_ntid, *oacc_tid_insn, oacc_tid,
	*oacc_nctaid_insn, oacc_nctaid, *oacc_ctaid_insn,
	oacc_ctaid): Replace with ...
	(oacc_nid, oacc_id): ... these.
	* config/nvptx/nvptx.c (nvptx_print_operand [CASE 'd']): Remove.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/gang-static-2.c: Replace
	GOACC_ctaid builtin with GOACC_id.

Index: libgomp/testsuite/libgomp.oacc-c-c++-common/gang-static-2.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/gang-static-2.c	(revision 224671)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/gang-static-2.c	(working copy)
@@ -35,38 +35,38 @@  main ()
 
 #pragma acc parallel loop gang (static:*) num_gangs (10)
   for (i = 0; i < 100; i++)
-    a[i] = __builtin_GOACC_ctaid (0);
+    a[i] = __builtin_GOACC_id (0);
 
   test_nonstatic (a, 10);
 
 #pragma acc parallel loop gang (static:1) num_gangs (10)
   for (i = 0; i < 100; i++)
-    a[i] = __builtin_GOACC_ctaid (0);
+    a[i] = __builtin_GOACC_id (0);
 
   test_static (a, 10, 1);
 
 #pragma acc parallel loop gang (static:2) num_gangs (10)
   for (i = 0; i < 100; i++)
-    a[i] = __builtin_GOACC_ctaid (0);
+    a[i] = __builtin_GOACC_id (0);
 
   test_static (a, 10, 2);
 
 #pragma acc parallel loop gang (static:5) num_gangs (10)
   for (i = 0; i < 100; i++)
-    a[i] = __builtin_GOACC_ctaid (0);
+    a[i] = __builtin_GOACC_id (0);
 
   test_static (a, 10, 5);
 
 #pragma acc parallel loop gang (static:20) num_gangs (10)
   for (i = 0; i < 100; i++)
-    a[i] = __builtin_GOACC_ctaid (0);
+    a[i] = __builtin_GOACC_id (0);
 
   test_static (a, 10, 20);
 
   /* Non-static gang.  */
 #pragma acc parallel loop gang num_gangs (10)
   for (i = 0; i < 100; i++)
-    a[i] = __builtin_GOACC_ctaid (0);
+    a[i] = __builtin_GOACC_id (0);
 
   test_nonstatic (a, 10);
 
Index: gcc/omp-builtins.def
===================================================================
--- gcc/omp-builtins.def	(revision 224671)
+++ gcc/omp-builtins.def	(working copy)
@@ -61,13 +61,9 @@  DEF_GOACC_BUILTIN_FNSPEC (BUILT_IN_GOACC
 DEF_GOACC_BUILTIN (BUILT_IN_GOACC_WAIT, "GOACC_wait",
 		   BT_FN_VOID_INT_INT_VAR,
 		   ATTR_NOTHROW_LIST)
-DEF_GOACC_BUILTIN (BUILT_IN_GOACC_NTID, "GOACC_ntid",
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_ID, "GOACC_id",
 		   BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
-DEF_GOACC_BUILTIN (BUILT_IN_GOACC_TID, "GOACC_tid",
-		   BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
-DEF_GOACC_BUILTIN (BUILT_IN_GOACC_NCTAID, "GOACC_nctaid",
-		   BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
-DEF_GOACC_BUILTIN (BUILT_IN_GOACC_CTAID, "GOACC_ctaid",
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_NID, "GOACC_nid",
 		   BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
 DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_GANGLOCAL_PTR, "GOACC_get_ganglocal_ptr",
 		   BT_FN_PTR, ATTR_NOTHROW_LEAF_LIST)
Index: gcc/config/nvptx/nvptx.md
===================================================================
--- gcc/config/nvptx/nvptx.md	(revision 224671)
+++ gcc/config/nvptx/nvptx.md	(working copy)
@@ -49,10 +49,8 @@ 
 
    UNSPEC_ALLOCA
 
-   UNSPEC_NTID
-   UNSPEC_TID
-   UNSPEC_NCTAID
-   UNSPEC_CTAID
+   UNSPEC_NID
+   UNSPEC_ID
 
    UNSPEC_SHARED_DATA
 ])
@@ -1263,65 +1261,32 @@ 
   DONE;
 })
 
-(define_insn "*oacc_ntid_insn"
-  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "n")] UNSPEC_NTID))]
-  ""
-  "%.\\tmov.u32 %0, %%ntid%d1;")
-
-(define_expand "oacc_ntid"
-  [(set (match_operand:SI 0 "nvptx_register_operand" "")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "")] UNSPEC_NTID))]
-  ""
-{
-  if (INTVAL (operands[1]) < 0 || INTVAL (operands[1]) > 2)
-    FAIL;
-})
-
-(define_insn "*oacc_tid_insn"
-  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "n")] UNSPEC_TID))]
-  ""
-  "%.\\tmov.u32 %0, %%tid%d1;")
-
-(define_expand "oacc_tid"
+(define_insn "oacc_nid"
   [(set (match_operand:SI 0 "nvptx_register_operand" "")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "")] UNSPEC_TID))]
+	(unspec:SI [(match_operand:SI 1 "const_int_operand" "")] UNSPEC_NID))]
   ""
 {
-  if (INTVAL (operands[1]) < 0 || INTVAL (operands[1]) > 2)
-    FAIL;
+  static const char *const asms[] =
+{ /* Must match oacc_loop_levels ordering.  */
+  "%.\\tmov.u32 %0, %%nctaid.x;",/* gang */
+  "%.\\tmov.u32 %0, %%ntid.y;",	/* worker */
+  "%.\\tmov.u32 %0, %%ntid.x;",	/* vector */
+};
+  return asms[INTVAL (operands[1])];
 })
 
-;; Number of CUDA grids (CPA = Cooperative Thread Arrays)
-(define_insn "*oacc_nctaid_insn"
-  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "n")] UNSPEC_NCTAID))]
-  ""
-  "%.\\tmov.u32 %0, %%nctaid%d1;")
-
-(define_expand "oacc_nctaid"
-  [(set (match_operand:SI 0 "nvptx_register_operand" "")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "")] UNSPEC_NCTAID))]
-  ""
-{
-  if (INTVAL (operands[1]) < 0 || INTVAL (operands[1]) > 2)
-    FAIL;
-})
-
-(define_insn "*oacc_ctaid_insn"
-  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "n")] UNSPEC_CTAID))]
-  ""
-  "%.\\tmov.u32 %0, %%ctaid%d1;")
-
-(define_expand "oacc_ctaid"
+(define_insn "oacc_id"
   [(set (match_operand:SI 0 "nvptx_register_operand" "")
-	(unspec:SI [(match_operand:SI 1 "const_int_operand" "")] UNSPEC_CTAID))]
+	(unspec:SI [(match_operand:SI 1 "const_int_operand" "")] UNSPEC_ID))]
   ""
 {
-  if (INTVAL (operands[1]) < 0 || INTVAL (operands[1]) > 2)
-    FAIL;
+  static const char *const asms[] =
+{ /* Must match oacc_loop_levels ordering.  */
+  "%.\\tmov.u32 %0, %%ctaid.x;",/* gang */
+  "%.\\tmov.u32 %0, %%tid.y;",	/* worker */
+  "%.\\tmov.u32 %0, %%tid.x;",	/* vector */
+};
+  return asms[INTVAL (operands[1])];
 })
 
 (define_insn "oacc_thread_broadcastsi"
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 224671)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -1673,7 +1673,6 @@  condition_unidirectional_p (rtx cond)
 
    A -- print an address space identifier for a MEM
    c -- print an opcode suffix for a comparison operator, including a type code
-   d -- print a CONST_INT as a vector dimension (x, y, or z)
    f -- print a full reg even for something that must always be split
    t -- print a type opcode suffix, promoting QImode to 32 bits
    T -- print a type size in bits
@@ -1718,18 +1717,6 @@  nvptx_print_operand (FILE *file, rtx x,
       }
       break;
 
-    case 'd':
-      gcc_assert (x_code == CONST_INT);
-      if (INTVAL (x) == 0)
-	fputs (".x", file);
-      else if (INTVAL (x) == 1)
-	fputs (".y", file);
-      else if (INTVAL (x) == 2)
-	fputs (".z", file);
-      else
-	gcc_unreachable ();
-      break;
-
     case 't':
       op_mode = nvptx_underlying_object_mode (x);
       fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, true));
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 224671)
+++ gcc/omp-low.c	(working copy)
@@ -172,9 +172,7 @@  struct omp_region
 
 /* Levels of parallelism as defined by OpenACC.  Increasing numbers
    correspond to deeper loop nesting levels.  */
-#define MASK_GANG 1
-#define MASK_WORKER 2
-#define MASK_VECTOR 4
+#define OACC_LOOP_MASK(X) (1 << (X))
 
 /* Context structure.  Used to store information about each parallel
    directive in the code.  */
@@ -2967,17 +2965,17 @@  scan_omp_for (gomp_for *stmt, omp_contex
 	  int val;
 	  if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_GANG)
 	    {
-	      val = MASK_GANG;
+	      val = OACC_LOOP_MASK (OACC_gang);
 	      gwv_clause = true;
 	    }
 	  else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_WORKER)
 	    {
-	      val = MASK_WORKER;
+	      val = OACC_LOOP_MASK (OACC_worker);
 	      gwv_clause = true;
 	    }
 	  else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR)
 	    {
-	      val = MASK_VECTOR;
+	      val = OACC_LOOP_MASK (OACC_vector);
 	      gwv_clause = true;
 	    }
 	  else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_SEQ)
@@ -3122,11 +3120,11 @@  scan_omp_target (gomp_target *stmt, omp_
       for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
 	{
 	  if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_GANGS)
-	    ctx->gwv_this |= MASK_GANG;
+	    ctx->gwv_this |= OACC_LOOP_MASK (OACC_gang);
 	  else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_WORKERS)
-	    ctx->gwv_this |= MASK_WORKER;
+	    ctx->gwv_this |= OACC_LOOP_MASK (OACC_worker);
 	  else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR_LENGTH)
-	    ctx->gwv_this |= MASK_VECTOR;
+	    ctx->gwv_this |= OACC_LOOP_MASK (OACC_vector);
 	}
     }
 
@@ -4992,53 +4990,25 @@  is_atomic_compatible_reduction (tree var
 static tree
 expand_oacc_get_num_threads (gimple_seq *seq, int gwv_bits)
 {
-  tree res = NULL_TREE;
-  tree u0 = fold_convert (unsigned_type_node, integer_zero_node);
-  tree u1 = fold_convert (unsigned_type_node, integer_one_node);
-
-  if (gwv_bits & MASK_GANG)
-    {
-      tree decl = builtin_decl_explicit (BUILT_IN_GOACC_NCTAID);
-      tree gang_count = create_tmp_var (unsigned_type_node);
-      gimple call = gimple_build_call (decl, 1, u0);
-      gimple_call_set_lhs (call, gang_count);
-      gimple_seq_add_stmt (seq, call);
-      res = gang_count;
-    }
-  
-  if (gwv_bits & MASK_WORKER)
-    {
-      tree decl = builtin_decl_explicit (BUILT_IN_GOACC_NTID);
-      tree worker_count = create_tmp_var (unsigned_type_node);
-      gimple call = gimple_build_call (decl, 1, u1);
-      gimple_call_set_lhs (call, worker_count);
-      gimple_seq_add_stmt (seq, call);
-      if (res != NULL_TREE)
-        res = fold_build2 (MULT_EXPR, unsigned_type_node, res, worker_count);
-      else
-        res = worker_count;
-    }
-  
-  if (gwv_bits & MASK_VECTOR)
-    {
-      tree decl = builtin_decl_explicit (BUILT_IN_GOACC_NTID);
-      tree vector_length = create_tmp_var (unsigned_type_node);
-      gimple call = gimple_build_call (decl, 1, u0);
-      gimple_call_set_lhs (call, vector_length);
-      gimple_seq_add_stmt (seq, call);
-      if (res != NULL_TREE)
-	res = fold_build2 (MULT_EXPR, unsigned_type_node, res, vector_length);
-      else
-	res = vector_length;
-    }
+  tree res = build_int_cst (unsigned_type_node, 1);
+  tree  decl = builtin_decl_explicit (BUILT_IN_GOACC_NID);
+  unsigned ix;
 
-  if (res == NULL_TREE)
-    res = u1;
+  for (ix = 0; (1 << ix) <= gwv_bits; ix++)
+    if ((1 << ix) & gwv_bits)
+      {
+	tree arg = build_int_cst (unsigned_type_node, ix);
+	tree count = create_tmp_var (unsigned_type_node);
+	gimple call = gimple_build_call (decl, 1, arg);
+	
+	gimple_call_set_lhs (call, count);
+	gimple_seq_add_stmt (seq, call);
+	res = fold_build2 (MULT_EXPR, unsigned_type_node, res, count);
+      }
   
   return res;
 }
 
-
 /* Find the current thread number to use within a region partitioned by
    GWV_BITS.  Setup code required for the calculation is added to SEQ.  See
    note for expand_oacc_get_num_threads above re: builtin usage.  */
@@ -5047,90 +5017,43 @@  static tree
 expand_oacc_get_thread_num (gimple_seq *seq, int gwv_bits)
 {
   tree res = NULL_TREE;
-  tree u0 = fold_convert (unsigned_type_node, integer_zero_node);
-  tree u1 = fold_convert (unsigned_type_node, integer_one_node);
-  tree vector_count = NULL_TREE;
-  tree tid_decl = builtin_decl_explicit (BUILT_IN_GOACC_TID);
-  tree ntid_decl = builtin_decl_explicit (BUILT_IN_GOACC_NTID);
-
-  if (gwv_bits & MASK_VECTOR)
-    {
-      tree vector_id = create_tmp_var (unsigned_type_node);
-      gimple call = gimple_build_call (tid_decl, 1, u0);
-      gimple_call_set_lhs (call, vector_id);
-      gimple_seq_add_stmt (seq, call);
-      res = vector_id;
-    }
-
-  if (gwv_bits & MASK_WORKER)
-    {
-      tree worker_id = create_tmp_var (unsigned_type_node);
-      gimple call = gimple_build_call (tid_decl, 1, u1);
-      gimple_call_set_lhs (call, worker_id);
-      gimple_seq_add_stmt (seq, call);
-      if (res != NULL_TREE)
-	{
-	  vector_count = create_tmp_var (unsigned_type_node);
-	  call = gimple_build_call (ntid_decl, 1, u0);
-	  gimple_call_set_lhs (call, vector_count);
-	  gimple_seq_add_stmt (seq, call);
-	  res = fold_build2 (PLUS_EXPR, unsigned_type_node,
-			     fold_build2 (MULT_EXPR, unsigned_type_node,
-					  vector_count, worker_id), res);
-	}
-      else
-	res = worker_id;
-    }
-
-  if (gwv_bits & MASK_GANG)
-    {
-      tree worker_count;
-      tree ctaid_decl = builtin_decl_explicit (BUILT_IN_GOACC_CTAID);
-      tree gang_id = create_tmp_var (unsigned_type_node);
-      gimple call = gimple_build_call (ctaid_decl, 1, u0);
-      gimple_call_set_lhs (call, gang_id);
-      gimple_seq_add_stmt (seq, call);
+  tree id_decl = builtin_decl_explicit (BUILT_IN_GOACC_ID);
+  tree nid_decl = builtin_decl_explicit (BUILT_IN_GOACC_NID);
+  unsigned ix;
 
-      if (gwv_bits & MASK_WORKER)
-	{
-	  worker_count = create_tmp_var (unsigned_type_node);
-	  call = gimple_build_call (ntid_decl, 1, u1);
-	  gimple_call_set_lhs (call, worker_count);
-	  gimple_seq_add_stmt (seq, call);
-	}
-      else
-	worker_count = u1;
+  /* Start at gang level, and examine relevant dimension indices.  */
+  for (ix = 0; (1 << ix) <= gwv_bits; ix++)
+    if ((1 << ix) & gwv_bits)
+      {
+	tree arg = build_int_cst (unsigned_type_node, ix);
 
-      if (gwv_bits & MASK_VECTOR)
-	{
-	  if (vector_count == NULL_TREE)
-	    {
-	      vector_count = create_tmp_var (unsigned_type_node);
-	      call = gimple_build_call (ntid_decl, 1, u0);
-	      gimple_call_set_lhs (call, vector_count);
-	      gimple_seq_add_stmt (seq, call);
-	    }
-	}
-      else
-	vector_count = u1;
+	if (res)
+	  {
+	    /* We had an outer index, so scale that by the size of
+	       this dimension.  */
+	    tree n = create_tmp_var (unsigned_type_node);
+	    gimple call = gimple_build_call (nid_decl, 1, arg);
+	    
+	    gimple_call_set_lhs (call, n);
+	    gimple_seq_add_stmt (seq, call);
+	    res = fold_build2 (MULT_EXPR, unsigned_type_node, res, n);
+	  }
 
-      if (gwv_bits & (MASK_WORKER | MASK_VECTOR))
-	{
-	  gcc_assert (res != NULL_TREE);
-	  res = fold_build2 (PLUS_EXPR, unsigned_type_node,
-		  fold_build2 (MULT_EXPR, unsigned_type_node,
-			       fold_build2 (MULT_EXPR, unsigned_type_node,
-					    worker_count, vector_count),
-			       gang_id),
-		  res);
-	}
-      else
-	res = gang_id;
-    }
+	/* Determine index in this dimension.  */
+	tree id = create_tmp_var (unsigned_type_node);
+	gimple call = gimple_build_call (id_decl, 1, arg);
+	
+	gimple_call_set_lhs (call, id);
+	gimple_seq_add_stmt (seq, call);
+	if (res)
+	  res = fold_build2 (PLUS_EXPR, unsigned_type_node, res, id);
+	else
+	  res = id;
+      }
 
   if (res == NULL_TREE)
-    res = u0;
-
+    res = build_int_cst (unsigned_type_node, 0);
+			 
   return res;
 }
 
@@ -7278,10 +7201,10 @@  expand_omp_for_generic (struct omp_regio
 static bool
 oacc_loop_needs_threadbarrier_p (int gwv_bits)
 {
-  return (gwv_bits & (MASK_GANG | MASK_WORKER)) == MASK_WORKER;
+  return !(gwv_bits & OACC_LOOP_MASK (OACC_gang))
+    && (gwv_bits & OACC_LOOP_MASK (OACC_worker));
 }
 
-
 /* A subroutine of expand_omp_for.  Generate code for a parallel
    loop with static schedule and no specified chunk size.  Given
    parameters:
@@ -10416,11 +10339,11 @@  find_omp_for_region_gwv (gimple stmt)
 
   tree clauses = gimple_omp_for_clauses (stmt);
   if (find_omp_clause (clauses, OMP_CLAUSE_GANG))
-    tmp |= MASK_GANG;
+    tmp |= OACC_LOOP_MASK (OACC_gang);
   if (find_omp_clause (clauses, OMP_CLAUSE_WORKER))
-    tmp |= MASK_WORKER;
+    tmp |= OACC_LOOP_MASK (OACC_worker);
   if (find_omp_clause (clauses, OMP_CLAUSE_VECTOR))
-    tmp |= MASK_VECTOR;
+    tmp |= OACC_LOOP_MASK (OACC_vector);
 
   return tmp;
 }
@@ -10437,11 +10360,11 @@  find_omp_target_region_data (struct omp_
 
   tree clauses = gimple_omp_target_clauses (stmt);
   if (find_omp_clause (clauses, OMP_CLAUSE_NUM_GANGS))
-    region->gwv_this |= MASK_GANG;
+    region->gwv_this |= OACC_LOOP_MASK (OACC_gang);
   if (find_omp_clause (clauses, OMP_CLAUSE_NUM_WORKERS))
-    region->gwv_this |= MASK_WORKER;
+    region->gwv_this |= OACC_LOOP_MASK (OACC_worker);
   if (find_omp_clause (clauses, OMP_CLAUSE_VECTOR_LENGTH))
-    region->gwv_this |= MASK_VECTOR;
+    region->gwv_this |= OACC_LOOP_MASK (OACC_vector);
   region->broadcast_array = gimple_omp_target_broadcast_array (stmt);
 }
 
@@ -10621,14 +10544,14 @@  required_predication_mask (omp_region *r
     return 0;
 
   int mask = 0;
-  if ((outer_target->gwv_this & MASK_WORKER) != 0
+  if ((outer_target->gwv_this & OACC_LOOP_MASK (OACC_worker)) != 0
       && (region->type == GIMPLE_OMP_TARGET
-	  || (outer_masks & MASK_WORKER) == 0))
-    mask |= MASK_WORKER;
-  if ((outer_target->gwv_this & MASK_VECTOR) != 0
+	  || (outer_masks & OACC_LOOP_MASK (OACC_worker)) == 0))
+    mask |= OACC_LOOP_MASK (OACC_worker);
+  if ((outer_target->gwv_this & OACC_LOOP_MASK (OACC_vector)) != 0
       && (region->type == GIMPLE_OMP_TARGET
-	  || (outer_masks & MASK_VECTOR) == 0))
-    mask |= MASK_VECTOR;
+	  || (outer_masks & OACC_LOOP_MASK (OACC_vector)) == 0))
+    mask |= OACC_LOOP_MASK (OACC_vector);
   return mask;
 }
 
@@ -10698,7 +10621,7 @@  generate_vector_broadcast (tree dest_var
 
 /* Generate a broadcast across OpenACC threads in REGION so that VAR
    is broadcast to DEST_VAR.  MASK specifies the parallelism level and
-   thereby the broadcast method.  If it is equal to MASK_VECTOR, we
+   thereby the broadcast method.  If it is only vector, we
    can use a warp broadcast, otherwise we fall back to memory
    store/load.  */
 
@@ -10706,7 +10629,7 @@  static gimple
 generate_oacc_broadcast (omp_region *region, tree dest_var, tree var,
 			 gimple_stmt_iterator &where, int mask)
 {
-  if (mask == MASK_VECTOR)
+  if (mask == OACC_LOOP_MASK (OACC_vector))
     return generate_vector_broadcast (dest_var, var, where);
 
   omp_region *parent = enclosing_target_region (region);
@@ -10735,7 +10658,7 @@  generate_oacc_broadcast (omp_region *reg
 /* Build a test for OpenACC predication.  TRUE_EDGE is the edge that should be
    taken if the block should be executed.  SKIP_DEST_BB is the destination to
    jump to otherwise.  MASK specifies the type of predication, it can contain
-   the bits MASK_VECTOR and/or MASK_WORKER.  */
+   the bits for VECTOR and/or WORKER.  */
 
 static void
 make_predication_test (edge true_edge, basic_block skip_dest_bb, int mask)
@@ -10743,32 +10666,31 @@  make_predication_test (edge true_edge, b
   basic_block cond_bb = true_edge->src;
   
   gimple_stmt_iterator tmp_gsi = gsi_last_bb (cond_bb);
-  tree decl = builtin_decl_explicit (BUILT_IN_GOACC_TID);
-
-  tree vvar = NULL_TREE, wvar = NULL_TREE;
+  tree decl = builtin_decl_explicit (BUILT_IN_GOACC_ID);
   tree comp_var = NULL_TREE;
-  if (mask & MASK_VECTOR)
-    {
-      gimple call = gimple_build_call (decl, 1, integer_zero_node);
-      vvar = create_tmp_var (unsigned_type_node);
-      comp_var = vvar;
-      gimple_call_set_lhs (call, vvar);
-      gsi_insert_after (&tmp_gsi, call, GSI_NEW_STMT);
-    }
-  if (mask & MASK_WORKER)
-    {
-      gimple call = gimple_build_call (decl, 1, integer_one_node);
-      wvar = create_tmp_var (unsigned_type_node);
-      comp_var = wvar;
-      gimple_call_set_lhs (call, wvar);
-      gsi_insert_after (&tmp_gsi, call, GSI_NEW_STMT);
-    }
-  if (wvar && vvar)
-    {
-      comp_var = create_tmp_var (unsigned_type_node);
-      gassign *ior = gimple_build_assign (comp_var, BIT_IOR_EXPR, wvar, vvar);
-      gsi_insert_after (&tmp_gsi, ior, GSI_NEW_STMT);
-    }
+  unsigned ix;
+
+  for (ix = OACC_worker; ix <= OACC_vector; ix++)
+    if (mask & (1 << ix))
+      {
+	gimple call = gimple_build_call
+	  (decl, 1, build_int_cst (unsigned_type_node, ix));
+	tree var = create_tmp_var (unsigned_type_node);
+
+	gimple_call_set_lhs (call, var);
+	gsi_insert_after (&tmp_gsi, call, GSI_NEW_STMT);
+	if (comp_var)
+	  {
+	    tree new_comp = create_tmp_var (unsigned_type_node);
+	    gassign *ior = gimple_build_assign (new_comp,
+						BIT_IOR_EXPR, comp_var, var);
+	    gsi_insert_after (&tmp_gsi, ior, GSI_NEW_STMT);
+	    comp_var = new_comp;
+	  }
+	else
+	  comp_var = var;
+      }
+
   tree cond = build2 (EQ_EXPR, boolean_type_node, comp_var,
 		      fold_convert (unsigned_type_node, integer_zero_node));
   gimple cond_stmt = gimple_build_cond_empty (cond);
@@ -10789,7 +10711,7 @@  make_predication_test (edge true_edge, b
 
 /* Apply OpenACC predication to basic block BB which is in
    region PARENT.  MASK has a bitmask of levels that need to be
-   applied; MASK_VECTOR and/or MASK_WORKER may be set.  */
+   applied; VECTOR and/or WORKER may be set.  */
 
 static void
 predicate_bb (basic_block bb, struct omp_region *parent, int mask)
@@ -10798,8 +10720,8 @@  predicate_bb (basic_block bb, struct omp
      around them if not in the controlling worker.  Don't insert
      unnecessary (and incorrect) predication.  */
   if (parent->type == GIMPLE_OMP_FOR
-      && (parent->gwv_this & MASK_VECTOR))
-    mask &= ~MASK_WORKER;
+      && (parent->gwv_this & OACC_LOOP_MASK (OACC_vector)))
+    mask &= ~OACC_LOOP_MASK (OACC_worker);
 
   if (mask == 0 || parent->type == GIMPLE_OMP_ATOMIC_LOAD)
     return;
@@ -10873,15 +10795,16 @@  predicate_bb (basic_block bb, struct omp
 	  skip_dest_bb = single_succ (inner->exit);
 	  gcc_assert (inner->entry == bb);
 	  if (code != GIMPLE_OMP_FOR
-	      || ((inner->gwv_this & (MASK_VECTOR | MASK_WORKER)) == MASK_VECTOR
-		  && (mask & MASK_WORKER) != 0))
+	      || ((inner->gwv_this & OACC_LOOP_MASK (OACC_vector))
+		  && !(inner->gwv_this & OACC_LOOP_MASK (OACC_worker))
+		  && (mask & OACC_LOOP_MASK  (OACC_worker))))
 	    {
 	      gimple_stmt_iterator head_gsi = gsi_start_bb (bb);
 	      gsi_prev (&head_gsi);
 	      edge e0 = split_block (bb, gsi_stmt (head_gsi));
 	      int mask2 = mask;
 	      if (code == GIMPLE_OMP_FOR)
-		mask2 &= ~MASK_VECTOR;
+		mask2 &= ~OACC_LOOP_MASK (OACC_vector);
 	      if (!split_stmt || code != GIMPLE_OMP_FOR)
 		{
 		  /* The simple case: nothing here except the for,
@@ -11199,7 +11122,7 @@  oacc_broadcast (basic_block entry_bb, ba
 	use.erase (it);
     }
 
-  if (mask == MASK_VECTOR)
+  if (mask == OACC_LOOP_MASK (OACC_vector))
     {
       /* Broadcast all decls in USE right before the last instruction in
 	 entry_bb.  */
@@ -11213,7 +11136,7 @@  oacc_broadcast (basic_block entry_bb, ba
 
       gsi_insert_seq_before (&gsi, seq, GSI_CONTINUE_LINKING);
     }
-  else if (mask & MASK_WORKER)
+  else if (mask & OACC_LOOP_MASK (OACC_worker))
     {
       if (use.empty ())
 	return entry_bb;
@@ -13104,25 +13027,31 @@  lower_omp_taskreg (gimple_stmt_iterator
 static void
 oacc_init_count_vars (omp_context *ctx, tree clauses ATTRIBUTE_UNUSED)
 {
-  tree gettid = builtin_decl_explicit (BUILT_IN_GOACC_TID);
-  tree getntid = builtin_decl_explicit (BUILT_IN_GOACC_NTID);
+  tree getid = builtin_decl_explicit (BUILT_IN_GOACC_ID);
+  tree getnid = builtin_decl_explicit (BUILT_IN_GOACC_NID);
   tree worker_var, worker_count;
-  tree u1 = fold_convert (unsigned_type_node, integer_one_node);
-  tree u0 = fold_convert (unsigned_type_node, integer_zero_node);
-  if (ctx->gwv_this & MASK_WORKER)
+  
+  if (ctx->gwv_this & OACC_LOOP_MASK (OACC_worker))
     {
+      tree arg = build_int_cst (unsigned_type_node, OACC_worker);
+      
       worker_var = create_tmp_var (unsigned_type_node, ".worker");
       worker_count = create_tmp_var (unsigned_type_node, ".workercount");
-      gimple call1 = gimple_build_call (gettid, 1, u1);
+      
+      gimple call1 = gimple_build_call (getid, 1, arg);
       gimple_call_set_lhs (call1, worker_var);
       gimple_seq_add_stmt (&ctx->ganglocal_init, call1);
-      gimple call2 = gimple_build_call (getntid, 1, u1);
+
+      gimple call2 = gimple_build_call (getnid, 1, arg);
       gimple_call_set_lhs (call2, worker_count);
       gimple_seq_add_stmt (&ctx->ganglocal_init, call2);
     }
   else
-    worker_var = u0, worker_count = u1;
-
+    {
+      worker_var = build_int_cst (unsigned_type_node, 0);
+      worker_count = build_int_cst (unsigned_type_node, 1);
+    }
+  
   ctx->worker_var = worker_var;
   ctx->worker_count = worker_count;
 }
Index: gcc/omp-low.h
===================================================================
--- gcc/omp-low.h	(revision 224671)
+++ gcc/omp-low.h	(working copy)
@@ -20,6 +20,14 @@  along with GCC; see the file COPYING3.
 #ifndef GCC_OMP_LOW_H
 #define GCC_OMP_LOW_H
 
+enum oacc_loop_levels
+  {
+    OACC_gang,
+    OACC_worker,
+    OACC_vector,
+    OACC_HWM
+  };
+
 struct omp_region;
 
 extern tree find_omp_clause (tree, enum omp_clause_code);
Index: gcc/builtins.c
===================================================================
--- gcc/builtins.c	(revision 224671)
+++ gcc/builtins.c	(working copy)
@@ -85,7 +85,7 @@  along with GCC; see the file COPYING3.
 #include "tree-chkp.h"
 #include "rtl-chkp.h"
 #include "gomp-constants.h"
-
+#include "omp-low.h"
 
 static tree do_mpc_arg1 (tree, tree, int (*)(mpc_ptr, mpc_srcptr, mpc_rnd_t));
 
@@ -5962,44 +5962,42 @@  expand_oacc_threadbarrier (void)
 
 
 /* Expand a thread-id/thread-count builtin for OpenACC.  */
+
 static rtx
-expand_oacc_builtin (enum built_in_function fcode, tree exp, rtx target)
+expand_oacc_id (enum built_in_function fcode, tree exp, rtx target)
 {
   tree arg0 = CALL_EXPR_ARG (exp, 0);
   rtx result = const0_rtx;
   rtx arg;
 
-  gcc_assert (TREE_CODE (arg0) == INTEGER_CST);
   arg = expand_normal (arg0);
+  if (GET_CODE (arg) != CONST_INT
+      || (unsigned HOST_WIDE_INT)INTVAL (arg) >= OACC_HWM)
+    {
+      error ("argument to %D must be constant in range 0 to %d",
+	     get_callee_fndecl (exp), OACC_HWM - 1);
+      return result;
+    }
 
   enum insn_code icode = CODE_FOR_nothing;
   switch (fcode)
     {
-    case BUILT_IN_GOACC_NTID:
-#ifdef HAVE_oacc_ntid
-      icode = CODE_FOR_oacc_ntid;
-#endif
-      result = const1_rtx;
-      break;
-    case BUILT_IN_GOACC_TID:
-#ifdef HAVE_oacc_tid
-      icode = CODE_FOR_oacc_tid;
-#endif
-      break;
-    case BUILT_IN_GOACC_NCTAID:
-#ifdef HAVE_oacc_nctaid
-      icode = CODE_FOR_oacc_nctaid;
+    case BUILT_IN_GOACC_NID:
+#ifdef HAVE_oacc_nid
+      icode = CODE_FOR_oacc_nid;
 #endif
       result = const1_rtx;
       break;
-    case BUILT_IN_GOACC_CTAID:
-#ifdef HAVE_oacc_ctaid
-      icode = CODE_FOR_oacc_ctaid;
+    case BUILT_IN_GOACC_ID:
+#ifdef HAVE_oacc_id
+      icode = CODE_FOR_oacc_id;
 #endif
       break;
     default:
+      gcc_unreachable ();
       break;
     }
+
   if (icode != CODE_FOR_nothing)
     {
       machine_mode mode = insn_data[icode].operand[0].mode;
@@ -7218,11 +7216,9 @@  expand_builtin (tree exp, rtx target, rt
 	return target;
       break;
 
-    case BUILT_IN_GOACC_NTID:
-    case BUILT_IN_GOACC_TID:
-    case BUILT_IN_GOACC_NCTAID:
-    case BUILT_IN_GOACC_CTAID:
-      return expand_oacc_builtin (fcode, exp, target);
+    case BUILT_IN_GOACC_ID:
+    case BUILT_IN_GOACC_NID:
+      return expand_oacc_id (fcode, exp, target);
 
     case BUILT_IN_GOACC_GET_GANGLOCAL_PTR:
       target = expand_oacc_ganglocal_ptr (target);
@@ -12590,9 +12586,8 @@  is_simple_builtin (tree decl)
       case BUILT_IN_EH_FILTER:
       case BUILT_IN_EH_POINTER:
       case BUILT_IN_EH_COPY_VALUES:
-	/* Just a special register access.  */
-      case BUILT_IN_GOACC_NTID:
-      case BUILT_IN_GOACC_TID:
+	/* Just a special register read.  */
+      case BUILT_IN_GOACC_NID:
 	return true;
 
       default: