diff mbox

[7/8] Add pass_parloops_oacc_kernels to pass_oacc_kernels

Message ID 54746B24.3030409@mentor.com
State New
Headers show

Commit Message

Tom de Vries Nov. 25, 2014, 11:42 a.m. UTC
On 15-11-14 18:23, Tom de Vries wrote:
> On 15-11-14 13:14, Tom de Vries wrote:
>> Hi,
>>
>> I'm submitting a patch series with initial support for the oacc kernels
>> directive.
>>
>> The patch series uses pass_parallelize_loops to implement parallelization of
>> loops in the oacc kernels region.
>>
>> The patch series consists of these 8 patches:
>> ...
>>      1  Expand oacc kernels after pass_build_ealias
>>      2  Add pass_oacc_kernels
>>      3  Add pass_ch_oacc_kernels to pass_oacc_kernels
>>      4  Add pass_tree_loop_{init,done} to pass_oacc_kernels
>>      5  Add pass_loop_im to pass_oacc_kernels
>>      6  Add pass_ccp to pass_oacc_kernels
>>      7  Add pass_parloops_oacc_kernels to pass_oacc_kernels
>>      8  Do simple omp lowering for no address taken var
>> ...
>
> This patch adds:
> - a specialized version of pass_parallelize_loops called
>      pass_parloops_oacc_kernels to pass group pass_oacc_kernels, and
> - relevant test-cases.
>
> The pass only handles loops that are in a kernels region, and skips over bits of
> pass_parallelize_loops that are already done for oacc kernels.
>
> The pass reintroduces the use of omp_expand_local, I haven't managed to make it
> work yet using the external pass pass_expand_omp_ssa.
>
> An obvious limitation of the patch is the fact that we copy over the clauses
> from the kernels directive to the generated parallel directive. We'll need to do
> something more intelligent here, f.i. setting vector_length based on the
> parallelization factor.
>
> Another limitation is that the pass still needs -ftree-parallelize-loops to
> trigger.
>

Updated for using pass_copyprop instead of pass_ccp in pass_oacc_kernels.

Bootstrapped and reg-tested as before.

OK for trunk?

Thanks,
- Tom
diff mbox

Patch

[PATCH 7/7] Add pass_parloops_oacc_kernels to pass_oacc_kernels

2014-11-25  Tom de Vries  <tom@codesourcery.com>

	* passes.def: Add pass_parallelize_loops_oacc_kernels in pass group
	pass_oacc_kernels.  Move pass_expand_omp_ssa into pass group
	pass_oacc_kernels.
	* tree-parloops.c (create_parallel_loop): Add function parameters
	region_entry and bool oacc_kernels_p.  Handle oacc_kernels_p.
	(gen_parallel_loop): Same.  Use omp_expand_local if oacc_kernels_p.
	Call create_parallel_loop with additional args.
	(parallelize_loops): Add function parameter oacc_kernels_p.  Calculate
	dominance info.  Skip loops that are not in a kernels region. Call
	gen_parallel_loop with additional args.
	(pass_parallelize_loops::execute): Call parallelize_loops with false
	argument.
	(pass_data_parallelize_loops_oacc_kernels): New pass_data.
	(class pass_parallelize_loops_oacc_kernels): New pass.
	(pass_parallelize_loops_oacc_kernels::execute)
	(make_pass_parallelize_loops_oacc_kernels): New function.
	* tree-pass.h (make_pass_parallelize_loops_oacc_kernels): Declare.

	* testsuite/libgomp.oacc-c/oacc-kernels-2-run.c: New test.
	* testsuite/libgomp.oacc-c/oacc-kernels-run.c: New test.

	* gcc.dg/oacc-kernels-2.c: New test.
	* gcc.dg/oacc-kernels.c: New test.
---
 gcc/passes.def                                     |   1 +
 gcc/testsuite/gcc.dg/oacc-kernels-2.c              |  79 +++++++
 gcc/testsuite/gcc.dg/oacc-kernels.c                |  71 ++++++
 gcc/tree-parloops.c                                | 242 ++++++++++++++++-----
 gcc/tree-pass.h                                    |   2 +
 .../testsuite/libgomp.oacc-c/oacc-kernels-2-run.c  |  65 ++++++
 .../testsuite/libgomp.oacc-c/oacc-kernels-run.c    |  59 +++++
 7 files changed, 464 insertions(+), 55 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/oacc-kernels-2.c
 create mode 100644 gcc/testsuite/gcc.dg/oacc-kernels.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c/oacc-kernels-2-run.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c/oacc-kernels-run.c

diff --git a/gcc/passes.def b/gcc/passes.def
index fb0d331..d91283b 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -94,6 +94,7 @@  along with GCC; see the file COPYING3.  If not see
 	      NEXT_PASS (pass_tree_loop_init);
 	      NEXT_PASS (pass_lim);
 	      NEXT_PASS (pass_copy_prop);
+      	      NEXT_PASS (pass_parallelize_loops_oacc_kernels);
 	      NEXT_PASS (pass_expand_omp_ssa);
 	      NEXT_PASS (pass_tree_loop_done);
 	  POP_INSERT_PASSES ()
diff --git a/gcc/testsuite/gcc.dg/oacc-kernels-2.c b/gcc/testsuite/gcc.dg/oacc-kernels-2.c
new file mode 100644
index 0000000..1ff4bad
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/oacc-kernels-2.c
@@ -0,0 +1,79 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target fopenacc } */
+/* { dg-options "-fopenacc -ftree-parallelize-loops=32 -O2 -std=c99 -fdump-tree-parloops_oacc_kernels-all -fdump-tree-copyrename" } */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N (1024 * 512)
+#define N_REF 4293394432
+
+#if 1
+#define COUNTERTYPE unsigned int
+#else
+#define COUNTERTYPE int
+#endif
+
+int
+main (void)
+{
+  unsigned int i;
+
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = malloc (N * sizeof (unsigned int));
+  b = malloc (N * sizeof (unsigned int));
+  c = malloc (N * sizeof (unsigned int));
+
+
+#pragma acc kernels copyout (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+#pragma acc kernels copyout (b[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+  {
+    unsigned int sum = 0;
+
+    for (COUNTERTYPE i = 0; i < N; i++)
+      sum += c[i];
+
+    printf ("sum: %u\n", sum);
+
+    if (sum != N_REF)
+      abort ();
+  }
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function.  It pops up first in
+   all_passes/pass_all_optimizations/pass_rename_ssa_copies.  */
+/* { dg-final { scan-tree-dump-times "Function main._omp_fn.0 " 1 "copyrename2" } } */
+/* { dg-final { scan-tree-dump-times "Function main._omp_fn.1 " 1 "copyrename2" } } */
+/* { dg-final { scan-tree-dump-times "Function main._omp_fn.2 " 1 "copyrename2" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "copyrename*" } } */
diff --git a/gcc/testsuite/gcc.dg/oacc-kernels.c b/gcc/testsuite/gcc.dg/oacc-kernels.c
new file mode 100644
index 0000000..de94aa9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/oacc-kernels.c
@@ -0,0 +1,71 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target fopenacc } */
+/* { dg-options "-fopenacc -ftree-parallelize-loops=32 -O2 -std=c99 -fdump-tree-parloops_oacc_kernels-all -fdump-tree-copyrename" } */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N (1024 * 512)
+#define N_REF 4293394432
+
+#if 1
+#define COUNTERTYPE unsigned int
+#else
+#define COUNTERTYPE int
+#endif
+
+int
+main (void)
+{
+  unsigned int i;
+
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = malloc (N * sizeof (unsigned int));
+  b = malloc (N * sizeof (unsigned int));
+  c = malloc (N * sizeof (unsigned int));
+
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    a[i] = i * 2;
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+  {
+    unsigned int sum = 0;
+
+    for (COUNTERTYPE i = 0; i < N; i++)
+      sum += c[i];
+
+    printf ("sum: %u\n", sum);
+
+    if (sum != N_REF)
+      abort ();
+  }
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function.  It pops up first in
+   all_passes/pass_all_optimizations/pass_rename_ssa_copies.  */
+/* { dg-final { scan-tree-dump-times "Function main._omp_fn.0 " 1 "copyrename2" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "copyrename*" } } */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index e5dca78..7bc945b 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -1611,7 +1611,8 @@  transform_to_exit_first_loop (struct loop *loop,
 
 static basic_block
 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
-		      tree new_data, unsigned n_threads, location_t loc)
+		      tree new_data, unsigned n_threads, location_t loc,
+		      basic_block region_entry, bool oacc_kernels_p)
 {
   gimple_stmt_iterator gsi;
   basic_block bb, paral_bb, for_bb, ex_bb;
@@ -1623,15 +1624,44 @@  create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
   /* Prepare the GIMPLE_OMP_PARALLEL statement.  */
   bb = loop_preheader_edge (loop)->src;
   paral_bb = single_pred (bb);
-  gsi = gsi_last_bb (paral_bb);
+  if (!oacc_kernels_p)
+    gsi = gsi_last_bb (paral_bb);
+  else
+    /* Make sure the oacc parallel is inserted on top of the oacc kernels
+       region.  */
+    gsi = gsi_last_bb (region_entry);
 
-  t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
-  OMP_CLAUSE_NUM_THREADS_EXPR (t)
-    = build_int_cst (integer_type_node, n_threads);
-  stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
-  gimple_set_location (stmt, loc);
+  if (!oacc_kernels_p)
+    {
+      t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
+      OMP_CLAUSE_NUM_THREADS_EXPR (t)
+	= build_int_cst (integer_type_node, n_threads);
+      stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+      gimple_set_location (stmt, loc);
 
-  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+      gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+    }
+  else
+    {
+      /* Create oacc parallel pragma based on oacc kernels pragma.  */
+      gimple kernels = last_stmt (region_entry);
+      stmt = gimple_build_oacc_parallel (NULL,
+					 gimple_oacc_kernels_clauses (kernels));
+      tree child_fn = gimple_oacc_kernels_child_fn (kernels);
+      gimple_oacc_parallel_set_child_fn (stmt, child_fn);
+      tree data_arg = gimple_oacc_kernels_data_arg (kernels);
+      gimple_oacc_parallel_set_data_arg (stmt, data_arg);
+
+      gimple_set_location (stmt, loc);
+
+      /* Insert oacc parallel pragma after the oacc kernels pragma.  */
+      {
+	gimple_stmt_iterator gsi2;
+	gsi2 = gsi;
+	gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+	gsi_remove (&gsi2, true);
+      }
+    }
 
   /* Initialize NEW_DATA.  */
   if (data)
@@ -1647,12 +1677,18 @@  create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
     }
 
-  /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
-  bb = split_loop_exit_edge (single_dom_exit (loop));
-  gsi = gsi_last_bb (bb);
-  stmt = gimple_build_omp_return (false);
-  gimple_set_location (stmt, loc);
-  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+  /* Skip insertion of OMP_RETURN for oacc_kernels_p.  We've already generated
+     one when lowering the oacc kernels directive in
+     pass_lower_omp/lower_omp (). */
+  if (!oacc_kernels_p)
+    {
+      /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
+      bb = split_loop_exit_edge (single_dom_exit (loop));
+      gsi = gsi_last_bb (bb);
+      stmt = gimple_build_omp_return (false);
+      gimple_set_location (stmt, loc);
+      gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+    }
 
   /* Extract data for GIMPLE_OMP_FOR.  */
   gcc_assert (loop->header == single_dom_exit (loop)->src);
@@ -1705,7 +1741,11 @@  create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
   t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
   OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
 
-  for_stmt = gimple_build_omp_for (NULL, GF_OMP_FOR_KIND_FOR, t, 1, NULL);
+  for_stmt = gimple_build_omp_for (NULL,
+				   (oacc_kernels_p
+				    ? GF_OMP_FOR_KIND_OACC_LOOP
+				    : GF_OMP_FOR_KIND_FOR),
+				   NULL_TREE, 1, NULL);
   gimple_set_location (for_stmt, loc);
   gimple_omp_for_set_index (for_stmt, 0, initvar);
   gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
@@ -1736,7 +1776,7 @@  create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
   free_dominance_info (CDI_DOMINATORS);
   calculate_dominance_info (CDI_DOMINATORS);
 
-  return paral_bb;
+  return oacc_kernels_p ? region_entry : paral_bb;
 }
 
 /* Generates code to execute the iterations of LOOP in N_THREADS
@@ -1748,11 +1788,13 @@  create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
 static void
 gen_parallel_loop (struct loop *loop,
 		   reduction_info_table_type *reduction_list,
-		   unsigned n_threads, struct tree_niter_desc *niter)
+		   unsigned n_threads, struct tree_niter_desc *niter,
+		   basic_block region_entry, bool oacc_kernels_p)
 {
   tree many_iterations_cond, type, nit;
   tree arg_struct, new_arg_struct;
   gimple_seq stmts;
+  basic_block parallel_head;
   edge entry, exit;
   struct clsn_data clsn_data;
   unsigned prob;
@@ -1829,40 +1871,43 @@  gen_parallel_loop (struct loop *loop,
   if (stmts)
     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
 
-  if (loop->inner)
-    m_p_thread=2;
-  else
-    m_p_thread=MIN_PER_THREAD;
-
-   many_iterations_cond =
-     fold_build2 (GE_EXPR, boolean_type_node,
-                nit, build_int_cst (type, m_p_thread * n_threads));
-
-  many_iterations_cond
-    = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
-		   invert_truthvalue (unshare_expr (niter->may_be_zero)),
-		   many_iterations_cond);
-  many_iterations_cond
-    = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
-  if (stmts)
-    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
-  if (!is_gimple_condexpr (many_iterations_cond))
+  if (!oacc_kernels_p)
     {
+      if (loop->inner)
+	m_p_thread=2;
+      else
+	m_p_thread=MIN_PER_THREAD;
+
+      many_iterations_cond =
+	fold_build2 (GE_EXPR, boolean_type_node,
+		     nit, build_int_cst (type, m_p_thread * n_threads));
+
+      many_iterations_cond
+	= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
+		       invert_truthvalue (unshare_expr (niter->may_be_zero)),
+		       many_iterations_cond);
       many_iterations_cond
-	= force_gimple_operand (many_iterations_cond, &stmts,
-				true, NULL_TREE);
+	= force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
       if (stmts)
 	gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
-    }
+      if (!is_gimple_condexpr (many_iterations_cond))
+	{
+	  many_iterations_cond
+	    = force_gimple_operand (many_iterations_cond, &stmts,
+				    true, NULL_TREE);
+	  if (stmts)
+	    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+	}
 
-  initialize_original_copy_tables ();
+      initialize_original_copy_tables ();
 
-  /* We assume that the loop usually iterates a lot.  */
-  prob = 4 * REG_BR_PROB_BASE / 5;
-  loop_version (loop, many_iterations_cond, NULL,
-		prob, prob, REG_BR_PROB_BASE - prob, true);
-  update_ssa (TODO_update_ssa);
-  free_original_copy_tables ();
+      /* We assume that the loop usually iterates a lot.  */
+      prob = 4 * REG_BR_PROB_BASE / 5;
+      loop_version (loop, many_iterations_cond, NULL,
+		    prob, prob, REG_BR_PROB_BASE - prob, true);
+      update_ssa (TODO_update_ssa);
+      free_original_copy_tables ();
+    }
 
   /* Base all the induction variables in LOOP on a single control one.  */
   canonicalize_loop_ivs (loop, &nit, true);
@@ -1879,19 +1924,31 @@  gen_parallel_loop (struct loop *loop,
   entry = loop_preheader_edge (loop);
   exit = single_dom_exit (loop);
 
-  eliminate_local_variables (entry, exit);
-  /* In the old loop, move all variables non-local to the loop to a structure
-     and back, and create separate decls for the variables used in loop.  */
-  separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
-			    &new_arg_struct, &clsn_data);
+  /* This rewrites the body in terms of new variables.  This has already
+     been done for oacc_kernels_p in pass_lower_omp/lower_omp ().  */
+  if (!oacc_kernels_p)
+    {
+      eliminate_local_variables (entry, exit);
+      /* In the old loop, move all variables non-local to the loop to a
+	 structure and back, and create separate decls for the variables used in
+	 loop.  */
+      separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
+				&new_arg_struct, &clsn_data);
+    }
+  else
+    {
+      arg_struct = NULL_TREE;
+      new_arg_struct = NULL_TREE;
+    }
 
   /* Create the parallel constructs.  */
   loc = UNKNOWN_LOCATION;
   cond_stmt = last_stmt (loop->header);
   if (cond_stmt)
     loc = gimple_location (cond_stmt);
-  create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
-			new_arg_struct, n_threads, loc);
+  parallel_head = create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
+					new_arg_struct, n_threads, loc,
+					region_entry, oacc_kernels_p);
   if (reduction_list->elements () > 0)
     create_call_for_reduction (loop, reduction_list, &clsn_data);
 
@@ -1905,6 +1962,16 @@  gen_parallel_loop (struct loop *loop,
      removed statements.  */
   FOR_EACH_LOOP (loop, 0)
     free_numbers_of_iterations_estimates_loop (loop);
+
+  if (oacc_kernels_p)
+    {
+      /* Expand the parallel constructs.  We do it directly here instead of
+	 running a separate expand_omp pass, since it is more efficient, and
+	 less likely to cause troubles with further analyses not being able to
+	 deal with the OMP trees.  */
+
+      omp_expand_local (parallel_head);
+    }
 }
 
 /* Returns true when LOOP contains vector phi nodes.  */
@@ -2131,7 +2198,7 @@  try_create_reduction_list (loop_p loop,
    otherwise.  */
 
 bool
-parallelize_loops (void)
+parallelize_loops (bool oacc_kernels_p)
 {
   unsigned n_threads = flag_tree_parallelize_loops;
   bool changed = false;
@@ -2140,6 +2207,7 @@  parallelize_loops (void)
   struct obstack parloop_obstack;
   HOST_WIDE_INT estimated;
   source_location loop_loc;
+  basic_block region_entry, region_exit;
 
   /* Do not parallelize loops in the functions created by parallelization.  */
   if (parallelized_function_p (cfun->decl))
@@ -2151,9 +2219,25 @@  parallelize_loops (void)
   reduction_info_table_type reduction_list (10);
   init_stmt_vec_info_vec ();
 
+  calculate_dominance_info (CDI_DOMINATORS);
+
   FOR_EACH_LOOP (loop, 0)
     {
       reduction_list.empty ();
+
+      if (oacc_kernels_p)
+	{
+	  if (!loop_in_oacc_kernels_region_p (loop, &region_entry, &region_exit))
+	    continue;
+	  else
+	    {
+	      if (dump_file && (dump_flags & TDF_DETAILS))
+		fprintf (dump_file,
+			 "Trying loop %d with header bb %d in oacc kernels region\n",
+			 loop->num, loop->header->index);
+	    }
+	}
+
       if (dump_file && (dump_flags & TDF_DETAILS))
       {
         fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
@@ -2223,8 +2307,9 @@  parallelize_loops (void)
 	  fprintf (dump_file, "\nloop at %s:%d: ",
 		   LOCATION_FILE (loop_loc), LOCATION_LINE (loop_loc));
       }
+
       gen_parallel_loop (loop, &reduction_list,
-			 n_threads, &niter_desc);
+			 n_threads, &niter_desc, region_entry, oacc_kernels_p);
     }
 
   free_stmt_vec_info_vec ();
@@ -2275,7 +2360,7 @@  pass_parallelize_loops::execute (function *fun)
   if (number_of_loops (fun) <= 1)
     return 0;
 
-  if (parallelize_loops ())
+  if (parallelize_loops (false))
     {
       fun->curr_properties &= ~(PROP_gimple_eomp);
       return TODO_update_ssa;
@@ -2293,4 +2378,51 @@  make_pass_parallelize_loops (gcc::context *ctxt)
 }
 
 
+namespace {
+
+const pass_data pass_data_parallelize_loops_oacc_kernels =
+{
+  GIMPLE_PASS, /* type */
+  "parloops_oacc_kernels", /* name */
+  OPTGROUP_LOOP, /* optinfo_flags */
+  TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
+  ( PROP_cfg | PROP_ssa ), /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_parallelize_loops_oacc_kernels : public gimple_opt_pass
+{
+public:
+  pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_parallelize_loops_oacc_kernels, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *) { return flag_tree_parallelize_loops > 1; }
+  virtual unsigned int execute (function *);
+
+}; // class pass_parallelize_loops_oacc_kernels
+
+unsigned
+pass_parallelize_loops_oacc_kernels::execute (function *fun)
+{
+  if (number_of_loops (fun) <= 1)
+    return 0;
+
+  if (parallelize_loops (true))
+    return TODO_cleanup_cfg | TODO_rebuild_alias;
+  return 0;
+}
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
+{
+  return new pass_parallelize_loops_oacc_kernels (ctxt);
+}
+
 #include "gt-tree-parloops.h"
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index dd1f308..a5c7713 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -374,6 +374,8 @@  extern gimple_opt_pass *make_pass_slp_vectorize (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_complete_unroll (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt);
+extern gimple_opt_pass *
+  make_pass_parallelize_loops_oacc_kernels (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
diff --git a/libgomp/testsuite/libgomp.oacc-c/oacc-kernels-2-run.c b/libgomp/testsuite/libgomp.oacc-c/oacc-kernels-2-run.c
new file mode 100644
index 0000000..5cdae0b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/oacc-kernels-2-run.c
@@ -0,0 +1,65 @@ 
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2 -std=c99" } */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N (1024 * 512)
+#define N_REF 4293394432
+
+#if 1
+#define COUNTERTYPE unsigned int
+#else
+#define COUNTERTYPE int
+#endif
+
+int
+main (void)
+{
+  unsigned int i;
+
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = malloc (N * sizeof (unsigned int));
+  b = malloc (N * sizeof (unsigned int));
+  c = malloc (N * sizeof (unsigned int));
+
+
+#pragma acc kernels copyout (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+#pragma acc kernels copyout (b[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+  {
+    unsigned int sum = 0;
+
+    for (COUNTERTYPE i = 0; i < N; i++)
+      sum += c[i];
+
+    printf ("sum: %u\n", sum);
+
+    if (sum != N_REF)
+      abort ();
+  }
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c/oacc-kernels-run.c b/libgomp/testsuite/libgomp.oacc-c/oacc-kernels-run.c
new file mode 100644
index 0000000..b9e62a0
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/oacc-kernels-run.c
@@ -0,0 +1,59 @@ 
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2 -std=c99" } */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N (1024 * 512)
+#define N_REF 4293394432
+
+#if 1
+#define COUNTERTYPE unsigned int
+#else
+#define COUNTERTYPE int
+#endif
+
+int
+main (void)
+{
+  unsigned int i;
+
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = malloc (N * sizeof (unsigned int));
+  b = malloc (N * sizeof (unsigned int));
+  c = malloc (N * sizeof (unsigned int));
+
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    a[i] = i * 2;
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+  {
+    unsigned int sum = 0;
+
+    for (COUNTERTYPE i = 0; i < N; i++)
+      sum += c[i];
+
+    printf ("sum: %u\n", sum);
+
+    if (sum != N_REF)
+      abort ();
+  }
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
-- 
1.9.1