[08/10,OpenACC] New OpenACC kernels region decompose algorithm
diff mbox series

Message ID 77512ca9-4c6c-d138-181b-8c87624c2033@codesourcery.com
State New
Headers show
Series
  • Rework handling of OpenACC kernels regions
Related show

Commit Message

Kwok Cheung Yeung July 17, 2019, 9:29 p.m. UTC
Previously, OpenACC kernels region bodies were decomposed into a sequence of 
alternating gang-single and gang-parallel "parallel" regions. The new algorithm 
in this patch introduces a third possibility: Loops that look like they might 
benefit from the parloops pass are converted into old "kernels" regions, 
exposing them to the parloops pass later on. This has the benefit that loops 
that cannot be parallelized are not offloaded to the GPU.

2019-07-16  Thomas Schwinge  <thomas@codesourcery.com>

	gcc/
	* omp-oacc-kernels.c (adjust_region_code_walk_stmt_fn)
	(adjust_region_code): New functions.
	(make_loops_gang_single): Update.
	(make_gang_single_region): Rename to...
	(make_region_seq): ... this, and update.
	(make_gang_parallel_loop_region): Rename to...
	(make_region_loop_nest): ... this, and update.
	(is_unconditional_oacc_for_loop): Remove stmt parameter and check.
	(decompose_kernels_region_body): Update.

	gcc/testsuite/
	* c-c++-common/goacc/kernels-conversion.c: Adjust test.
	* gfortran.dg/goacc/kernels-conversion.f95: Likewise.
	* c-c++-common/goacc/kernels-decompose-1.c: New file.
	* gfortran.dg/goacc/kernels-decompose-1.f95: Likewise.
	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c: New
	file.
---
  gcc/omp-oacc-kernels.c                             | 293 +++++++++++++++++----
  .../c-c++-common/goacc/kernels-conversion.c        |  19 +-
  .../c-c++-common/goacc/kernels-decompose-1.c       | 123 +++++++++
  .../gfortran.dg/goacc/kernels-conversion.f95       |  22 +-
  .../gfortran.dg/goacc/kernels-decompose-1.f95      | 132 ++++++++++
  .../kernels-decompose-1.c                          |  30 +++
  6 files changed, 553 insertions(+), 66 deletions(-)
  create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-decompose-1.c
  create mode 100644 gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1.f95
  create mode 100644 
libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c

+  assert (a == 234);
+
+  return 0;
+}

Patch
diff mbox series

diff --git a/gcc/omp-oacc-kernels.c b/gcc/omp-oacc-kernels.c
index 0fae74a..d65e6c6 100644
--- a/gcc/omp-oacc-kernels.c
+++ b/gcc/omp-oacc-kernels.c
@@ -39,6 +39,7 @@  along with GCC; see the file COPYING3.  If not see
  #include "gimple-iterator.h"
  #include "gimple-walk.h"
  #include "gomp-constants.h"
+#include "omp-general.h"

  /* This is a preprocessing pass to be run immediately before lower_omp.  It
     will convert OpenACC "kernels" regions into sequences of "parallel"
@@ -135,6 +136,95 @@  top_level_omp_for_in_stmt (gimple *stmt)
    return NULL;
  }

+/* Helper for adjust_region_code: evaluate the statement at GSI_P.  */
+
+static tree
+adjust_region_code_walk_stmt_fn (gimple_stmt_iterator *gsi_p,
+				 bool *handled_ops_p,
+				 struct walk_stmt_info *wi)
+{
+  int *region_code = (int *) wi->info;
+
+  gimple *stmt = gsi_stmt (*gsi_p);
+  switch (gimple_code (stmt))
+    {
+    case GIMPLE_OMP_FOR:
+      {
+	tree clauses = gimple_omp_for_clauses (stmt);
+	if (omp_find_clause (clauses, OMP_CLAUSE_INDEPENDENT))
+	  {
+	    /* Explicit 'independent' clause.  */
+	    /* Keep going; recurse into loop body.  */
+	    break;
+	  }
+	else if (omp_find_clause (clauses, OMP_CLAUSE_SEQ))
+	  {
+	    /* Explicit 'seq' clause.  */
+	    /* We'll "parallelize" if at some level a loop construct has been
+	       marked up by the user as unparallelizable ('seq' clause; we'll
+	       respect that in the later processing).  Given that the user has
+	       explicitly marked it up, this loop construct cannot be
+	       performance-critical (and we thus don't have to "avoid
+	       offloading"), and in this case it's also fine to "parallelize"
+	       instead of "gang-single", because any outer or inner loops may
+	       still exploit the available parallelism.  */
+	    /* Keep going; recurse into loop body.  */
+	    break;
+	  }
+	else
+	  {
+	    /* Explicit or implicit 'auto' clause.  */
+	    /* The user would like this loop analyzed ('auto' clause) and
+	       typically parallelized, but we don't have available yet the
+	       compiler logic to analyze this, so can't parallelize it here, so
+	       we'd very likely be running into a performance problem if we
+	       were to execute this unparallelized, thus forward the whole loop
+	       nest to "parloops".  */
+	    *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+	    /* Terminate: final decision for this region.  */
+	    *handled_ops_p = true;
+	    return integer_zero_node;
+	  }
+	gcc_unreachable ();
+      }
+
+    case GIMPLE_COND:
+    case GIMPLE_GOTO:
+    case GIMPLE_SWITCH:
+    case GIMPLE_ASM:
+    case GIMPLE_TRANSACTION:
+    case GIMPLE_RETURN:
+      /* Statement that might constitute some looping/control flow pattern.  */
+      /* The user would like this code analyzed (implicit inside a 'kernels'
+	 region) and typically parallelized, but we don't have available yet
+	 the compiler logic to analyze this, so can't parallelize it here, so
+	 we'd very likely be running into a performance problem if we were to
+	 execute this unparallelized, thus forward the whole thing to
+	 "parloops".  */
+      *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+      /* Terminate: final decision for this region.  */
+      *handled_ops_p = true;
+      return integer_zero_node;
+
+    default:
+      /* Keep going.  */
+      break;
+    }
+
+  return NULL;
+}
+
+/* Adjust the REGION_CODE for the region in GS.  */
+
+static void
+adjust_region_code (gimple_seq gs, int *region_code)
+{
+  struct walk_stmt_info wi;
+  memset (&wi, 0, sizeof (wi));
+  wi.info = region_code;
+  walk_gimple_seq (gs, adjust_region_code_walk_stmt_fn, NULL, &wi);
+}
+
  /* Helper function for make_loops_gang_single for walking the tree. If the
     statement indicated by GSI_P is an OpenACC for loop with a gang clause,
     issue a warning and remove the clause.  */
@@ -174,6 +264,7 @@  visit_loops_in_gang_single_region (gimple_stmt_iterator *gsi_p,
        gimple_omp_for_set_clauses (stmt, clauses);
        /* No need to recurse into nested statements; no loop nested inside
           this loop can be gang-partitioned.  */
+      sorry ("'gang' loop in \"gang-single\" region");
        *handled_ops_p = true;
        break;

@@ -184,16 +275,16 @@  visit_loops_in_gang_single_region (gimple_stmt_iterator 
*gsi_p,
    return NULL;
  }

-/* Visit all nested OpenACC loops in the statement indicated by GSI.  This
+/* Visit all nested OpenACC loops in the sequence indicated by GS.  This
     statement is expected to be inside a gang-single region.  Issue a warning
     for any loops inside it that have gang clauses and remove the clauses.  */

  static void
-make_loops_gang_single (gimple_stmt_iterator gsi)
+make_loops_gang_single (gimple_seq gs)
  {
    struct walk_stmt_info wi;
    memset (&wi, 0, sizeof (wi));
-  walk_gimple_stmt (&gsi, visit_loops_in_gang_single_region, NULL, &wi);
+  walk_gimple_seq (gs, visit_loops_in_gang_single_region, NULL, &wi);
  }

  /* Construct a "gang-single" OpenACC parallel region at LOC containing the
@@ -202,21 +293,75 @@  make_loops_gang_single (gimple_stmt_iterator gsi)
     to force gang-single execution.  */

  static gimple *
-make_gang_single_region (location_t loc, gimple_seq stmts, tree clauses)
+make_region_seq (location_t loc, gimple_seq stmts,
+		 tree num_gangs_clause,
+		 tree num_workers_clause,
+		 tree vector_length_clause,
+		 tree clauses)
  {
    /* This correctly unshares the entire clause chain rooted here.  */
    clauses = unshare_expr (clauses);
-  /* Make a num_gangs(1) clause.  */
-  tree gang_single_clause = build_omp_clause (loc, OMP_CLAUSE_NUM_GANGS);
-  OMP_CLAUSE_OPERAND (gang_single_clause, 0) = integer_one_node;
-  OMP_CLAUSE_CHAIN (gang_single_clause) = clauses;
+
+  dump_user_location_t loc_stmts_first = gimple_seq_first (stmts);
+
+  /* Figure out the region code for this region.  */
+  /* Optimistic default: assume "setup code", no looping; thus not
+     performance-critical.  */
+  int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE;
+  adjust_region_code (stmts, &region_code);
+
+  if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc_stmts_first,
+			 "beginning \"gang-single\" region in OpenACC"
+			 " 'kernels' construct\n");
+
+      /* Make a num_gangs(1) clause.  */
+      tree gang_single_clause = build_omp_clause (loc, OMP_CLAUSE_NUM_GANGS);
+      OMP_CLAUSE_OPERAND (gang_single_clause, 0) = integer_one_node;
+      OMP_CLAUSE_CHAIN (gang_single_clause) = clauses;
+      clauses = gang_single_clause;
+
+      /* Remove and issue warnings about gang clauses on any OpenACC
+	 loops nested inside this sequentially executed statement.  */
+      make_loops_gang_single (stmts);
+    }
+  else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc_stmts_first,
+			 "beginning \"parloops\" region in OpenACC"
+			 " 'kernels' construct\n");
+
+      /* As we're transforming a "GF_OMP_TARGET_KIND_OACC_KERNELS" into another
+	 "GF_OMP_TARGET_KIND_OACC_KERNELS", this isn't doing any of the clauses
+	 mangling that "make_region_loop_nest" is doing.  */
+      /* Re-assemble the clauses stripped off earlier.  */
+      if (num_gangs_clause != NULL)
+	{
+	  tree c = unshare_expr (num_gangs_clause);
+	  OMP_CLAUSE_CHAIN (c) = clauses;
+	  clauses = c;
+	}
+      if (num_workers_clause != NULL)
+	{
+	  tree c = unshare_expr (num_workers_clause);
+	  OMP_CLAUSE_CHAIN (c) = clauses;
+	  clauses = c;
+	}
+      if (vector_length_clause != NULL)
+	{
+	  tree c = unshare_expr (vector_length_clause);
+	  OMP_CLAUSE_CHAIN (c) = clauses;
+	  clauses = c;
+	}
+    }
+  else
+    gcc_unreachable ();

    /* Build the gang-single region.  */
-  gimple *single_region
-    = gimple_build_omp_target (
-        NULL,
-        GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE,
-        gang_single_clause);
+  gimple *single_region = gimple_build_omp_target (NULL, region_code, clauses);
    gimple_set_location (single_region, loc);
    gbind *single_body = gimple_build_bind (NULL, stmts, make_node (BLOCK));
    gimple_omp_set_body (single_region, single_body);
@@ -224,7 +369,7 @@  make_gang_single_region (location_t loc, gimple_seq stmts, 
tree clauses)
    return single_region;
  }

-/* Helper function for make_gang_parallel_loop_region.  Adds a num_gangs
+/* Helper function for make_region_loop_nest.  Adds a num_gangs
     (num_workers, vector_length) clause to the given CLAUSES, either the one
     from the parent region (PARENT_CLAUSE) or a new one based on the loop's
     own LOOP_CLAUSE ("gang(num: N)" or similar for workers or vectors) with
@@ -256,7 +401,7 @@  add_parent_or_loop_num_clause (tree parent_clause, tree 
loop_clause,
    return clauses;
  }

-/* Helper for make_gang_parallel_loop_region, looking for "worker(num: N)"
+/* Helper for make_region_loop_nest, looking for "worker(num: N)"
     or "vector(length: N)" clauses in nested loops.  Removes the numeric
     argument, transferring it to the enclosing parallel region (via
     WI->INFO).  If numeric arguments within the same loop nest conflict,
@@ -493,32 +638,65 @@  transform_kernels_loop_clauses (gimple *omp_for,
     adjust_nested_loop_clauses function.  */

  static gimple *
-make_gang_parallel_loop_region (gimple *omp_for, gimple *stmt,
-                                tree num_gangs_clause,
-                                tree num_workers_clause,
-                                tree vector_length_clause,
-                                tree clauses)
+make_region_loop_nest (gimple *omp_for, gimple_seq stmts,
+		       tree num_gangs_clause,
+		       tree num_workers_clause,
+		       tree vector_length_clause,
+		       tree clauses)
  {
    /* This correctly unshares the entire clause chain rooted here.  */
    clauses = unshare_expr (clauses);

-  clauses = transform_kernels_loop_clauses (omp_for,
-					    num_gangs_clause,
-					    num_workers_clause,
-					    vector_length_clause,
-					    clauses);
+  /* Figure out the region code for this region.  */
+  /* Optimistic default: assume that the loop nest is parallelizable
+     (essentially, no GIMPLE_OMP_FOR with (explicit or implicit) 'auto' clause,
+     and no un-annotated loops).  */
+  int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED;
+  adjust_region_code (stmts, &region_code);
+
+  if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, omp_for,
+			 "parallelized loop nest in OpenACC 'kernels'"
+			 " construct\n");
+
+      clauses = transform_kernels_loop_clauses (omp_for,
+						num_gangs_clause,
+						num_workers_clause,
+						vector_length_clause,
+						clauses);
+    }
+  else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, omp_for,
+			 "forwarded loop nest in OpenACC 'kernels' construct"
+			 " to \"parloops\" for analysis\n");
+
+      /* We're transforming one "GF_OMP_TARGET_KIND_OACC_KERNELS" into another
+	 "GF_OMP_TARGET_KIND_OACC_KERNELS", so don't have to
+	 "transform_kernels_loop_clauses".  */
+      /* Re-assemble the clauses stripped off earlier.  */
+      clauses
+	= add_parent_or_loop_num_clause (num_gangs_clause, NULL,
+					 OMP_CLAUSE_NUM_GANGS, clauses);
+      clauses
+	= add_parent_or_loop_num_clause (num_workers_clause, NULL,
+					 OMP_CLAUSE_NUM_WORKERS, clauses);
+      clauses
+	= add_parent_or_loop_num_clause (vector_length_clause, NULL,
+					 OMP_CLAUSE_VECTOR_LENGTH, clauses);
+    }
+  else
+    gcc_unreachable ();

    /* Now build the parallel region containing this loop.  */
-  gimple_seq parallel_body = NULL;
-  gimple_seq_add_stmt (&parallel_body, stmt);
    gimple *parallel_body_bind
-    = gimple_build_bind (NULL, parallel_body, make_node (BLOCK));
+    = gimple_build_bind (NULL, stmts, make_node (BLOCK));
    gimple *parallel_region
-    = gimple_build_omp_target (
-        parallel_body_bind,
-        GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED,
-        clauses);
-  gimple_set_location (parallel_region, gimple_location (stmt));
+    = gimple_build_omp_target (parallel_body_bind, region_code, clauses);
+  gimple_set_location (parallel_region, gimple_location (omp_for));

    return parallel_region;
  }
@@ -744,9 +922,9 @@  class control_flow_regions
         information for the statement sequence SEQ.  */
      control_flow_regions (gimple_seq seq);

-    /* Return true if the STMT with the given index IDX in the analyzed
+    /* Return true if the statement with the given index IDX in the analyzed
         statement sequence is an unconditionally executed OpenACC loop.  */
-    bool is_unconditional_oacc_for_loop (gimple *stmt, size_t idx);
+    bool is_unconditional_oacc_for_loop (size_t idx);

    private:
      /* Find the region representative for the statement identified by index
@@ -777,11 +955,8 @@  control_flow_regions::control_flow_regions (gimple_seq seq)
  }

  bool
-control_flow_regions::is_unconditional_oacc_for_loop (gimple *stmt, size_t idx)
+control_flow_regions::is_unconditional_oacc_for_loop (size_t idx)
  {
-  if (top_level_omp_for_in_stmt (stmt) == NULL)
-    /* Not an OpenACC for loop.  */
-    return false;
    if (idx == 0 || idx == representatives.length () - 1)
      /* The first or last statement in the kernels region.  This means that
         there is no room before or after it for a jump or a label.  Thus
@@ -917,7 +1092,7 @@  control_flow_regions::compute_regions (gimple_seq seq)
  }

  /* Decompose the body of the KERNELS_REGION, which was originally annotated
-   with the KERNELS_CLAUSES, into a series of parallel regions.  */
+   with the KERNELS_CLAUSES, into a series of regions.  */

  static gimple *
  decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
@@ -1057,17 +1232,24 @@  decompose_kernels_region_body (gimple *kernels_region, 
tree kernels_clauses)

        gimple *stmt = gsi_stmt (gsi);
        gimple *omp_for = top_level_omp_for_in_stmt (stmt);
+      bool is_unconditional_oacc_for_loop = false;
+      if (omp_for != NULL)
+	is_unconditional_oacc_for_loop
+	  = cf_regions.is_unconditional_oacc_for_loop (idx);
        if (omp_for != NULL
-          && cf_regions.is_unconditional_oacc_for_loop (stmt, idx))
+          && is_unconditional_oacc_for_loop)
          {
-          /* This is an OMP for statement, put it into a parallel region.
+          /* This is an OMP for statement, put it into a separate region.
               But first, construct a gang-single region containing any
               complex sequential statements we may have seen.  */
            if (gang_single_seq != NULL && !only_simple_assignments)
              {
                gimple *single_region
-                = make_gang_single_region (loc, gang_single_seq,
-                                           kernels_clauses);
+                = make_region_seq (loc, gang_single_seq,
+				   num_gangs_clause,
+				   num_workers_clause,
+				   vector_length_clause,
+				   kernels_clauses);
                gimple_seq_add_stmt (&region_body, single_region);
              }
            else if (gang_single_seq != NULL && only_simple_assignments)
@@ -1085,8 +1267,10 @@  decompose_kernels_region_body (gimple *kernels_region, 
tree kernels_clauses)
            gang_single_seq = NULL;
            only_simple_assignments = true;

+	  gimple_seq parallel_seq = NULL;
+	  gimple_seq_add_stmt (&parallel_seq, stmt);
            gimple *parallel_region
-            = make_gang_parallel_loop_region (omp_for, stmt,
+	    = make_region_loop_nest (omp_for, parallel_seq,
                                                num_gangs_clause,
                                                num_workers_clause,
                                                vector_length_clause,
@@ -1095,6 +1279,16 @@  decompose_kernels_region_body (gimple *kernels_region, 
tree kernels_clauses)
          }
        else
          {
+	  if (omp_for != NULL)
+	    {
+	      gcc_checking_assert (!is_unconditional_oacc_for_loop);
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, omp_for,
+				 "unparallelized loop nest in OpenACC"
+				 " 'kernels' region: it's executed"
+				 " conditionally\n");
+	    }
+
            /* This is not an unconditional OMP for statement, so it will be
               put into a gang-single region.  */
            gimple_seq_add_stmt (&gang_single_seq, stmt);
@@ -1107,9 +1301,6 @@  decompose_kernels_region_body (gimple *kernels_region, 
tree kernels_clauses)
                  && DECL_ARTIFICIAL (gimple_assign_lhs (stmt)));
            if (!is_simple_assignment)
              only_simple_assignments = false;
-          /* Remove and issue warnings about gang clauses on any OpenACC
-             loops nested inside this sequentially executed statement.  */
-          make_loops_gang_single (gsi);
          }
      }

@@ -1128,7 +1319,11 @@  decompose_kernels_region_body (gimple *kernels_region, 
tree kernels_clauses)
    if (gang_single_seq != NULL)
      {
        gimple *single_region
-        = make_gang_single_region (loc, gang_single_seq, kernels_clauses);
+        = make_region_seq (loc, gang_single_seq,
+			   num_gangs_clause,
+			   num_workers_clause,
+			   vector_length_clause,
+			   kernels_clauses);
        gimple_seq_add_stmt (&region_body, single_region);
      }

diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c 
b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
index 3e52ec4..ea7eec9 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
@@ -12,19 +12,22 @@  main (void)
    unsigned int sum = 1;

  #pragma acc kernels copyin(a[0:N]) copy(sum)
-  /* { dg-bogus "region contains gang partitoned code but is not gang 
partitioned" "gang partitioned" { xfail *-*-* } .-1 } */
    {
+    /* converted to "oacc_kernels" */
      #pragma acc loop
      for (i = 0; i < N; ++i)
        sum += a[i];

+    /* converted to "oacc_parallel_kernels_gang_single" */
      sum++;
      a[0]++;

-    #pragma acc loop
+    /* converted to "oacc_parallel_kernels_parallelized" */
+    #pragma acc loop independent
      for (i = 0; i < N; ++i)
        sum += a[i];

+    /* converted to "oacc_kernels" */
      if (sum > 10)
        {
          #pragma acc loop
@@ -32,7 +35,8 @@  main (void)
            sum += a[i];
        }

-    #pragma acc loop
+    /* converted to "oacc_kernels" */
+    #pragma acc loop auto
      for (i = 0; i < N; ++i)
        sum += a[i];
    }
@@ -44,10 +48,11 @@  main (void)
     parallel regions.  */
  /* { dg-final { scan-tree-dump-times "oacc_data_kernels" 1 
"convert_oacc_kernels" } } */

-/* The three unconditional loop regions are parallelized, the sequential
-   part in between and the conditional loop are made gang-single.  */
-/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 3 
"convert_oacc_kernels" } } */
-/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 2 
"convert_oacc_kernels" } } */
+/* As noted in the comments above, we get one gang-single serial region; one
+   parallelized loop region; and three "old-style" kernel regions. */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 1 
"convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 1 
"convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_kernels" 3 "convert_oacc_kernels" } } */

  /* Each of the parallel regions is async, and there is a final call to
     __builtin_GOACC_wait.  */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1.c 
b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1.c
new file mode 100644
index 0000000..b5d58c3
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1.c
@@ -0,0 +1,123 @@ 
+/* Test OpenACC 'kernels' construct decomposition.  */
+
+/* { dg-additional-options "-fopenacc-kernels=split" } */
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+/* { dg-additional-options "-O2" } for "parloops".  */
+
+/* See also "../../gfortran.dg/goacc/kernels-decompose-1.f95".  */
+
+#pragma acc routine gang
+extern int
+f_g (int);
+
+#pragma acc routine worker
+extern int
+f_w (int);
+
+#pragma acc routine vector
+extern int
+f_v (int);
+
+#pragma acc routine seq
+extern int
+f_s (int);
+
+int
+main ()
+{
+  int x, y, z;
+#define N 10
+  int a[N], b[N], c[N];
+
+#pragma acc kernels
+  {
+    x = 0; /* { dg-message "note: beginning .gang-single. region in OpenACC 
.kernels. construct" } */
+    y = x < 10;
+    z = x++;
+    ;
+  }
+
+#pragma acc kernels
+  for (int i = 0; i < N; i++) /* { dg-message "note: beginning .parloops. 
region in OpenACC .kernels. construct" } */
+    a[i] = 0;
+
+#pragma acc kernels loop
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. construct to 
.parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (int i = 0; i < N; i++)
+    b[i] = a[N - i - 1];
+
+#pragma acc kernels
+  {
+#pragma acc loop
+    /* { dg-message "note: forwarded loop nest in OpenACC .kernels. construct 
to .parloops. for analysis" "" { target *-*-* } .-1 } */
+    for (int i = 0; i < N; i++)
+      b[i] = a[N - i - 1];
+
+#pragma acc loop
+    /* { dg-message "note: forwarded loop nest in OpenACC .kernels. construct 
to .parloops. for analysis" "" { target *-*-* } .-1 } */
+    for (int i = 0; i < N; i++)
+      c[i] = a[i] * b[i];
+
+    a[z] = 0; /* { dg-message "note: beginning .gang-single. region in OpenACC 
.kernels. construct" } */
+
+#pragma acc loop
+    /* { dg-message "note: forwarded loop nest in OpenACC .kernels. construct 
to .parloops. for analysis" "" { target *-*-* } .-1 } */
+    for (int i = 0; i < N; i++)
+      c[i] += a[i];
+
+#pragma acc loop seq /* { dg-message "note: assigned OpenACC seq loop 
parallelism" } */
+    /* { dg-message "note: parallelized loop nest in OpenACC .kernels. 
construct" "" { target *-*-* } .-1 } */
+    for (int i = 0 + 1; i < N; i++)
+      c[i] += c[i - 1];
+  }
+
+#pragma acc kernels
+  {
+#pragma acc loop independent /* { dg-message "note: assigned OpenACC gang loop 
parallelism" } */
+    /* { dg-message "note: parallelized loop nest in OpenACC .kernels. 
construct" "" { target *-*-* } .-1 } */
+    for (int i = 0; i < N; ++i)
+#pragma acc loop independent /* { dg-message "note: assigned OpenACC worker 
loop parallelism" } */
+      for (int j = 0; j < N; ++j)
+#pragma acc loop independent /* { dg-message "note: assigned OpenACC seq loop 
parallelism" } */
+	 /* { dg-warning "insufficient partitioning available to parallelize loop" "" 
{ target *-*-* } .-1 } */
+	for (int k = 0; k < N; ++k)
+	  a[(i + j + k) % N]
+	    = b[j]
+	    + f_v (c[k]); /* { dg-message "note: assigned OpenACC vector loop 
parallelism" } */
+
+    //TODO Should the following turn into "gang-single" instead of "parloops"?
+    //TODO The problem is that the first STMT is "if (y <= 4) goto <D.2547>; 
else goto <D.2548>;", thus "parloops".
+    if (y < 5) /* { dg-message "note: beginning .parloops. region in OpenACC 
.kernels. construct" } */
+#pragma acc loop independent /* { dg-message "note: unparallelized loop nest in 
OpenACC .kernels. region: it's executed conditionally" } */
+      for (int j = 0; j < N; ++j)
+	b[j] = f_w (c[j]);
+  }
+
+#pragma acc kernels /* { dg-warning "region contains gang partitoned code but 
is not gang partitioned" } */
+  {
+    /* { dg-message "note: beginning .gang-single. region in OpenACC .kernels. 
construct" "" { target *-*-* } .+1 } */
+    y = f_g (a[5]); /* { dg-message "note: assigned OpenACC gang worker vector 
loop parallelism" } */
+
+#pragma acc loop independent /* { dg-message "note: assigned OpenACC gang loop 
parallelism" } */
+    /* { dg-message "note: parallelized loop nest in OpenACC .kernels. 
construct" "" { target *-*-* } .-1 } */
+    for (int j = 0; j < N; ++j)
+      b[j] = y + f_w (c[j]); /* { dg-message "note: assigned OpenACC worker 
vector loop parallelism" } */
+  }
+
+#pragma acc kernels
+  {
+    y = 3; /* { dg-message "note: beginning .gang-single. region in OpenACC 
.kernels. construct" } */
+
+#pragma acc loop independent /* { dg-message "note: assigned OpenACC gang 
worker loop parallelism" } */
+    /* { dg-message "note: parallelized loop nest in OpenACC .kernels. 
construct" "" { target *-*-* } .-1 } */
+    for (int j = 0; j < N; ++j)
+      b[j] = y + f_v (c[j]); /* { dg-message "note: assigned OpenACC vector 
loop parallelism" } */
+
+    z = 2; /* { dg-message "note: beginning .gang-single. region in OpenACC 
.kernels. construct" } */
+  }
+
+#pragma acc kernels /* { dg-message "note: beginning .gang-single. region in 
OpenACC .kernels. construct" } */
+  ;
+
+  return 0;
+}
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95 
b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
index 559916c..6604727 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
@@ -9,19 +9,23 @@  program main

    !$acc kernels copyin(a(1:N)) copy(sum)

+  ! converted to "oacc_kernels"
    !$acc loop
    do i = 1, N
      sum = sum + a(i)
    end do

+  ! converted to "oacc_parallel_kernels_gang_single"
    sum = sum + 1
    a(1) = a(1) + 1

-  !$acc loop
+  ! converted to "oacc_parallel_kernels_parallelized"
+  !$acc loop independent
    do i = 1, N
      sum = sum + a(i)
    end do

+  ! converted to "oacc_kernels"
    if (sum .gt. 10) then
      !$acc loop
      do i = 1, N
@@ -29,8 +33,8 @@  program main
      end do
    end if

-  !$acc loop
-  ! { dg-bogus "region contains gang partitoned code but is not gang 
partitioned" "gang partitioned" { xfail *-*-* } .-1 }
+  ! converted to "oacc_kernels"
+  !$acc loop auto
    do i = 1, N
      sum = sum + a(i)
    end do
@@ -42,15 +46,13 @@  end program main
  ! parallel regions.
  ! { dg-final { scan-tree-dump-times "oacc_data_kernels" 1 
"convert_oacc_kernels" } }

-! The three unconditional loop regions are parallelized, the sequential part
-! in between and the conditional loop are made gang-single.
-! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 3 
"convert_oacc_kernels" } }
-! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 2 
"convert_oacc_kernels" } }
+! As noted in the comments above, we get one gang-single serial region; one
+! parallelized loop region; and three "old-style" kernel regions.
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 1 
"convert_oacc_kernels" } }
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 1 
"convert_oacc_kernels" } }
+! { dg-final { scan-tree-dump-times "oacc_kernels" 3 "convert_oacc_kernels" } }

  ! Each of the parallel regions is async, and there is a final call to
  ! __builtin_GOACC_wait.
  ! { dg-final { scan-tree-dump-times "oacc_parallel_kernels.* async\(-1\)" 5 
"convert_oacc_kernels" } }
  ! { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 
"convert_oacc_kernels" } }
-
-! Check that the original kernels region is removed.
-! { dg-final { scan-tree-dump-not "oacc_kernels" "convert_oacc_kernels" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1.f95 
b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1.f95
new file mode 100644
index 0000000..520bf03
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1.f95
@@ -0,0 +1,132 @@ 
+! Test OpenACC 'kernels' construct decomposition.
+
+! { dg-additional-options "-fopenacc-kernels=split" }
+! { dg-additional-options "-fopt-info-optimized-omp" }
+! { dg-additional-options "-O2" } for "parloops".
+
+! See also "../../c-c++-common/goacc/kernels-decompose-1.c".
+
+program main
+  implicit none
+
+  integer, external :: f_g
+  !$acc routine (f_g) gang
+  integer, external :: f_w
+  !$acc routine (f_w) worker
+  integer, external :: f_v
+  !$acc routine (f_v) vector
+  integer, external :: f_s
+  !$acc routine (f_s) seq
+
+  integer :: i, j, k
+  integer :: x, y, z
+  logical :: y_l
+  integer, parameter :: N = 10
+  integer :: a(N), b(N), c(N)
+
+  !$acc kernels
+  x = 0 ! { dg-message "note: beginning .gang-single. region in OpenACC 
.kernels. construct" }
+  y = 0
+  y_l = x < 10
+  z = x
+  x = x + 1
+  ;
+  !$acc end kernels
+
+  !$acc kernels ! { dg-message "note: assigned OpenACC gang loop parallelism" }
+  do i = 1, N ! { dg-message "note: beginning .parloops. region in OpenACC 
.kernels. construct" }
+     a(i) = 0
+  end do
+  !$acc end kernels
+
+  !$acc kernels loop ! { dg-message "note: assigned OpenACC gang loop 
parallelism" }
+  ! { dg-message "note: forwarded loop nest in OpenACC .kernels. construct to 
.parloops. for analysis" "" { target *-*-* } .-1 }
+  do i = 1, N
+     b(i) = a(N - i + 1)
+  end do
+
+  !$acc kernels
+  !$acc loop ! { dg-message "note: assigned OpenACC gang loop parallelism" }
+  ! { dg-message "note: forwarded loop nest in OpenACC .kernels. construct to 
.parloops. for analysis" "" { target *-*-* } .-1 }
+  do i = 1, N
+     b(i) = a(N - i + 1)
+  end do
+
+  !$acc loop ! { dg-message "note: assigned OpenACC gang loop parallelism" }
+  ! { dg-message "note: forwarded loop nest in OpenACC .kernels. construct to 
.parloops. for analysis" "" { target *-*-* } .-1 }
+  do i = 1, N
+     c(i) = a(i) * b(i)
+  end do
+
+  a(z) = 0 ! { dg-message "note: beginning .gang-single. region in OpenACC 
.kernels. construct" }
+
+  !$acc loop ! { dg-message "note: assigned OpenACC gang loop parallelism" }
+  ! { dg-message "note: forwarded loop nest in OpenACC .kernels. construct to 
.parloops. for analysis" "" { target *-*-* } .-1 }
+  do i = 1, N
+     c(i) = c(i) + a(i)
+  end do
+
+  !$acc loop seq ! { dg-message "note: assigned OpenACC seq loop parallelism" }
+  ! { dg-message "note: parallelized loop nest in OpenACC .kernels. construct" 
"" { target *-*-* } .-1 }
+  do i = 1 + 1, N
+     c(i) = c(i) + c(i - 1)
+  end do
+  !$acc end kernels
+
+  !$acc kernels ! { dg-bogus "note: assigned OpenACC seq loop parallelism" 
"TODO" { xfail *-*-* } }
+  !$acc loop independent ! { dg-message "note: assigned OpenACC gang loop 
parallelism" }
+  ! { dg-message "note: parallelized loop nest in OpenACC .kernels. construct" 
"" { target *-*-* } .-1 }
+  do i = 1, N
+     !$acc loop independent ! { dg-message "note: assigned OpenACC worker loop 
parallelism" }
+     do j = 1, N
+        !$acc loop independent ! { dg-message "note: assigned OpenACC seq loop 
parallelism" "TODO" { xfail *-*-* } }
+        ! { dg-warning "insufficient partitioning available to parallelize 
loop" "TODO" { xfail *-*-* } .-1 }
+        ! { dg-bogus "note: assigned OpenACC vector loop parallelism" "TODO" { 
xfail *-*-* } .-2 }
+        do k = 1, N
+           a(1 + mod(i + j + k, N)) &
+                = b(j) &
+                + f_v (c(k)) ! { dg-message "note: assigned OpenACC vector loop 
parallelism" "TODO" { xfail *-*-* } .-1 }
+        end do
+     end do
+  end do
+
+  !TODO Should the following turn into "gang-single" instead of "parloops"?
+  !TODO The problem is that the first STMT is "if (y <= 4) goto <D.2547>; else 
goto <D.2548>;", thus "parloops".
+  if (y < 5) then ! { dg-message "note: beginning .parloops. region in OpenACC 
.kernels. construct" }
+     !$acc loop independent ! { dg-message "note: unparallelized loop nest in 
OpenACC .kernels. region: it's executed conditionally" }
+     do j = 1, N
+        b(j) = f_w (c(j))
+     end do
+  end if
+  !$acc end kernels
+
+  !$acc kernels
+  !TODO This refers to the "gang-single" "f_g" call.
+  ! { dg-warning "region contains gang partitoned code but is not gang 
partitioned" "TODO" { xfail *-*-* } .-2 }
+  ! { dg-message "note: beginning .gang-single. region in OpenACC .kernels. 
construct" "" { target *-*-* } .+1 }
+  y = f_g (a(5)) ! { dg-message "note: assigned OpenACC gang worker vector loop 
parallelism" "TODO" { xfail *-*-* } }
+
+  !$acc loop independent ! { dg-message "note: assigned OpenACC gang loop 
parallelism" "TODO" { xfail *-*-* } }
+  ! { dg-message "note: parallelized loop nest in OpenACC .kernels. construct" 
"" { target *-*-* } .-1 }
+  ! { dg-bogus "note: assigned OpenACC gang vector loop parallelism" "TODO" { 
xfail *-*-* } .-2 }
+  do j = 1, N
+     b(j) = y + f_w (c(j)) ! { dg-message "note: assigned OpenACC worker vector 
loop parallelism" "TODO" { xfail *-*-* } }
+  end do
+  !$acc end kernels
+
+  !$acc kernels
+  y = 3 ! { dg-message "note: beginning .gang-single. region in OpenACC 
.kernels. construct" }
+
+  !$acc loop independent ! { dg-message "note: assigned OpenACC gang worker 
loop parallelism" "TODO" { xfail *-*-* } }
+  ! { dg-message "note: parallelized loop nest in OpenACC .kernels. construct" 
"" { target *-*-* } .-1 }
+  ! { dg-bogus "note: assigned OpenACC gang vector loop parallelism" "TODO" { 
xfail *-*-* } .-2 }
+  do j = 1, N
+     b(j) = y + f_v (c(j)) ! { dg-message "note: assigned OpenACC vector loop 
parallelism" "TODO" { xfail *-*-* } }
+  end do
+
+  z = 2 ! { dg-message "note: beginning .gang-single. region in OpenACC 
.kernels. construct" }
+  !$acc end kernels
+
+  !$acc kernels ! { dg-message "note: beginning .gang-single. region in OpenACC 
.kernels. construct" }
+  !$acc end kernels
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c 
b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c
new file mode 100644
index 0000000..601e543
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c
@@ -0,0 +1,30 @@ 
+/* { dg-additional-options "-fopenacc-kernels=split" } */
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+#undef NDEBUG
+#include <assert.h>
+
+int main()
+{
+  int a = 0;
+#define N 123
+  int b[N] = { 0 };
+
+#pragma acc kernels
+  {
+    int c = 234; /* { dg-warning "note: beginning .gang-single. region in 
OpenACC .kernels. construct" } */
+
+#pragma acc loop independent gang /* { dg-warning "note: assigned OpenACC gang 
loop parallelism" } */
+    /* { dg-warning "note: parallelized loop nest in OpenACC .kernels. 
construct" "" { target *-*-* } 17 } */
+    for (int i = 0; i < N; ++i)
+      b[i] = c;
+
+    a = c; /* { dg-warning "note: beginning .gang-single. region in OpenACC 
.kernels. construct" } */
+  }
+
+  for (int i = 0; i < N; ++i)
+    assert (b[i] == 234);