diff mbox

Handle oacc kernels with other directives (was: openacc kernels directive -- initial support)

Message ID 87fv7tz0g3.fsf@kepler.schwinge.homeip.net
State New
Headers show

Commit Message

Thomas Schwinge April 21, 2015, 8:33 p.m. UTC
Hi!

On Sat, 15 Nov 2014 13:14:52 +0100, Tom de Vries <Tom_deVries@mentor.com> wrote:
> I'm submitting a patch series with initial support for the oacc kernels directive.

Committed to gomp-4_0-branch in r222288:

commit 7109b39defb87bc839983339c9fb4cdcb3891238
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date:   Tue Apr 21 20:32:01 2015 +0000

    Handle oacc kernels with other directives
    
    Mark directives with fn spec attributes to prevent them from acting as
    optimization barrier.
    
    	gcc/
    	* builtin-attrs.def (DOT_DOT_r_r_r): Add DEF_ATTR_FOR_STRING.
    	(ATTR_FNSPEC_DOT_DOT_r_r_r_NOTHROW_LIST): Add DEF_ATTR_TREE_LIST.
    	* omp-builtins.def (BUILT_IN_GOACC_DATA_START)
    	(BUILT_IN_GOACC_ENTER_EXIT_DATA, BUILT_IN_GOACC_UPDATE): Use
    	DEF_GOACC_BUILTIN_FNSPEC instead of DEF_GOACC_BUILTIN.
    
    	gcc/testsuite/
    	* c-c++-common/goacc/kernels-loop-data-2.c: New test.
    	* c-c++-common/goacc/kernels-loop-data-enter-exit-2.c: New test.
    	* c-c++-common/goacc/kernels-loop-data-enter-exit.c: New test.
    	* c-c++-common/goacc/kernels-loop-data-update.c: New test.
    	* c-c++-common/goacc/kernels-loop-data.c: New test.
    	* c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c: New
    	test.
    	* gfortran.dg/goacc/kernels-loop-data-2.f95: New test.
    	* gfortran.dg/goacc/kernels-loop-data-enter-exit-2.f95: New test.
    	* gfortran.dg/goacc/kernels-loop-data-enter-exit.f95: New test.
    	* gfortran.dg/goacc/kernels-loop-data-update.f95: New test.
    	* gfortran.dg/goacc/kernels-loop-data.f95: New test.
    	* gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95: New
    	test.
    
    	libgomp/
    	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c: New
    	test.
    	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c:
    	New test.
    	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c:
    	New test.
    	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c:
    	New test.
    	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c: New
    	test.
    	* testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c:
    	New test.
    	* testsuite/libgomp.oacc-fortran/kernels-loop-data-2.f95: New
    	test.
    	* testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit-2.f95:
    	New test.
    	* testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit.f95:
    	New test.
    	* testsuite/libgomp.oacc-fortran/kernels-loop-data-update.f95: New
    	test.
    	* testsuite/libgomp.oacc-fortran/kernels-loop-data.f95: New test.
    	* testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95:
    	New test.
    
    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@222288 138bc75d-0d04-0410-961f-82ee72b054a4
---
 gcc/ChangeLog.gomp                                 |    6 ++
 gcc/builtin-attrs.def                              |    3 +
 gcc/omp-builtins.def                               |   21 +++---
 gcc/testsuite/ChangeLog.gomp                       |   15 +++++
 .../c-c++-common/goacc/kernels-loop-data-2.c       |   71 ++++++++++++++++++++
 .../goacc/kernels-loop-data-enter-exit-2.c         |   69 +++++++++++++++++++
 .../goacc/kernels-loop-data-enter-exit.c           |   66 ++++++++++++++++++
 .../c-c++-common/goacc/kernels-loop-data-update.c  |   66 ++++++++++++++++++
 .../c-c++-common/goacc/kernels-loop-data.c         |   65 ++++++++++++++++++
 .../goacc/kernels-parallel-loop-data-enter-exit.c  |   67 ++++++++++++++++++
 .../gfortran.dg/goacc/kernels-loop-data-2.f95      |   52 ++++++++++++++
 .../goacc/kernels-loop-data-enter-exit-2.f95       |   52 ++++++++++++++
 .../goacc/kernels-loop-data-enter-exit.f95         |   50 ++++++++++++++
 .../gfortran.dg/goacc/kernels-loop-data-update.f95 |   49 ++++++++++++++
 .../gfortran.dg/goacc/kernels-loop-data.f95        |   50 ++++++++++++++
 .../kernels-parallel-loop-data-enter-exit.f95      |   51 ++++++++++++++
 libgomp/ChangeLog.gomp                             |   24 +++++++
 .../kernels-loop-data-2.c                          |   56 +++++++++++++++
 .../kernels-loop-data-enter-exit-2.c               |   54 +++++++++++++++
 .../kernels-loop-data-enter-exit.c                 |   51 ++++++++++++++
 .../kernels-loop-data-update.c                     |   53 +++++++++++++++
 .../libgomp.oacc-c-c++-common/kernels-loop-data.c  |   50 ++++++++++++++
 .../kernels-parallel-loop-data-enter-exit.c        |   52 ++++++++++++++
 .../libgomp.oacc-fortran/kernels-loop-data-2.f95   |   38 +++++++++++
 .../kernels-loop-data-enter-exit-2.f95             |   38 +++++++++++
 .../kernels-loop-data-enter-exit.f95               |   36 ++++++++++
 .../kernels-loop-data-update.f95                   |   36 ++++++++++
 .../libgomp.oacc-fortran/kernels-loop-data.f95     |   36 ++++++++++
 .../kernels-parallel-loop-data-enter-exit.f95      |   37 ++++++++++
 29 files changed, 1306 insertions(+), 8 deletions(-)



Grüße,
 Thomas
diff mbox

Patch

diff --git gcc/ChangeLog.gomp gcc/ChangeLog.gomp
index b1933ba..1e12554 100644
--- gcc/ChangeLog.gomp
+++ gcc/ChangeLog.gomp
@@ -1,5 +1,11 @@ 
 2015-04-21  Tom de Vries  <tom@codesourcery.com>
 
+	* builtin-attrs.def (DOT_DOT_r_r_r): Add DEF_ATTR_FOR_STRING.
+	(ATTR_FNSPEC_DOT_DOT_r_r_r_NOTHROW_LIST): Add DEF_ATTR_TREE_LIST.
+	* omp-builtins.def (BUILT_IN_GOACC_DATA_START)
+	(BUILT_IN_GOACC_ENTER_EXIT_DATA, BUILT_IN_GOACC_UPDATE): Use
+	DEF_GOACC_BUILTIN_FNSPEC instead of DEF_GOACC_BUILTIN.
+
 	* passes.def: Add pass_fre after pass_ch_oacc_kernels.
 
 	* passes.def: Add pass_scev_cprop to pass_oacc_kernels.
diff --git gcc/builtin-attrs.def gcc/builtin-attrs.def
index 8eca053..2897c19 100644
--- gcc/builtin-attrs.def
+++ gcc/builtin-attrs.def
@@ -65,6 +65,7 @@  DEF_ATTR_FOR_INT (6)
 		      ATTR_##ENUM, ATTR_NULL)
 DEF_ATTR_FOR_STRING (STR1, "1")
 DEF_ATTR_FOR_STRING (DOT_DOT_DOT_r_r_r, "...rrr")
+DEF_ATTR_FOR_STRING (DOT_DOT_r_r_r, "..rrr")
 #undef DEF_ATTR_FOR_STRING
 
 /* Construct a tree for a list of two integers.  */
@@ -131,6 +132,8 @@  DEF_ATTR_TREE_LIST (ATTR_PURE_NOTHROW_LEAF_LIST, ATTR_PURE,	\
 DEF_ATTR_TREE_LIST (ATTR_FNSPEC_DOT_DOT_DOT_r_r_r_NOTHROW_LIST, \
 		    ATTR_FNSPEC, ATTR_LIST_DOT_DOT_DOT_r_r_r, \
 		    ATTR_NOTHROW_LIST)
+DEF_ATTR_TREE_LIST (ATTR_FNSPEC_DOT_DOT_r_r_r_NOTHROW_LIST, \
+		    ATTR_FNSPEC, ATTR_LIST_DOT_DOT_r_r_r, ATTR_NOTHROW_LIST)
 DEF_ATTR_TREE_LIST (ATTR_NORETURN_NOTHROW_LIST, ATTR_NORETURN,	\
 			ATTR_NULL, ATTR_NOTHROW_LIST)
 DEF_ATTR_TREE_LIST (ATTR_NORETURN_NOTHROW_LEAF_LIST, ATTR_NORETURN,\
diff --git gcc/omp-builtins.def gcc/omp-builtins.def
index cd273f2..ba64976 100644
--- gcc/omp-builtins.def
+++ gcc/omp-builtins.def
@@ -32,13 +32,17 @@  along with GCC; see the file COPYING3.  If not see
 
 DEF_GOACC_BUILTIN (BUILT_IN_ACC_GET_DEVICE_TYPE, "acc_get_device_type",
 		   BT_FN_INT, ATTR_NOTHROW_LIST)
-DEF_GOACC_BUILTIN (BUILT_IN_GOACC_DATA_START, "GOACC_data_start",
-		   BT_FN_VOID_INT_SIZE_PTR_PTR_PTR, ATTR_NOTHROW_LIST)
+DEF_GOACC_BUILTIN_FNSPEC (BUILT_IN_GOACC_DATA_START, "GOACC_data_start",
+		   	  BT_FN_VOID_INT_SIZE_PTR_PTR_PTR,
+		   	  ATTR_FNSPEC_DOT_DOT_r_r_r_NOTHROW_LIST,
+		   	  ATTR_NOTHROW_LIST, "..rrr")
 DEF_GOACC_BUILTIN (BUILT_IN_GOACC_DATA_END, "GOACC_data_end",
 		   BT_FN_VOID, ATTR_NOTHROW_LIST)
-DEF_GOACC_BUILTIN (BUILT_IN_GOACC_ENTER_EXIT_DATA, "GOACC_enter_exit_data",
-		   BT_FN_VOID_INT_SIZE_PTR_PTR_PTR_INT_INT_VAR,
-		   ATTR_NOTHROW_LIST)
+DEF_GOACC_BUILTIN_FNSPEC (BUILT_IN_GOACC_ENTER_EXIT_DATA,
+			  "GOACC_enter_exit_data",
+			  BT_FN_VOID_INT_SIZE_PTR_PTR_PTR_INT_INT_VAR,
+			  ATTR_FNSPEC_DOT_DOT_r_r_r_NOTHROW_LIST,
+			  ATTR_NOTHROW_LIST, "..rrr")
 DEF_GOACC_BUILTIN_FNSPEC (BUILT_IN_GOACC_KERNELS_INTERNAL,
 			  "GOACC_kernels_internal",
 			  BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_INT_INT_INT_INT_INT_VAR,
@@ -50,9 +54,10 @@  DEF_GOACC_BUILTIN (BUILT_IN_GOACC_KERNELS, "GOACC_kernels",
 DEF_GOACC_BUILTIN (BUILT_IN_GOACC_PARALLEL, "GOACC_parallel",
 		   BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_INT_INT_INT_INT_INT_VAR,
 		   ATTR_NOTHROW_LIST)
-DEF_GOACC_BUILTIN (BUILT_IN_GOACC_UPDATE, "GOACC_update",
-		   BT_FN_VOID_INT_SIZE_PTR_PTR_PTR_INT_INT_VAR,
-		   ATTR_NOTHROW_LIST)
+DEF_GOACC_BUILTIN_FNSPEC (BUILT_IN_GOACC_UPDATE, "GOACC_update",
+			  BT_FN_VOID_INT_SIZE_PTR_PTR_PTR_INT_INT_VAR,
+			  ATTR_FNSPEC_DOT_DOT_r_r_r_NOTHROW_LIST,
+			  ATTR_NOTHROW_LIST, "..rrr")
 DEF_GOACC_BUILTIN (BUILT_IN_GOACC_WAIT, "GOACC_wait",
 		   BT_FN_VOID_INT_INT_VAR,
 		   ATTR_NOTHROW_LIST)
diff --git gcc/testsuite/ChangeLog.gomp gcc/testsuite/ChangeLog.gomp
index ed80f5b..4c2928b 100644
--- gcc/testsuite/ChangeLog.gomp
+++ gcc/testsuite/ChangeLog.gomp
@@ -1,6 +1,21 @@ 
 2015-04-21  Tom de Vries  <tom@codesourcery.com>
 	    Thomas Schwinge  <thomas@codesourcery.com>
 
+	* c-c++-common/goacc/kernels-loop-data-2.c: New test.
+	* c-c++-common/goacc/kernels-loop-data-enter-exit-2.c: New test.
+	* c-c++-common/goacc/kernels-loop-data-enter-exit.c: New test.
+	* c-c++-common/goacc/kernels-loop-data-update.c: New test.
+	* c-c++-common/goacc/kernels-loop-data.c: New test.
+	* c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c: New
+	test.
+	* gfortran.dg/goacc/kernels-loop-data-2.f95: New test.
+	* gfortran.dg/goacc/kernels-loop-data-enter-exit-2.f95: New test.
+	* gfortran.dg/goacc/kernels-loop-data-enter-exit.f95: New test.
+	* gfortran.dg/goacc/kernels-loop-data-update.f95: New test.
+	* gfortran.dg/goacc/kernels-loop-data.f95: New test.
+	* gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95: New
+	test.
+
 	* c-c++-common/goacc/kernels-counter-vars-function-scope.c: New test.
 	* c-c++-common/goacc/kernels-one-counter-var.c: New test.
 	* g++.dg/ipa/devirt-37.C: Update for new pass_fre.
diff --git gcc/testsuite/c-c++-common/goacc/kernels-loop-data-2.c gcc/testsuite/c-c++-common/goacc/kernels-loop-data-2.c
new file mode 100644
index 0000000..fc6da6e
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-loop-data-2.c
@@ -0,0 +1,71 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc data copyout (a[0:N])
+  {
+#pragma acc kernels present (a[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	a[i] = i * 2;
+    }
+  }
+
+#pragma acc data copyout (b[0:N])
+  {
+#pragma acc kernels present (b[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	b[i] = i * 4;
+    }
+  }
+
+#pragma acc data copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  {
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+    {
+      for (COUNTERTYPE ii = 0; ii < N; ii++)
+	c[ii] = a[ii] + b[ii];
+    }
+  }
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be
+   parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit-2.c gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit-2.c
new file mode 100644
index 0000000..945359f
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit-2.c
@@ -0,0 +1,69 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N])
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+#pragma acc exit data copyout (a[0:N])
+
+#pragma acc enter data create (b[0:N])
+#pragma acc kernels present (b[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+#pragma acc exit data copyout (b[0:N])
+
+
+#pragma acc enter data copyin (a[0:N], b[0:N]) create (c[0:N])
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+#pragma acc exit data copyout (c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be
+   parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit.c gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit.c
new file mode 100644
index 0000000..2d6e5e3
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit.c
@@ -0,0 +1,66 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+#pragma acc kernels present (b[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+#pragma acc exit data copyout (a[0:N], c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be
+   parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git gcc/testsuite/c-c++-common/goacc/kernels-loop-data-update.c gcc/testsuite/c-c++-common/goacc/kernels-loop-data-update.c
new file mode 100644
index 0000000..c7aaf0f
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-loop-data-update.c
@@ -0,0 +1,66 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc update device (b[0:N])
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+#pragma acc exit data copyout (a[0:N], c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only two loops are analyzed, and that both can be
+   parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 2 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git gcc/testsuite/c-c++-common/goacc/kernels-loop-data.c gcc/testsuite/c-c++-common/goacc/kernels-loop-data.c
new file mode 100644
index 0000000..46ca9c5
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-loop-data.c
@@ -0,0 +1,65 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc data copyout (a[0:N], b[0:N], c[0:N])
+  {
+#pragma acc kernels present (a[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	a[i] = i * 2;
+    }
+
+#pragma acc kernels present (b[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	b[i] = i * 4;
+    }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+    {
+      for (COUNTERTYPE ii = 0; ii < N; ii++)
+	c[ii] = a[ii] + b[ii];
+    }
+  }
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be
+   parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git gcc/testsuite/c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c gcc/testsuite/c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c
new file mode 100644
index 0000000..3e799ed
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c
@@ -0,0 +1,67 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+#pragma acc parallel present (b[0:N])
+  {
+#pragma acc loop
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+#pragma acc exit data copyout (a[0:N], b[0:N], c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only two loops are analyzed, and that both can be
+   parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 2 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-2.f95 gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-2.f95
new file mode 100644
index 0000000..1b75a23
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-2.f95
@@ -0,0 +1,52 @@ 
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-ftree-parallelize-loops=32" }
+! { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" }
+! { dg-additional-options "-fdump-tree-optimized" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc data copyout (a(0:n-1))
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2
+  end do
+  !$acc end kernels
+  !$acc end data
+
+  !$acc data copyout (b(0:n-1))
+  !$acc kernels present (b(0:n-1))
+  do i = 0, n -1
+     b(i) = i * 4
+  end do
+  !$acc end kernels
+  !$acc end data
+
+  !$acc data copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1))
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)
+  end do
+  !$acc end kernels
+  !$acc end data
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end program main
+
+! Check that only three loops are analyzed, and that all can be parallelized.
+! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } }
+! { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } }
+
+! Check that the loop has been split off into a function.
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.1 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.2 " 1 "optimized" } }
+
+! { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } }
+! { dg-final { cleanup-tree-dump "optimized" } }
diff --git gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-enter-exit-2.f95 gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-enter-exit-2.f95
new file mode 100644
index 0000000..4ba83b6
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-enter-exit-2.f95
@@ -0,0 +1,52 @@ 
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-ftree-parallelize-loops=32" }
+! { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" }
+! { dg-additional-options "-fdump-tree-optimized" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc enter data create (a(0:n-1))
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2
+  end do
+  !$acc end kernels
+  !$acc exit data copyout (a(0:n-1))
+
+  !$acc enter data create (b(0:n-1))
+  !$acc kernels present (b(0:n-1))
+  do i = 0, n -1
+     b(i) = i * 4
+  end do
+  !$acc end kernels
+  !$acc exit data copyout (b(0:n-1))
+
+  !$acc enter data copyin (a(0:n-1), b(0:n-1)) create (c(0:n-1))
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)
+  end do
+  !$acc end kernels
+  !$acc exit data copyout (c(0:n-1))
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end program main
+
+! Check that only three loops are analyzed, and that all can be parallelized.
+! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } }
+! { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } }
+
+! Check that the loop has been split off into a function.
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.1 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.2 " 1 "optimized" } }
+
+! { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } }
+! { dg-final { cleanup-tree-dump "optimized" } }
diff --git gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-enter-exit.f95 gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-enter-exit.f95
new file mode 100644
index 0000000..2b05b33
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-enter-exit.f95
@@ -0,0 +1,50 @@ 
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-ftree-parallelize-loops=32" }
+! { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" }
+! { dg-additional-options "-fdump-tree-optimized" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc enter data create (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2
+  end do
+  !$acc end kernels
+
+  !$acc kernels present (b(0:n-1))
+  do i = 0, n -1
+     b(i) = i * 4
+  end do
+  !$acc end kernels
+
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)
+  end do
+  !$acc end kernels
+
+  !$acc exit data copyout (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end program main
+
+! Check that only three loops are analyzed, and that all can be parallelized.
+! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } }
+! { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } }
+
+! Check that the loop has been split off into a function.
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.1 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.2 " 1 "optimized" } }
+
+! { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } }
+! { dg-final { cleanup-tree-dump "optimized" } }
diff --git gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-update.f95 gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-update.f95
new file mode 100644
index 0000000..b3c80dc
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-update.f95
@@ -0,0 +1,49 @@ 
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-ftree-parallelize-loops=32" }
+! { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" }
+! { dg-additional-options "-fdump-tree-optimized" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc enter data create (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2
+  end do
+  !$acc end kernels
+
+  do i = 0, n -1
+     b(i) = i * 4
+  end do
+
+  !$acc update device (b(0:n-1))
+
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)
+  end do
+  !$acc end kernels
+
+  !$acc exit data copyout (a(0:n-1), c(0:n-1))
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end program main
+
+! Check that only three loops are analyzed, and that all can be parallelized.
+! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 2 "parloops_oacc_kernels" } }
+! { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } }
+
+! Check that the loop has been split off into a function.
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.1 " 1 "optimized" } }
+
+! { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } }
+! { dg-final { cleanup-tree-dump "optimized" } }
diff --git gcc/testsuite/gfortran.dg/goacc/kernels-loop-data.f95 gcc/testsuite/gfortran.dg/goacc/kernels-loop-data.f95
new file mode 100644
index 0000000..98c5e7a
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/kernels-loop-data.f95
@@ -0,0 +1,50 @@ 
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-ftree-parallelize-loops=32" }
+! { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" }
+! { dg-additional-options "-fdump-tree-optimized" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc data copyout (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2
+  end do
+  !$acc end kernels
+
+  !$acc kernels present (b(0:n-1))
+  do i = 0, n -1
+     b(i) = i * 4
+  end do
+  !$acc end kernels
+
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)
+  end do
+  !$acc end kernels
+
+  !$acc end data
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end program main
+
+! Check that only three loops are analyzed, and that all can be parallelized.
+! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } }
+! { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } }
+
+! Check that the loop has been split off into a function.
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.1 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.2 " 1 "optimized" } }
+
+! { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } }
+! { dg-final { cleanup-tree-dump "optimized" } }
diff --git gcc/testsuite/gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95 gcc/testsuite/gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95
new file mode 100644
index 0000000..7ea2b49
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95
@@ -0,0 +1,51 @@ 
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-ftree-parallelize-loops=32" }
+! { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" }
+! { dg-additional-options "-fdump-tree-optimized" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc enter data create (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2
+  end do
+  !$acc end kernels
+
+  !$acc parallel present (b(0:n-1))
+  !$acc loop
+  do i = 0, n -1
+     b(i) = i * 4
+  end do
+  !$acc end parallel
+
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)
+  end do
+  !$acc end kernels
+
+  !$acc exit data copyout (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end program main
+
+! Check that only three loops are analyzed, and that all can be parallelized.
+! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 2 "parloops_oacc_kernels" } }
+! { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } }
+
+! Check that the loop has been split off into a function.
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.1 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.2 " 1 "optimized" } }
+
+! { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } }
+! { dg-final { cleanup-tree-dump "optimized" } }
diff --git libgomp/ChangeLog.gomp libgomp/ChangeLog.gomp
index bcb3340..3d762bd 100644
--- libgomp/ChangeLog.gomp
+++ libgomp/ChangeLog.gomp
@@ -1,6 +1,30 @@ 
 2015-04-21  Tom de Vries  <tom@codesourcery.com>
 	    Thomas Schwinge  <thomas@codesourcery.com>
 
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c: New
+	test.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c:
+	New test.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c:
+	New test.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c:
+	New test.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c: New
+	test.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c:
+	New test.
+	* testsuite/libgomp.oacc-fortran/kernels-loop-data-2.f95: New
+	test.
+	* testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit-2.f95:
+	New test.
+	* testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit.f95:
+	New test.
+	* testsuite/libgomp.oacc-fortran/kernels-loop-data-update.f95: New
+	test.
+	* testsuite/libgomp.oacc-fortran/kernels-loop-data.f95: New test.
+	* testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95:
+	New test.
+
 	* testsuite/libgomp.oacc-fortran/kernels-loop-2.f95: New test.
 	* testsuite/libgomp.oacc-fortran/kernels-loop.f95: New test.
 
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c
new file mode 100644
index 0000000..325ea7d
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c
@@ -0,0 +1,56 @@ 
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+#pragma acc data copyout (a[0:N])
+  {
+#pragma acc kernels present (a[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	a[i] = i * 2;
+    }
+  }
+
+#pragma acc data copyout (b[0:N])
+  {
+#pragma acc kernels present (b[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	b[i] = i * 4;
+    }
+  }
+
+#pragma acc data copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  {
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+    {
+      for (COUNTERTYPE ii = 0; ii < N; ii++)
+	c[ii] = a[ii] + b[ii];
+    }
+  }
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c
new file mode 100644
index 0000000..9c378a2
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c
@@ -0,0 +1,54 @@ 
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N])
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+#pragma acc exit data copyout (a[0:N])
+
+#pragma acc enter data create (b[0:N])
+#pragma acc kernels present (b[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+#pragma acc exit data copyout (b[0:N])
+
+
+#pragma acc enter data copyin (a[0:N], b[0:N]) create (c[0:N])
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+#pragma acc exit data copyout (c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c
new file mode 100644
index 0000000..78cf4c1
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c
@@ -0,0 +1,51 @@ 
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+#pragma acc kernels present (b[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+#pragma acc exit data copyout (a[0:N], b[0:N], c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c
new file mode 100644
index 0000000..67c2c36
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c
@@ -0,0 +1,53 @@ 
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc update device (b[0:N])
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+#pragma acc exit data copyout (a[0:N], c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c
new file mode 100644
index 0000000..acd7f30
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c
@@ -0,0 +1,50 @@ 
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+#pragma acc data copyout (a[0:N], b[0:N], c[0:N])
+  {
+#pragma acc kernels present (a[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	a[i] = i * 2;
+    }
+
+#pragma acc kernels present (b[0:N])
+    {
+      for (COUNTERTYPE i = 0; i < N; i++)
+	b[i] = i * 4;
+    }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+    {
+      for (COUNTERTYPE ii = 0; ii < N; ii++)
+	c[ii] = a[ii] + b[ii];
+    }
+  }
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c
new file mode 100644
index 0000000..cab10df
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c
@@ -0,0 +1,52 @@ 
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+  c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+#pragma acc enter data create (a[0:N], b[0:N], c[0:N])
+
+#pragma acc kernels present (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+#pragma acc parallel present (b[0:N])
+  {
+#pragma acc loop
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc kernels present (a[0:N], b[0:N], c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+#pragma acc exit data copyout (a[0:N], b[0:N], c[0:N])
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    if (c[i] != a[i] + b[i])
+      abort ();
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-2.f95 libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-2.f95
new file mode 100644
index 0000000..7b52253
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-2.f95
@@ -0,0 +1,38 @@ 
+! { dg-do run }
+! { dg-options "-ftree-parallelize-loops=32" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc data copyout (a(0:n-1))
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2
+  end do
+  !$acc end kernels
+  !$acc end data
+
+  !$acc data copyout (b(0:n-1))
+  !$acc kernels present (b(0:n-1))
+  do i = 0, n -1
+     b(i) = i * 4
+  end do
+  !$acc end kernels
+  !$acc end data
+
+  !$acc data copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1))
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)
+  end do
+  !$acc end kernels
+  !$acc end data
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit-2.f95 libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit-2.f95
new file mode 100644
index 0000000..af98efa
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit-2.f95
@@ -0,0 +1,38 @@ 
+! { dg-do run }
+! { dg-options "-ftree-parallelize-loops=32" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc enter data create (a(0:n-1))
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2
+  end do
+  !$acc end kernels
+  !$acc exit data copyout (a(0:n-1))
+
+  !$acc enter data create (b(0:n-1))
+  !$acc kernels present (b(0:n-1))
+  do i = 0, n -1
+     b(i) = i * 4
+  end do
+  !$acc end kernels
+  !$acc exit data copyout (b(0:n-1))
+
+  !$acc enter data copyin (a(0:n-1), b(0:n-1)) create (c(0:n-1))
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)
+  end do
+  !$acc end kernels
+  !$acc exit data copyout (c(0:n-1))
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit.f95 libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit.f95
new file mode 100644
index 0000000..bb6f8dc
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit.f95
@@ -0,0 +1,36 @@ 
+! { dg-do run }
+! { dg-options "-ftree-parallelize-loops=32" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc enter data create (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2
+  end do
+  !$acc end kernels
+
+  !$acc kernels present (b(0:n-1))
+  do i = 0, n -1
+     b(i) = i * 4
+  end do
+  !$acc end kernels
+
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)
+  end do
+  !$acc end kernels
+
+  !$acc exit data copyout (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-update.f95 libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-update.f95
new file mode 100644
index 0000000..cab1f2c
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-update.f95
@@ -0,0 +1,36 @@ 
+! { dg-do run }
+! { dg-options "-ftree-parallelize-loops=32" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc enter data create (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2
+  end do
+  !$acc end kernels
+
+  do i = 0, n -1
+     b(i) = i * 4
+  end do
+
+  !$acc update device (b(0:n-1))
+
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)
+  end do
+  !$acc end kernels
+
+  !$acc exit data copyout (a(0:n-1), c(0:n-1))
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data.f95 libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data.f95
new file mode 100644
index 0000000..f26671d
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data.f95
@@ -0,0 +1,36 @@ 
+! { dg-do run }
+! { dg-options "-ftree-parallelize-loops=32" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc data copyout (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2
+  end do
+  !$acc end kernels
+
+  !$acc kernels present (b(0:n-1))
+  do i = 0, n -1
+     b(i) = i * 4
+  end do
+  !$acc end kernels
+
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)
+  end do
+  !$acc end kernels
+
+  !$acc end data
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95 libgomp/testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95
new file mode 100644
index 0000000..2322152
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95
@@ -0,0 +1,37 @@ 
+! { dg-do run }
+! { dg-options "-ftree-parallelize-loops=32" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc enter data create (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2
+  end do
+  !$acc end kernels
+
+  !$acc parallel present (b(0:n-1))
+  !$acc loop
+  do i = 0, n -1
+     b(i) = i * 4
+  end do
+  !$acc end parallel
+
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)
+  end do
+  !$acc end kernels
+
+  !$acc exit data copyout (a(0:n-1), b(0:n-1), c(0:n-1))
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) call abort
+  end do
+
+end program main