diff mbox

[gomp4] Implement OpenACC 2.5 reference counting, and finalize clause

Message ID 3cd53eb1-38b2-e082-2c20-d3e0b14f6385@mentor.com
State New
Headers show

Commit Message

Chung-Lin Tang May 16, 2017, 12:55 p.m. UTC
This patch for gomp-4_0-branch implements OpenACC 2.5 reference counting
of mappings, the finalize clause of the exit data directive, and the
corresponding API routines.

Tested without regressions, committed to gomp-4_0-branch.

Chung-Lin

2017-05-16  Chung-Lin Tang  <cltang@codesourcery.com>

        gcc/c/
        * c-parser.c (c_parser_omp_clause_name):  Handle 'finalize' clause.
        (c_parser_oacc_simple_clause): Add 'finalize' to comments.
        (c_parser_oacc_all_clauses): Handle PRAGMA_OACC_CLAUSE_FINALIZE.
        (OACC_EXIT_DATA_CLAUSE_MASK): Add PRAGMA_OACC_CLAUSE_FINALIZE.
        * c-typeck.c (c_finish_omp_clauses): Handle OMP_CLAUSE_FINALIZE.

        gcc/cp/
        * parser.c (cp_parser_omp_clause_name): Handle 'finalize' clause.
        (cp_parser_oacc_simple_clause): Add 'finalize' to comments.
        (cp_parser_oacc_all_clauses): Handle PRAGMA_OACC_CLAUSE_FINALIZE.
        (OACC_EXIT_DATA_CLAUSE_MASK): Add PRAGMA_OACC_CLAUSE_FINALIZE.
        * semantics.c (finish_omp_clauses): Handle OMP_CLAUSE_FINALIZE.

        gcc/c-family/
        * c-pragma.h (enum pragma_omp_clause): Add PRAGMA_OACC_CLAUSE_FINALIZE.

        gcc/fortran/
        * gfortran.h (struct gfc_omp_clauses): Add 'finalize:1' bitfield.
        * openmp.c (enum omp_mask2): Add OMP_CLAUSE_FINALIZE.
        (gfc_match_omp_clauses): Handle 'finalize' clause.
        (OACC_EXIT_DATA_CLAUSES): Add OMP_CLAUSE_FINALIZE.
        * trans-openmp.c (gfc_trans_omp_clauses_1): Handle finalize bit.

        gcc/
        * tree-core.h (enum omp_clause_code): Add OMP_CLAUSE_FINALIZE.
        * tree.c (omp_clause_num_ops): Add entry for OMP_CLAUSE_FINALIZE.
        (omp_clause_code_name): Add "finalize" entry.
        * omp-low.c (scan_sharing_clauses): Handle OMP_CLAUSE_FINALIZE.
        (expand_omp_target): Add finalize argument for GOACC_enter_exit_data
        call.
        * gimplify.c (gimplify_scan_omp_clauses): Handle OMP_CLAUSE_FINALIZE.
        (gimplify_adjust_omp_clauses): Likewise.

        libgomp/
        * openacc.h (acc_copyout_finalize): Declare new API function.
        (acc_copyout_finalize_async): Likewise.
	(acc_delete_finalize): Likewise.
        (acc_delete_finalize_async): Likewise.
        * openacc_lib.h (acc_copyout_finalize): Declare new API function.
        (acc_copyout_finalize_async): Likewise.
        (acc_delete_finalize): Likewise.
        (acc_delete_finalize_async): Likewise.
        * openacc.f90 (acc_copyout_finalize_32_h): Define.
	(acc_copyout_finalize_64_h): Likewise.
        (acc_copyout_finalize_array_h): Likewise.
        (acc_copyout_finalize_l): Likewise.
        (acc_copyout_finalize_async_32_h): Define.
        (acc_copyout_finalize_async_64_h): Likewise.
        (acc_copyout_finalize_async_array_h): Likewise.
        (acc_copyout_finalize_async_l): Likewise.
	(acc_delete_finalize_32_h): Define.
        (acc_delete_finalize_64_h): Likewise.
        (acc_delete_finalize_array_h): Likewise.
	(acc_delete_finalize_l): Likewise.
        (acc_delete_finalize_async_32_h): Define.
        (acc_delete_finalize_async_64_h): Likewise.
        (acc_delete_finalize_async_array_h): Likewise.
        (acc_delete_finalize_async_l): Likewise.
        * libgomp.map (OACC_2.5): Add acc_copyout_finalize* and
        acc_delete_finalize* entries.
	* libgomp.h (struct splay_tree_key_s): Add 'dynamic_refcount' field.
        (gomp_acc_remove_pointer): Adjust declaration.
        (gomp_remove_var): New declaration.
        * libgomp_g.h (GOACC_enter_exit_data): Adjust declaration.
        * oacc-mem.c (acc_map_data): Adjust new key refcount to REFCOUNT_INFINITY.
        (acc_unmap_data): Adjust key refcount to 1 for removal.
        (present_create_copy): Increment mapping refcounts when mapping exists,
        initialize dynamic refcount when creating new mapping.
        (FLAG_FINALIZE): Define macro.
        (delete_copyout): Adjust delete/copyout handling, add handling for FLAG_FINALIZE.
        (acc_delete_finalize): Define new API function.
	(acc_delete_finalize_async): Likewise.
	(acc_copyout_finalize): Likewise.
        (acc_copyout_finalize_async): Likewise.
        (gomp_acc_insert_pointer): Adjust handling.
        (gomp_acc_remove_pointer): Add finalize parameter, adjust handling.
	* oacc-parallel.c (GOACC_parallel_keyed): Disable async registering when no
        copyout needed.
        (GOACC_enter_exit_data): Add and handle finalize argument, adjust
        gomp_acc_insert_pointer and gomp_acc_remove_pointer calls.
        (GOACC_declare): Adjust calls to GOACC_enter_exit_data.
        * target.c (gomp_map_vars): Initialize dynamic_refcount.
        (gomp_remove_var): Abstract out key unreferencing into new function.
	(gomp_unmap_vars): Adjust to call gomp_remove_var.
        (gomp_unload_image_from_device): Likewise.
        (gomp_exit_data): Likewise.
        * testsuite/libgomp.oacc-c-c++-common/data-2.c: Adjust testcase for 2.5 reference counting.
        * testsuite/libgomp.oacc-c-c++-common/lib-38.c: Likewise.
        * testsuite/libgomp.oacc-fortran/data-2.f90: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/data-4.c: New test.
        * testsuite/libgomp.oacc-c-c++-common/data-5.c: Likewise.
        * testsuite/libgomp.oacc-fortran/data-5.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/data-6.f90: Likewise.
diff mbox

Patch

Index: libgomp/oacc-parallel.c
===================================================================
--- libgomp/oacc-parallel.c	(revision 248095)
+++ libgomp/oacc-parallel.c	(revision 248096)
@@ -355,7 +355,22 @@ 
 	}
     }
   else
-    tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
+    {
+      bool async_unmap = false;
+      for (size_t i = 0; i < tgt->list_count; i++)
+	{
+	  splay_tree_key k = tgt->list[i].key;
+	  if (k && k->refcount == 1)
+	    {
+	      async_unmap = true;
+	      break;
+	    }
+	}
+      if (async_unmap)
+	tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
+      else
+	gomp_unmap_vars (tgt, false);
+    }
 
   acc_dev->openacc.async_set_async_func (acc_async_sync);
 
@@ -586,7 +601,7 @@ 
 void
 GOACC_enter_exit_data (int device, size_t mapnum,
 		       void **hostaddrs, size_t *sizes, unsigned short *kinds,
-		       int async, int num_waits, ...)
+		       int async, int finalize, int num_waits, ...)
 {
   struct goacc_thread *thr;
   struct gomp_device_descr *acc_dev;
@@ -749,11 +764,9 @@ 
 	      if (kind == GOMP_MAP_DECLARE_ALLOCATE)
 		gomp_acc_declare_allocate (true, pointer, &hostaddrs[i],
 					   &sizes[i], &kinds[i]);
-	      else if (!acc_is_present (hostaddrs[i], sizes[i]))
-		{
-		  gomp_acc_insert_pointer (pointer, &hostaddrs[i],
-					   &sizes[i], &kinds[i]);
-		}
+	      else
+		gomp_acc_insert_pointer (pointer, &hostaddrs[i],
+					 &sizes[i], &kinds[i]);
 	      /* Increment 'i' by two because OpenACC requires fortran
 		 arrays to be contiguous, so each PSET is associated with
 		 one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and
@@ -775,12 +788,20 @@ 
 	      {
 	      case GOMP_MAP_DELETE:
 		if (acc_is_present (hostaddrs[i], sizes[i]))
-		  acc_delete (hostaddrs[i], sizes[i]);
+		  {
+		    if (finalize)
+		      acc_delete_finalize (hostaddrs[i], sizes[i]);
+		    else
+		      acc_delete (hostaddrs[i], sizes[i]);
+		  }
 		break;
 	      case GOMP_MAP_DECLARE_DEALLOCATE:
 	      case GOMP_MAP_FROM:
 	      case GOMP_MAP_FORCE_FROM:
-		acc_copyout (hostaddrs[i], sizes[i]);
+		if (finalize)
+		  acc_copyout_finalize (hostaddrs[i], sizes[i]);
+		else
+		  acc_copyout (hostaddrs[i], sizes[i]);
 		break;
 	      default:
 		gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -793,11 +814,12 @@ 
 	    if (kind == GOMP_MAP_DECLARE_DEALLOCATE)
 	      gomp_acc_declare_allocate (false, pointer, &hostaddrs[i],
 					 &sizes[i], &kinds[i]);
-	    else if (acc_is_present (hostaddrs[i], sizes[i]))
+	    else
 	      {
 		bool copyfrom = (kind == GOMP_MAP_FORCE_FROM
 				 || kind == GOMP_MAP_FROM);
-		gomp_acc_remove_pointer (hostaddrs[i], copyfrom, async, pointer);
+		gomp_acc_remove_pointer (hostaddrs[i], sizes[i], copyfrom, async,
+					 finalize, pointer);
 		/* See the above comment.  */
 	      }
 	    i += pointer - 1;
@@ -1077,7 +1099,7 @@ 
 	  case GOMP_MAP_POINTER:
 	  case GOMP_MAP_DELETE:
 	    GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
-				   &kinds[i], 0, 0);
+				   &kinds[i], 0, 0, 0);
 	    break;
 
 	  case GOMP_MAP_FORCE_DEVICEPTR:
@@ -1086,12 +1108,12 @@ 
 	  case GOMP_MAP_ALLOC:
 	    if (!acc_is_present (hostaddrs[i], sizes[i]))
 	      GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
-				     &kinds[i], 0, 0);
+				     &kinds[i], 0, 0, 0);
 	    break;
 
 	  case GOMP_MAP_TO:
 	    GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
-				   &kinds[i], 0, 0);
+				   &kinds[i], 0, 0, 0);
 
 	    break;
 
@@ -1098,7 +1120,7 @@ 
 	  case GOMP_MAP_FROM:
 	    kinds[i] = GOMP_MAP_FORCE_FROM;
 	    GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
-				   &kinds[i], 0, 0);
+				   &kinds[i], 0, 0, 0);
 	    break;
 
 	  case GOMP_MAP_FORCE_PRESENT:
Index: libgomp/libgomp_g.h
===================================================================
--- libgomp/libgomp_g.h	(revision 248095)
+++ libgomp/libgomp_g.h	(revision 248096)
@@ -304,7 +304,7 @@ 
 			      unsigned short *);
 extern void GOACC_data_end (void);
 extern void GOACC_enter_exit_data (int, size_t, void **,
-				   size_t *, unsigned short *, int, int, ...);
+				   size_t *, unsigned short *, int, int, int, ...);
 extern void GOACC_update (int, size_t, void **, size_t *,
 			  unsigned short *, int, int, ...);
 extern void GOACC_wait (int, int, ...);
Index: libgomp/openacc.h
===================================================================
--- libgomp/openacc.h	(revision 248095)
+++ libgomp/openacc.h	(revision 248096)
@@ -118,6 +118,12 @@ 
 void acc_memcpy_to_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
 void acc_memcpy_from_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
 
+/* Finalize versions of copyout/delete functions, specified in OpenACC 2.5.  */
+void acc_copyout_finalize (void *, size_t) __GOACC_NOTHROW;
+void acc_copyout_finalize_async (void *, size_t, int) __GOACC_NOTHROW;
+void acc_delete_finalize (void *, size_t) __GOACC_NOTHROW;
+void acc_delete_finalize_async (void *, size_t, int) __GOACC_NOTHROW;
+  
 /* Old names.  OpenACC does not specify whether these can or must
    not be macros, inlines or aliases for the new names.  */
 #define acc_pcreate acc_present_or_create
Index: libgomp/libgomp.map
===================================================================
--- libgomp/libgomp.map	(revision 248095)
+++ libgomp/libgomp.map	(revision 248096)
@@ -388,6 +388,14 @@ 
 	acc_copyout_async_32_h_;
 	acc_copyout_async_64_h_;
 	acc_copyout_async_array_h_;
+	acc_copyout_finalize;
+	acc_copyout_finalize_32_h_;
+	acc_copyout_finalize_64_h_;
+	acc_copyout_finalize_array_h_;
+	acc_copyout_finalize_async;
+	acc_copyout_finalize_async_32_h_;
+	acc_copyout_finalize_async_64_h_;
+	acc_copyout_finalize_async_array_h_;
 	acc_create_async;
 	acc_create_async_32_h_;
 	acc_create_async_64_h_;
@@ -396,6 +404,14 @@ 
 	acc_delete_async_32_h_;
 	acc_delete_async_64_h_;
 	acc_delete_async_array_h_;
+	acc_delete_finalize;
+	acc_delete_finalize_32_h_;
+	acc_delete_finalize_64_h_;
+	acc_delete_finalize_array_h_;
+	acc_delete_finalize_async;
+	acc_delete_finalize_async_32_h_;
+	acc_delete_finalize_async_64_h_;
+	acc_delete_finalize_async_array_h_;
 	acc_get_default_async;
 	acc_get_default_async_h_;
 	acc_memcpy_from_device_async;
Index: libgomp/testsuite/libgomp.oacc-fortran/data-5.f90
===================================================================
--- libgomp/testsuite/libgomp.oacc-fortran/data-5.f90	(nonexistent)
+++ libgomp/testsuite/libgomp.oacc-fortran/data-5.f90	(revision 248096)
@@ -0,0 +1,56 @@ 
+! { dg-do run }
+! { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } }
+
+program refcount_test
+  use openacc
+  integer, allocatable :: h(:)
+  integer i, N
+
+  N = 256
+  allocate (h(N))
+
+  do i = 1, N
+     h(i) = i
+  end do
+
+  !$acc enter data create (h(1:N))
+  !$acc enter data copyin (h(1:N))
+  !$acc enter data copyin (h(1:N))
+  !$acc enter data copyin (h(1:N))
+
+  call acc_update_self (h)
+  do i = 1, N
+     if (h(i) .eq. i) c = c + 1
+  end do
+  ! h[] should be filled with uninitialized device values,
+  ! abort if it's not.
+  if (c .eq. N) call abort
+
+  h(:) = 0
+
+  !$acc parallel present (h(1:N))
+  do i = 1, N
+     h(i) = 111
+  end do
+  !$acc end parallel
+
+  ! No actual copyout should happen.
+  call acc_copyout (h)
+  do i = 1, N
+     if (h(i) .ne. 0) call abort
+  end do
+
+  !$acc exit data delete (h(1:N))
+
+  ! This should not actually be deleted yet.
+  if (acc_is_present (h) .eqv. .FALSE.) call abort
+
+  !$acc exit data copyout (h(1:N)) finalize
+
+  do i = 1, N
+     if (h(i) .ne. 111) call abort
+  end do
+
+  if (acc_is_present (h) .eqv. .TRUE.) call abort
+
+end program refcount_test
Index: libgomp/testsuite/libgomp.oacc-fortran/data-2.f90
===================================================================
--- libgomp/testsuite/libgomp.oacc-fortran/data-2.f90	(revision 248095)
+++ libgomp/testsuite/libgomp.oacc-fortran/data-2.f90	(revision 248096)
@@ -157,8 +157,8 @@ 
 
   !$acc exit data delete (c(0:N), d(0:N))
 
-  if (acc_is_present (c) .eqv. .TRUE.) call abort
-  if (acc_is_present (d) .eqv. .TRUE.) call abort
+  if (acc_is_present (c) .eqv. .FALSE.) call abort
+  if (acc_is_present (d) .eqv. .FALSE.) call abort
 
   !$acc exit data delete (c(0:N), d(0:N))
 
@@ -177,13 +177,13 @@ 
 
   !$acc exit data delete (c(0:N), d(0:N))
 
-  if (acc_is_present (c) .eqv. .TRUE.) call abort
-  if (acc_is_present (d) .eqv. .TRUE.) call abort
+  !if (acc_is_present (c) .eqv. .TRUE.) call abort
+  !if (acc_is_present (d) .eqv. .TRUE.) call abort
 
   !$acc exit data delete (c(0:N), d(0:N))
 
-  if (acc_is_present (c) .eqv. .TRUE.) call abort
-  if (acc_is_present (d) .eqv. .TRUE.) call abort
+ if (acc_is_present (c) .eqv. .TRUE.) call abort
+ if (acc_is_present (d) .eqv. .TRUE.) call abort
 
   !$acc enter data present_or_copyin (c(0:N))
 
Index: libgomp/testsuite/libgomp.oacc-fortran/data-6.f90
===================================================================
--- libgomp/testsuite/libgomp.oacc-fortran/data-6.f90	(nonexistent)
+++ libgomp/testsuite/libgomp.oacc-fortran/data-6.f90	(revision 248096)
@@ -0,0 +1,26 @@ 
+! { dg-do run }
+! { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } }
+
+program refcount_test
+  use openacc
+  integer, allocatable :: h(:)
+  integer i, N
+
+  N = 256
+  allocate (h(N))
+
+  do i = 1, N
+     h(i) = i
+  end do
+
+  !$acc data create (h(1:N))
+  !$acc enter data create (h(1:N))
+  !$acc end data
+
+  if (acc_is_present (h) .eqv. .FALSE.) call abort
+
+  !$acc exit data delete (h(1:N))
+
+  if (acc_is_present (h) .eqv. .TRUE.) call abort
+
+end program refcount_test
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-38.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-38.c	(revision 248095)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-38.c	(revision 248096)
@@ -38,7 +38,7 @@ 
 
   memset (&h[0], 0, N);
 
-  acc_copyout (h, N);
+  acc_copyout_finalize (h, N);
 
   for (i = 0; i < N; i++)
     {
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c	(revision 248095)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c	(revision 248096)
@@ -268,10 +268,10 @@ 
 
 #pragma acc exit data delete (a[0:N], b[0:N])
 
-  if (acc_is_present (a, nbytes))
+  if (!acc_is_present (a, nbytes))
     abort ();
 
-  if (acc_is_present (b, nbytes))
+  if (!acc_is_present (b, nbytes))
     abort ();
 
 #pragma acc exit data delete (a[0:N], b[0:N])
@@ -300,10 +300,10 @@ 
 
 #pragma acc exit data delete (a[0:N], b[0:N])
 
-  if (acc_is_present (a, nbytes))
+  if (!acc_is_present (a, nbytes))
     abort ();
 
-  if (acc_is_present (b, nbytes))
+  if (!acc_is_present (b, nbytes))
     abort ();
 
 #pragma acc exit data delete (a[0:N], b[0:N])
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/data-4.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/data-4.c	(nonexistent)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-4.c	(revision 248096)
@@ -0,0 +1,38 @@ 
+/* { dg-do run } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */
+
+#include <string.h>
+#include <stdlib.h>
+#include <openacc.h>
+
+int
+main (int argc, char **argv)
+{
+  const int N = 256;
+  int i;
+  unsigned char *h;
+  void *d1, *d2;
+
+  h = (unsigned char *) malloc (N);
+
+  for (i = 0; i < N; i++)
+    {
+      h[i] = i;
+    }
+
+#pragma acc data create (h[0:N])
+  {
+    #pragma acc enter data create (h[0:N])
+  }
+
+  if (!acc_is_present (h, N))
+    abort ();
+
+#pragma acc exit data delete (h[0:N])
+
+  if (acc_is_present (h, N))
+    abort ();
+
+  free (h);
+  return 0;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/data-5.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/data-5.c	(nonexistent)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-5.c	(revision 248096)
@@ -0,0 +1,66 @@ 
+/* { dg-do run } */
+/* { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } } */
+
+#include <string.h>
+#include <stdlib.h>
+#include <openacc.h>
+
+int
+main (int argc, char **argv)
+{
+  const int N = 256;
+  int i, c;
+  unsigned char *h;
+  void *d1, *d2;
+
+  h = (unsigned char *) malloc (N);
+
+  for (i = 0; i < N; i++)
+    h[i] = i;
+
+  #pragma acc enter data create (h[0:N])
+  #pragma acc enter data copyin (h[0:N])
+  #pragma acc enter data copyin (h[0:N])
+  #pragma acc enter data copyin (h[0:N])
+
+  acc_update_self (h, N);
+  for (i = 0, c = 0; i < N; i++)
+    if (h[i] == i)
+      c++;
+  /* h[] should be filled with uninitialized device values,
+     abort if it's not.  */
+  if (c == N)
+    abort ();
+
+  for (i = 0; i < N; i++)
+    h[i] = 0;
+
+  #pragma acc parallel present(h[0:N])
+  {
+    for (i = 0; i < N; i++)
+      h[i] = 111;
+  }
+
+  /* No actual copyout should happen.  */
+  acc_copyout (h, N);
+  for (i = 0; i < N; i++)
+    if (h[i] != 0)
+      abort ();
+
+  #pragma acc exit data delete (h[0:N])
+  /* This should not actually be deleted yet.  */
+  if (!acc_is_present (h, N))
+    abort ();
+
+  #pragma acc exit data copyout (h[0:N]) finalize
+
+  for (i = 0; i < N; i++)
+    if (h[i] != 111)
+      abort ();
+
+  if (acc_is_present (h, N))
+    abort ();
+
+  free (h);
+  return 0;
+}
Index: libgomp/target.c
===================================================================
--- libgomp/target.c	(revision 248095)
+++ libgomp/target.c	(revision 248096)
@@ -984,6 +984,7 @@ 
 		tgt->list[i].offset = 0;
 		tgt->list[i].length = k->host_end - k->host_start;
 		k->refcount = 1;
+		k->dynamic_refcount = 0;
 		tgt->refcount++;
 		array->left = NULL;
 		array->right = NULL;
@@ -1242,6 +1243,23 @@ 
   free (tgt);
 }
 
+attribute_hidden bool
+gomp_remove_var (struct gomp_device_descr *devicep, splay_tree_key k)
+{
+  bool is_tgt_unmapped = false;
+  splay_tree_remove (&devicep->mem_map, k);
+  if (k->link_key)
+    splay_tree_insert (&devicep->mem_map, (splay_tree_node) k->link_key);
+  if (k->tgt->refcount > 1)
+    k->tgt->refcount--;
+  else
+    {
+      is_tgt_unmapped = true;
+      gomp_unmap_tgt (k->tgt);
+    }      
+  return is_tgt_unmapped;
+}
+
 /* Unmap variables described by TGT.  If DO_COPYFROM is true, copy relevant
    variables back from device to host: if it is false, it is assumed that this
    has been done already.  */
@@ -1290,16 +1308,7 @@ 
 				      + tgt->list[i].offset),
 			    tgt->list[i].length);
       if (do_unmap)
-	{
-	  splay_tree_remove (&devicep->mem_map, k);
-	  if (k->link_key)
-	    splay_tree_insert (&devicep->mem_map,
-			       (splay_tree_node) k->link_key);
-	  if (k->tgt->refcount > 1)
-	    k->tgt->refcount--;
-	  else
-	    gomp_unmap_tgt (k->tgt);
-	}
+	gomp_remove_var (devicep, k);
     }
 
   if (tgt->refcount > 1)
@@ -1536,17 +1545,7 @@ 
       else
 	{
 	  splay_tree_key n = splay_tree_lookup (&devicep->mem_map, &k);
-	  splay_tree_remove (&devicep->mem_map, n);
-	  if (n->link_key)
-	    {
-	      if (n->tgt->refcount > 1)
-		n->tgt->refcount--;
-	      else
-		{
-		  is_tgt_unmapped = true;
-		  gomp_unmap_tgt (n->tgt);
-		}
-	    }
+	  is_tgt_unmapped = gomp_remove_var (devicep, n);
 	}
     }
 
@@ -2229,16 +2228,7 @@ 
 					  - k->host_start),
 				cur_node.host_end - cur_node.host_start);
 	  if (k->refcount == 0)
-	    {
-	      splay_tree_remove (&devicep->mem_map, k);
-	      if (k->link_key)
-		splay_tree_insert (&devicep->mem_map,
-				   (splay_tree_node) k->link_key);
-	      if (k->tgt->refcount > 1)
-		k->tgt->refcount--;
-	      else
-		gomp_unmap_tgt (k->tgt);
-	    }
+	    gomp_remove_var (devicep, k);
 
 	  break;
 	default:
Index: libgomp/oacc-mem.c
===================================================================
--- libgomp/oacc-mem.c	(revision 248095)
+++ libgomp/oacc-mem.c	(revision 248096)
@@ -440,6 +440,7 @@ 
 
       tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes,
 			   &kinds, true, GOMP_MAP_VARS_OPENACC);
+      tgt->list[0].key->refcount = REFCOUNT_INFINITY;
     }
 
   gomp_mutex_lock (&acc_dev->lock);
@@ -494,6 +495,9 @@ 
 		  (void *) n->host_start, (int) host_size, (void *) h);
     }
 
+  /* Mark for removal.  */
+  n->refcount = 1;
+
   t = n->tgt;
 
   if (t->refcount == 2)
@@ -583,6 +587,11 @@ 
 	  gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s);
 	}
 
+      if (n->refcount != REFCOUNT_INFINITY)
+	{
+	  n->refcount++;
+	  n->dynamic_refcount++;
+	}
       gomp_mutex_unlock (&acc_dev->lock);
     }
   else if (!(f & FLAG_CREATE))
@@ -609,6 +618,8 @@ 
 
       tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true,
 			   GOMP_MAP_VARS_OPENACC);
+      /* Initialize dynamic refcount.  */
+      tgt->list[0].key->dynamic_refcount = 1;
 
       if (async > acc_async_sync)
 	acc_dev->openacc.async_set_async_func (acc_async_sync);
@@ -678,7 +689,8 @@ 
 }
 #endif
 
-#define FLAG_COPYOUT (1 << 0)
+#define FLAG_COPYOUT  (1 << 0)
+#define FLAG_FINALIZE (1 << 1)
 
 static void
 delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
@@ -729,22 +741,58 @@ 
 		  (void *) n->host_start, (int) host_size, (void *) h, (int) s);
     }
 
-  gomp_mutex_unlock (&acc_dev->lock);
+  if (n->refcount == REFCOUNT_INFINITY)
+    {
+      n->refcount = 0;
+      n->dynamic_refcount = 0;
+    }
+  if (n->refcount < n->dynamic_refcount)
+    {
+      gomp_mutex_unlock (&acc_dev->lock);
+      gomp_fatal ("Dynamic reference counting assert fail\n");
+    }
 
-  if (async > acc_async_sync)
-    acc_dev->openacc.async_set_async_func (async);
+  if (f & FLAG_FINALIZE)
+    {
+      n->refcount -= n->dynamic_refcount;
+      n->dynamic_refcount = 0;
+    }
+  else if (n->dynamic_refcount)
+    {
+      n->dynamic_refcount--;
+      n->refcount--;
+    }
 
-  if (f & FLAG_COPYOUT)
-    acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
+  if (n->refcount == 0)
+    {
+      if (n->tgt->refcount == 2)
+	{
+	  struct target_mem_desc *tp, *t;
+	  for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
+	       tp = t, t = t->prev)
+	    if (n->tgt == t)
+	      {
+		if (tp)
+		  tp->prev = t->prev;
+		else
+		  acc_dev->openacc.data_environ = t->prev;
+		break;
+	      }
+	}
 
-  acc_unmap_data (h);
+      if (f & FLAG_COPYOUT)
+	{
+	  if (async > acc_async_sync)
+	    acc_dev->openacc.async_set_async_func (async);
+	  acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
+	  if (async > acc_async_sync)
+	    acc_dev->openacc.async_set_async_func (acc_async_sync);
+	}
+      gomp_remove_var (acc_dev, n);
+    }
 
-  if (async > acc_async_sync)
-    acc_dev->openacc.async_set_async_func (acc_async_sync);
+  gomp_mutex_unlock (&acc_dev->lock);
 
-  if (!acc_dev->free_func (acc_dev->target_id, d))
-    gomp_fatal ("error in freeing device memory in %s", libfnname);
-
   if (profiling_setup_p)
     {
       thr->prof_info = NULL;
@@ -765,6 +813,18 @@ 
 }
 
 void
+acc_delete_finalize (void *h , size_t s)
+{
+  delete_copyout (FLAG_FINALIZE, h, s, acc_async_sync, __FUNCTION__);
+}
+
+void
+acc_delete_finalize_async (void *h , size_t s, int async)
+{
+  delete_copyout (FLAG_FINALIZE, h, s, async, __FUNCTION__);
+}
+
+void
 acc_copyout (void *h, size_t s)
 {
   delete_copyout (FLAG_COPYOUT, h, s, acc_async_sync, __FUNCTION__);
@@ -776,6 +836,19 @@ 
   delete_copyout (FLAG_COPYOUT, h, s, async, __FUNCTION__);
 }
 
+void
+acc_copyout_finalize (void *h, size_t s)
+{
+  delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, acc_async_sync,
+		  __FUNCTION__);
+}
+
+void
+acc_copyout_finalize_async (void *h, size_t s, int async)
+{
+  delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, async, __FUNCTION__);
+}
+
 static void
 update_dev_host (int is_dev, void *h, size_t s, int async)
 {
@@ -895,11 +968,37 @@ 
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
 
+  if (acc_is_present (*hostaddrs, *sizes))
+    {
+      splay_tree_key n;
+      gomp_mutex_lock (&acc_dev->lock);
+      n = lookup_host (acc_dev, *hostaddrs, *sizes);
+      gomp_mutex_unlock (&acc_dev->lock);
+
+      tgt = n->tgt;
+      for (size_t i = 0; i < tgt->list_count; i++)
+	if (tgt->list[i].key == n)
+	  {
+	    for (size_t j = 0; j < mapnum; j++)
+	      if (i + j < tgt->list_count && tgt->list[i + j].key)
+		{
+		  tgt->list[i + j].key->refcount++;
+		  tgt->list[i + j].key->dynamic_refcount++;
+		}
+	    return;
+	  }
+      /* Should not reach here.  */
+      gomp_fatal ("Dynamic refcount incrementing failed for pointer/pset");
+    }
+
   gomp_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
   tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs,
 		       NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
   gomp_debug (0, "  %s: mappings prepared\n", __FUNCTION__);
 
+  /* Initialize dynamic refcount.  */
+  tgt->list[0].key->dynamic_refcount = 1;
+
   gomp_mutex_lock (&acc_dev->lock);
   tgt->prev = acc_dev->openacc.data_environ;
   acc_dev->openacc.data_environ = tgt;
@@ -907,7 +1006,8 @@ 
 }
 
 void
-gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum)
+gomp_acc_remove_pointer (void *h, size_t s, bool force_copyfrom, int async,
+			 int finalize, int mapnum)
 {
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
@@ -915,6 +1015,9 @@ 
   struct target_mem_desc *t;
   int minrefs = (mapnum == 1) ? 2 : 3;
 
+  if (!acc_is_present (h, s))
+    return;
+
   gomp_mutex_lock (&acc_dev->lock);
 
   n = lookup_host (acc_dev, h, 1);
@@ -929,37 +1032,64 @@ 
 
   t = n->tgt;
 
-  struct target_mem_desc *tp;
+  if (n->refcount < n->dynamic_refcount)
+    {
+      gomp_mutex_unlock (&acc_dev->lock);
+      gomp_fatal ("Dynamic reference counting assert fail\n");
+    }
 
-  if (t->refcount == minrefs)
+  if (finalize)
     {
-      /* This is the last reference, so pull the descriptor off the
-	 chain. This pevents gomp_unmap_vars via gomp_unmap_tgt from
-	 freeing the device memory. */
+      n->refcount -= n->dynamic_refcount;
+      n->dynamic_refcount = 0;
+    }
+  else if (n->dynamic_refcount)
+    {
+      n->dynamic_refcount--;
+      n->refcount--;
+    }
 
-      for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
-	   tp = t, t = t->prev)
+  gomp_mutex_unlock (&acc_dev->lock);
+
+  if (n->refcount == 0)
+    {
+      if (t->refcount == minrefs)
 	{
-	  if (n->tgt == t)
+	  /* This is the last reference, so pull the descriptor off the
+	     chain. This prevents gomp_unmap_vars via gomp_unmap_tgt from
+	     freeing the device memory. */
+	  struct target_mem_desc *tp;
+	  for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
+	       tp = t, t = t->prev)
 	    {
-	      if (tp)
-		tp->prev = t->prev;
-	      else
-		acc_dev->openacc.data_environ = t->prev;
-	      break;
+	      if (n->tgt == t)
+		{
+		  if (tp)
+		    tp->prev = t->prev;
+		  else
+		    acc_dev->openacc.data_environ = t->prev;
+		  break;
+		}
 	    }
 	}
+
+      /* Set refcount to 1 to allow gomp_unmap_vars to unmap it.  */
+      n->refcount = 1;
+      t->refcount = minrefs;
+      for (size_t i = 0; i < t->list_count; i++)
+	if (t->list[i].key == n)
+	  {
+	    t->list[i].copy_from = force_copyfrom ? 1 : 0;
+	    break;
+	  }
+      if (async > acc_async_sync)
+	acc_dev->openacc.async_set_async_func (async);
+      gomp_unmap_vars (t, true);
+      if (async > acc_async_sync)
+	acc_dev->openacc.async_set_async_func (acc_async_sync);
     }
 
-  t->list[0].copy_from = force_copyfrom ? 1 : 0;
-
   gomp_mutex_unlock (&acc_dev->lock);
 
-  /* If running synchronously, unmap immediately.  */
-  if (async < acc_async_noval)
-    gomp_unmap_vars (t, true);
-  else
-    t->device_descr->openacc.register_async_cleanup_func (t, async);
-
   gomp_debug (0, "  %s: mappings restored\n", __FUNCTION__);
 }
Index: libgomp/openacc.f90
===================================================================
--- libgomp/openacc.f90	(revision 248095)
+++ libgomp/openacc.f90	(revision 248096)
@@ -233,6 +233,24 @@ 
       type (*), dimension (..), contiguous :: a
     end subroutine
 
+    subroutine acc_copyout_finalize_32_h (a, len)
+      use iso_c_binding, only: c_int32_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+    end subroutine
+
+    subroutine acc_copyout_finalize_64_h (a, len)
+      use iso_c_binding, only: c_int64_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+    end subroutine
+
+    subroutine acc_copyout_finalize_array_h (a)
+      type (*), dimension (..), contiguous :: a
+    end subroutine
+
     subroutine acc_delete_32_h (a, len)
       use iso_c_binding, only: c_int32_t
       !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
@@ -251,6 +269,24 @@ 
       type (*), dimension (..), contiguous :: a
     end subroutine
 
+    subroutine acc_delete_finalize_32_h (a, len)
+      use iso_c_binding, only: c_int32_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+    end subroutine
+
+    subroutine acc_delete_finalize_64_h (a, len)
+      use iso_c_binding, only: c_int64_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+    end subroutine
+
+    subroutine acc_delete_finalize_array_h (a)
+      type (*), dimension (..), contiguous :: a
+    end subroutine
+
     subroutine acc_update_device_32_h (a, len)
       use iso_c_binding, only: c_int32_t
       !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
@@ -380,6 +416,30 @@ 
       integer (acc_handle_kind) async
     end subroutine
 
+    subroutine acc_copyout_finalize_async_32_h (a, len, async)
+      use iso_c_binding, only: c_int32_t
+      use openacc_kinds, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_copyout_finalize_async_64_h (a, len, async)
+      use iso_c_binding, only: c_int64_t
+      use openacc_kinds, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_copyout_finalize_async_array_h (a, async)
+      use openacc_kinds, only: acc_handle_kind
+      type (*), dimension (..), contiguous :: a
+      integer (acc_handle_kind) async
+    end subroutine
+
     subroutine acc_delete_async_32_h (a, len, async)
       use iso_c_binding, only: c_int32_t
       use openacc_kinds, only: acc_handle_kind
@@ -404,6 +464,30 @@ 
       integer (acc_handle_kind) async
     end subroutine
 
+    subroutine acc_delete_finalize_async_32_h (a, len, async)
+      use iso_c_binding, only: c_int32_t
+      use openacc_kinds, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_delete_finalize_async_64_h (a, len, async)
+      use iso_c_binding, only: c_int64_t
+      use openacc_kinds, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_delete_finalize_async_array_h (a, async)
+      use openacc_kinds, only: acc_handle_kind
+      type (*), dimension (..), contiguous :: a
+      integer (acc_handle_kind) async
+    end subroutine
+
     subroutine acc_update_device_async_32_h (a, len, async)
       use iso_c_binding, only: c_int32_t
       use openacc_kinds, only: acc_handle_kind
@@ -581,6 +665,14 @@ 
       integer (c_size_t), value :: len
     end subroutine
 
+    subroutine acc_copyout_finalize_l (a, len) &
+        bind (C, name = "acc_copyout_finalize")
+      use iso_c_binding, only: c_size_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
     subroutine acc_delete_l (a, len) &
         bind (C, name = "acc_delete")
       use iso_c_binding, only: c_size_t
@@ -589,6 +681,14 @@ 
       integer (c_size_t), value :: len
     end subroutine
 
+    subroutine acc_delete_finalize_l (a, len) &
+        bind (C, name = "acc_delete_finalize")
+      use iso_c_binding, only: c_size_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
     subroutine acc_update_device_l (a, len) &
         bind (C, name = "acc_update_device")
       use iso_c_binding, only: c_size_t
@@ -641,6 +741,15 @@ 
       integer (c_int), value :: async
     end subroutine
 
+    subroutine acc_copyout_finalize_async_l (a, len, async) &
+        bind (C, name = "acc_copyout_finalize_async")
+      use iso_c_binding, only: c_size_t, c_int
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+      integer (c_int), value :: async
+    end subroutine
+
     subroutine acc_delete_async_l (a, len, async) &
         bind (C, name = "acc_delete_async")
       use iso_c_binding, only: c_size_t, c_int
@@ -650,6 +759,15 @@ 
       integer (c_int), value :: async
     end subroutine
 
+    subroutine acc_delete_finalize_async_l (a, len, async) &
+        bind (C, name = "acc_delete_finalize_async")
+      use iso_c_binding, only: c_size_t, c_int
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+      integer (c_int), value :: async
+    end subroutine
+
     subroutine acc_update_device_async_l (a, len, async) &
         bind (C, name = "acc_update_device_async")
       use iso_c_binding, only: c_size_t, c_int
@@ -806,6 +924,12 @@ 
     procedure :: acc_copyout_array_h
   end interface
 
+  interface acc_copyout_finalize
+    procedure :: acc_copyout_finalize_32_h
+    procedure :: acc_copyout_finalize_64_h
+    procedure :: acc_copyout_finalize_array_h
+  end interface
+
   interface acc_delete
     procedure :: acc_delete_32_h
     procedure :: acc_delete_64_h
@@ -812,6 +936,12 @@ 
     procedure :: acc_delete_array_h
   end interface
 
+  interface acc_delete_finalize
+    procedure :: acc_delete_finalize_32_h
+    procedure :: acc_delete_finalize_64_h
+    procedure :: acc_delete_finalize_array_h
+  end interface
+
   interface acc_update_device
     procedure :: acc_update_device_32_h
     procedure :: acc_update_device_64_h
@@ -856,6 +986,12 @@ 
     procedure :: acc_copyout_async_array_h
   end interface
 
+  interface acc_copyout_finalize_async
+    procedure :: acc_copyout_finalize_async_32_h
+    procedure :: acc_copyout_finalize_async_64_h
+    procedure :: acc_copyout_finalize_async_array_h
+  end interface
+
   interface acc_delete_async
     procedure :: acc_delete_async_32_h
     procedure :: acc_delete_async_64_h
@@ -862,6 +998,12 @@ 
     procedure :: acc_delete_async_array_h
   end interface
 
+  interface acc_delete_finalize_async
+    procedure :: acc_delete_finalize_async_32_h
+    procedure :: acc_delete_finalize_async_64_h
+    procedure :: acc_delete_finalize_async_array_h
+  end interface
+
   interface acc_update_device_async
     procedure :: acc_update_device_async_32_h
     procedure :: acc_update_device_async_64_h
@@ -1104,6 +1246,30 @@ 
   call acc_copyout_l (a, sizeof (a))
 end subroutine
 
+subroutine acc_copyout_finalize_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal, only: acc_copyout_finalize_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_copyout_finalize_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyout_finalize_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal, only: acc_copyout_finalize_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_copyout_finalize_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyout_finalize_array_h (a)
+  use openacc_internal, only: acc_copyout_finalize_l
+  type (*), dimension (..), contiguous :: a
+  call acc_copyout_finalize_l (a, sizeof (a))
+end subroutine
+
 subroutine acc_delete_32_h (a, len)
   use iso_c_binding, only: c_int32_t, c_size_t
   use openacc_internal, only: acc_delete_l
@@ -1128,6 +1294,30 @@ 
   call acc_delete_l (a, sizeof (a))
 end subroutine
 
+subroutine acc_delete_finalize_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal, only: acc_delete_finalize_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_delete_finalize_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_delete_finalize_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal, only: acc_delete_finalize_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_delete_finalize_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_delete_finalize_array_h (a)
+  use openacc_internal, only: acc_delete_finalize_l
+  type (*), dimension (..), contiguous :: a
+  call acc_delete_finalize_l (a, sizeof (a))
+end subroutine
+
 subroutine acc_update_device_32_h (a, len)
   use iso_c_binding, only: c_int32_t, c_size_t
   use openacc_internal, only: acc_update_device_l
@@ -1304,6 +1494,37 @@ 
   call acc_copyout_async_l (a, sizeof (a), int (async, kind = c_int))
 end subroutine
 
+subroutine acc_copyout_finalize_async_32_h (a, len, async)
+  use iso_c_binding, only: c_int32_t, c_size_t, c_int
+  use openacc_internal, only: acc_copyout_finalize_async_l
+  use openacc_kinds, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  integer (acc_handle_kind) async
+  call acc_copyout_finalize_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_copyout_finalize_async_64_h (a, len, async)
+  use iso_c_binding, only: c_int64_t, c_size_t, c_int
+  use openacc_internal, only: acc_copyout_finalize_async_l
+  use openacc_kinds, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  integer (acc_handle_kind) async
+  call acc_copyout_finalize_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_copyout_finalize_async_array_h (a, async)
+  use iso_c_binding, only: c_int
+  use openacc_internal, only: acc_copyout_finalize_async_l
+  use openacc_kinds, only: acc_handle_kind
+  type (*), dimension (..), contiguous :: a
+  integer (acc_handle_kind) async
+  call acc_copyout_finalize_async_l (a, sizeof (a), int (async, kind = c_int))
+end subroutine
+
 subroutine acc_delete_async_32_h (a, len, async)
   use iso_c_binding, only: c_int32_t, c_size_t, c_int
   use openacc_internal, only: acc_delete_async_l
@@ -1335,6 +1556,37 @@ 
   call acc_delete_async_l (a, sizeof (a), int (async, kind = c_int))
 end subroutine
 
+subroutine acc_delete_finalize_async_32_h (a, len, async)
+  use iso_c_binding, only: c_int32_t, c_size_t, c_int
+  use openacc_internal, only: acc_delete_finalize_async_l
+  use openacc_kinds, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  integer (acc_handle_kind) async
+  call acc_delete_finalize_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_delete_finalize_async_64_h (a, len, async)
+  use iso_c_binding, only: c_int64_t, c_size_t, c_int
+  use openacc_internal, only: acc_delete_finalize_async_l
+  use openacc_kinds, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  integer (acc_handle_kind) async
+  call acc_delete_finalize_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_delete_finalize_async_array_h (a, async)
+  use iso_c_binding, only: c_int
+  use openacc_internal, only: acc_delete_finalize_async_l
+  use openacc_kinds, only: acc_handle_kind
+  type (*), dimension (..), contiguous :: a
+  integer (acc_handle_kind) async
+  call acc_delete_finalize_async_l (a, sizeof (a), int (async, kind = c_int))
+end subroutine
+
 subroutine acc_update_device_async_32_h (a, len, async)
   use iso_c_binding, only: c_int32_t, c_size_t, c_int
   use openacc_internal, only: acc_update_device_async_l
Index: libgomp/libgomp.h
===================================================================
--- libgomp/libgomp.h	(revision 248095)
+++ libgomp/libgomp.h	(revision 248096)
@@ -835,6 +835,8 @@ 
   uintptr_t tgt_offset;
   /* Reference count.  */
   uintptr_t refcount;
+  /* Dynamic reference count.  */
+  uintptr_t dynamic_refcount;
   /* Pointer to the original mapping of "omp declare target link" object.  */
   splay_tree_key link_key;
 };
@@ -973,7 +975,7 @@ 
 };
 
 extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *);
-extern void gomp_acc_remove_pointer (void *, bool, int, int);
+extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int);
 extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
 				       unsigned short *);
 
@@ -985,6 +987,7 @@ 
 extern void gomp_init_device (struct gomp_device_descr *);
 extern void gomp_unload_device (struct gomp_device_descr *);
 extern bool gomp_offload_target_available_p (int);
+extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key);
 
 /* work.c */
 
Index: libgomp/openacc_lib.h
===================================================================
--- libgomp/openacc_lib.h	(revision 248095)
+++ libgomp/openacc_lib.h	(revision 248096)
@@ -303,6 +303,26 @@ 
         end subroutine
       end interface
 
+      interface acc_copyout_finalize
+        subroutine acc_copyout_finalize_32_h (a, len)
+          use iso_c_binding, only: c_int32_t
+          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+          type (*), dimension (*) :: a
+          integer (c_int32_t) len
+        end subroutine
+
+        subroutine acc_copyout_finalize_64_h (a, len)
+          use iso_c_binding, only: c_int64_t
+          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+          type (*), dimension (*) :: a
+          integer (c_int64_t) len
+        end subroutine
+
+        subroutine acc_copyout_finalize_array_h (a)
+          type (*), dimension (..), contiguous :: a
+        end subroutine
+      end interface
+
       interface acc_delete
         subroutine acc_delete_32_h (a, len)
           use iso_c_binding, only: c_int32_t
@@ -323,6 +343,26 @@ 
         end subroutine
       end interface
 
+      interface acc_delete_finalize
+        subroutine acc_delete_finalize_32_h (a, len)
+          use iso_c_binding, only: c_int32_t
+          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+          type (*), dimension (*) :: a
+          integer (c_int32_t) len
+        end subroutine
+
+        subroutine acc_delete_finalize_64_h (a, len)
+          use iso_c_binding, only: c_int64_t
+          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+          type (*), dimension (*) :: a
+          integer (c_int64_t) len
+        end subroutine
+
+        subroutine acc_delete_finalize_array_h (a)
+          type (*), dimension (..), contiguous :: a
+        end subroutine
+      end interface
+
       interface acc_update_device
         subroutine acc_update_device_32_h (a, len)
           use iso_c_binding, only: c_int32_t
@@ -472,6 +512,32 @@ 
         end subroutine
       end interface
 
+      interface acc_copyout_finalize_async
+        subroutine acc_copyout_finalize_async_32_h (a, len, async)
+          use iso_c_binding, only: c_int32_t
+          import acc_handle_kind
+          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+          type (*), dimension (*) :: a
+          integer (c_int32_t) len
+          integer (acc_handle_kind) async
+        end subroutine
+
+        subroutine acc_copyout_finalize_async_64_h (a, len, async)
+          use iso_c_binding, only: c_int64_t
+          import acc_handle_kind
+          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+          type (*), dimension (*) :: a
+          integer (c_int64_t) len
+          integer (acc_handle_kind) async
+        end subroutine
+
+        subroutine acc_copyout_finalize_async_array_h (a, async_)
+          import acc_handle_kind
+          type (*), dimension (..), contiguous :: a
+          integer (acc_handle_kind) async_
+        end subroutine
+      end interface
+
       interface acc_delete_async
         subroutine acc_delete_async_32_h (a, len, async)
           use iso_c_binding, only: c_int32_t
@@ -498,6 +564,32 @@ 
         end subroutine
       end interface
 
+      interface acc_delete_finalize_async
+        subroutine acc_delete_finalize_async_32_h (a, len, async)
+          use iso_c_binding, only: c_int32_t
+          import acc_handle_kind
+          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+          type (*), dimension (*) :: a
+          integer (c_int32_t) len
+          integer (acc_handle_kind) async
+        end subroutine
+
+        subroutine acc_delete_finalize_async_64_h (a, len, async)
+          use iso_c_binding, only: c_int64_t
+          import acc_handle_kind
+          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+          type (*), dimension (*) :: a
+          integer (c_int64_t) len
+          integer (acc_handle_kind) async
+        end subroutine
+
+        subroutine acc_delete_finalize_async_array_h (a, async_)
+          import acc_handle_kind
+          type (*), dimension (..), contiguous :: a
+          integer (acc_handle_kind) async_
+        end subroutine
+      end interface
+
       interface acc_update_device_async
         subroutine acc_update_device_async_32_h (a, len, async)
           use iso_c_binding, only: c_int32_t
Index: gcc/c-family/c-pragma.h
===================================================================
--- gcc/c-family/c-pragma.h	(revision 248095)
+++ gcc/c-family/c-pragma.h	(revision 248096)
@@ -157,6 +157,7 @@ 
   PRAGMA_OACC_CLAUSE_DEVICEPTR,
   PRAGMA_OACC_CLAUSE_DEVICE_RESIDENT,
   PRAGMA_OACC_CLAUSE_DEVICE_TYPE,
+  PRAGMA_OACC_CLAUSE_FINALIZE,
   PRAGMA_OACC_CLAUSE_GANG,
   PRAGMA_OACC_CLAUSE_HOST,
   PRAGMA_OACC_CLAUSE_INDEPENDENT,
Index: gcc/c/c-parser.c
===================================================================
--- gcc/c/c-parser.c	(revision 248095)
+++ gcc/c/c-parser.c	(revision 248096)
@@ -10375,6 +10375,8 @@ 
 	case 'f':
 	  if (!strcmp ("final", p))
 	    result = PRAGMA_OMP_CLAUSE_FINAL;
+	  else if (!strcmp ("finalize", p))
+	    result = PRAGMA_OACC_CLAUSE_FINALIZE;
 	  else if (!strcmp ("firstprivate", p))
 	    result = PRAGMA_OMP_CLAUSE_FIRSTPRIVATE;
 	  else if (!strcmp ("from", p))
@@ -11693,8 +11695,9 @@ 
   return list;
 }
 
-/* OpenACC:
+/* OpenACC 2.5:
    auto
+   finalize
    independent
    nohost
    seq */
@@ -13171,6 +13174,11 @@ 
 	  c_name = "device_type";
 	  seen_dtype = true;
 	  break;
+	case PRAGMA_OACC_CLAUSE_FINALIZE:
+	  clauses = c_parser_oacc_simple_clause (parser, here,
+						 OMP_CLAUSE_FINALIZE, clauses);
+	  c_name = "finalize";
+	  break;
 	case PRAGMA_OACC_CLAUSE_FIRSTPRIVATE:
 	  clauses = c_parser_omp_clause_firstprivate (parser, clauses);
 	  c_name = "firstprivate";
@@ -13816,6 +13824,7 @@ 
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DELETE) 		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_FINALIZE) 		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
 
 static void
Index: gcc/c/c-typeck.c
===================================================================
--- gcc/c/c-typeck.c	(revision 248095)
+++ gcc/c/c-typeck.c	(revision 248096)
@@ -13397,6 +13397,7 @@ 
 	case OMP_CLAUSE_NOHOST:
 	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE_IF_PRESENT:
+	case OMP_CLAUSE_FINALIZE:
 	  pc = &OMP_CLAUSE_CHAIN (c);
 	  continue;
 
Index: gcc/tree.c
===================================================================
--- gcc/tree.c	(revision 248095)
+++ gcc/tree.c	(revision 248096)
@@ -331,7 +331,8 @@ 
   3, /* OMP_CLAUSE_TILE  */
   2, /* OMP_CLAUSE__GRIDDIM_  */
   0, /* OMP_CLAUSE_IF_PRESENT */
-  2  /* OMP_CLAUSE_DEVICE_TYPE */
+  2, /* OMP_CLAUSE_DEVICE_TYPE */
+  0  /* OMP_CLAUSE_FINALIZE  */
 };
 
 const char * const omp_clause_code_name[] =
@@ -406,7 +407,8 @@ 
   "tile",
   "_griddim_",
   "if_present",
-  "device_type"
+  "device_type",
+  "finalize"
 };
 
 
@@ -11723,6 +11725,7 @@ 
 	case OMP_CLAUSE_NOHOST:
 	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE_IF_PRESENT:
+	case OMP_CLAUSE_FINALIZE:
 	  WALK_SUBTREE_TAIL (OMP_CLAUSE_CHAIN (*tp));
 
 	case OMP_CLAUSE_DEVICE_TYPE:
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 248095)
+++ gcc/omp-low.c	(revision 248096)
@@ -2431,6 +2431,7 @@ 
 	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE_IF_PRESENT:
 	case OMP_CLAUSE_DEVICE_TYPE:
+	case OMP_CLAUSE_FINALIZE:
 	  break;
 
 	case OMP_CLAUSE_ALIGNED:
@@ -2606,6 +2607,7 @@ 
 	case OMP_CLAUSE__GRIDDIM_:
 	case OMP_CLAUSE_IF_PRESENT:
 	case OMP_CLAUSE_DEVICE_TYPE:
+	case OMP_CLAUSE_FINALIZE:
 	  break;
 
 	case OMP_CLAUSE_BIND:
@@ -14216,6 +14218,13 @@ 
 	if (t_async)
 	  args.safe_push (t_async);
 
+	if (start_ix == BUILT_IN_GOACC_ENTER_EXIT_DATA)
+	  {
+	    c = find_omp_clause (clauses, OMP_CLAUSE_FINALIZE);
+	    tree t_finalize = c ? integer_one_node : integer_zero_node;
+	    args.safe_push (t_finalize);
+	  }
+
 	/* Save the argument index, and ... */
 	unsigned t_wait_idx = args.length ();
 	unsigned num_waits = 0;
Index: gcc/cp/semantics.c
===================================================================
--- gcc/cp/semantics.c	(revision 248095)
+++ gcc/cp/semantics.c	(revision 248096)
@@ -7107,6 +7107,7 @@ 
 	case OMP_CLAUSE_BIND:
 	case OMP_CLAUSE_NOHOST:
 	case OMP_CLAUSE_IF_PRESENT:
+	case OMP_CLAUSE_FINALIZE:
 	  break;
 
 	case OMP_CLAUSE_TILE:
Index: gcc/cp/parser.c
===================================================================
--- gcc/cp/parser.c	(revision 248095)
+++ gcc/cp/parser.c	(revision 248096)
@@ -29815,6 +29815,8 @@ 
 	case 'f':
 	  if (!strcmp ("final", p))
 	    result = PRAGMA_OMP_CLAUSE_FINAL;
+	  else if (!strcmp ("finalize", p))
+	    result = PRAGMA_OACC_CLAUSE_FINALIZE;
 	  else if (!strcmp ("firstprivate", p))
 	    result = PRAGMA_OMP_CLAUSE_FIRSTPRIVATE;
 	  else if (!strcmp ("from", p))
@@ -30275,8 +30277,9 @@ 
   return list;
 }
 
-/* OpenACC 2.0:
+/* OpenACC 2.5:
    auto
+   finalize
    independent
    nohost
    seq */
@@ -32390,6 +32393,11 @@ 
 	  c_name = "device_type";
 	  seen_dtype = true;
 	  break;
+	case PRAGMA_OACC_CLAUSE_FINALIZE:
+	  clauses = cp_parser_oacc_simple_clause (parser, OMP_CLAUSE_FINALIZE,
+						  clauses, here);
+	  c_name = "finalize";
+	  break;
 	case PRAGMA_OACC_CLAUSE_FIRSTPRIVATE:
 	  clauses = cp_parser_omp_var_list (parser, OMP_CLAUSE_FIRSTPRIVATE,
 					    clauses);
@@ -35582,6 +35590,7 @@ 
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DELETE) 		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_FINALIZE) 		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
 
 static tree
Index: gcc/fortran/openmp.c
===================================================================
--- gcc/fortran/openmp.c	(revision 248095)
+++ gcc/fortran/openmp.c	(revision 248096)
@@ -835,6 +835,7 @@ 
   OMP_CLAUSE_NOHOST,
   OMP_CLAUSE_IF_PRESENT,
   OMP_CLAUSE_DEVICE_TYPE,
+  OMP_CLAUSE_FINALIZE,
   /* This must come last.  */
   OMP_MASK2_LAST
 };
@@ -1304,6 +1305,14 @@ 
 	      && c->final_expr == NULL
 	      && gfc_match ("final ( %e )", &c->final_expr) == MATCH_YES)
 	    continue;
+	  if ((mask & OMP_CLAUSE_FINALIZE)
+	      && !c->finalize
+	      && gfc_match ("finalize") == MATCH_YES)
+	    {
+	      c->finalize = true;
+	      needs_space = true;
+	      continue;
+	    }
 	  if ((mask & OMP_CLAUSE_FIRSTPRIVATE)
 	      && gfc_match_omp_variable_list ("firstprivate (",
 					      &c->lists[OMP_LIST_FIRSTPRIVATE],
@@ -2081,7 +2090,7 @@ 
    | OMP_CLAUSE_COPYIN | OMP_CLAUSE_CREATE)
 #define OACC_EXIT_DATA_CLAUSES \
   (omp_mask (OMP_CLAUSE_IF) | OMP_CLAUSE_ASYNC | OMP_CLAUSE_WAIT	      \
-   | OMP_CLAUSE_COPYOUT | OMP_CLAUSE_DELETE)
+   | OMP_CLAUSE_COPYOUT | OMP_CLAUSE_DELETE | OMP_CLAUSE_FINALIZE)
 #define OACC_WAIT_CLAUSES \
   omp_mask (OMP_CLAUSE_ASYNC)
 #define OACC_ROUTINE_CLAUSES \
Index: gcc/fortran/trans-openmp.c
===================================================================
--- gcc/fortran/trans-openmp.c	(revision 248095)
+++ gcc/fortran/trans-openmp.c	(revision 248096)
@@ -2936,6 +2936,11 @@ 
       c = build_omp_clause (where.lb->location, OMP_CLAUSE_IF_PRESENT);
       omp_clauses = gfc_trans_add_clause (c, omp_clauses);
     }
+  if (clauses->finalize)
+    {
+      c = build_omp_clause (where.lb->location, OMP_CLAUSE_FINALIZE);
+      omp_clauses = gfc_trans_add_clause (c, omp_clauses);
+    }
   if (clauses->independent)
     {
       c = build_omp_clause (where.lb->location, OMP_CLAUSE_INDEPENDENT);
Index: gcc/fortran/gfortran.h
===================================================================
--- gcc/fortran/gfortran.h	(revision 248095)
+++ gcc/fortran/gfortran.h	(revision 248096)
@@ -1318,7 +1318,7 @@ 
   gfc_expr_list *tile_list;
   unsigned async:1, gang:1, worker:1, vector:1, seq:1, independent:1;
   unsigned wait:1, par_auto:1, gang_static:1, nohost:1, acc_collapse:1, bind:1;
-  unsigned if_present:1;
+  unsigned if_present:1, finalize:1;
   locus loc;
   char bind_name[GFC_MAX_SYMBOL_LEN+1];
 }
Index: gcc/gimplify.c
===================================================================
--- gcc/gimplify.c	(revision 248095)
+++ gcc/gimplify.c	(revision 248096)
@@ -7669,6 +7669,7 @@ 
 	case OMP_CLAUSE_SIMD:
 	case OMP_CLAUSE_IF_PRESENT:
 	case OMP_CLAUSE_DEVICE_TYPE:
+	case OMP_CLAUSE_FINALIZE:
 	  break;
 
 	case OMP_CLAUSE_DEFAULTMAP:
@@ -8533,6 +8534,7 @@ 
 	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE_IF_PRESENT:
 	case OMP_CLAUSE_DEVICE_TYPE:
+	case OMP_CLAUSE_FINALIZE:
 	  break;
 
 	case OMP_CLAUSE_BIND:
Index: gcc/tree-core.h
===================================================================
--- gcc/tree-core.h	(revision 248095)
+++ gcc/tree-core.h	(revision 248096)
@@ -473,7 +473,10 @@ 
   OMP_CLAUSE_IF_PRESENT,
 
   /* OpenACC clause: device_type ( device-type-list).  */
-  OMP_CLAUSE_DEVICE_TYPE
+  OMP_CLAUSE_DEVICE_TYPE,
+
+  /* OpenACC clause: finalize.  */
+  OMP_CLAUSE_FINALIZE  
 };
 
 #undef DEFTREESTRUCT