diff mbox

[Bulk,OpenACC,0/7] host_data construct

Message ID 562A578E.4080907@codesourcery.com
State New
Headers show

Commit Message

James Norris Oct. 23, 2015, 3:51 p.m. UTC
Hi,

This a re-posting of the original note incorporating the suggestions
from Joseph and Nathan (thank you).

     This patch adds the processing of OpenACC host_data construct in C
     and C++. (Note: Support in Fortran is already in trunk.) The patch
     also adds the required support in the middle-end and libgomp.

     Background
         The host data construct is used to make an address of device
         data available on the host.

         The following illustrates use of the host data construct in
         conjunction with arrays which are already device-resident
         and an accelerator-only function.

                 int main(int argc, char **argv)
                 {
                   float *x, *y;
                   const int n = 1024;
                   int i;

                   x = (float*) malloc (n * sizeof(float));
                   y = (float*) malloc (n * sizeof(float));

                   /* Copy the arrays out to the device. */
                   #pragma acc data create(x[0:n]) copyout(y[0:n])
                   {
                     #pragma acc parallel
                     {
                       for (i = 0; i < n; i++)
                         {
                           x[i] = 1.0f;
                           y[i] = 0.0f;
                         }
                     }

                     /*
                      * The arrays are already on the device, so
                      * pass the device addresses to saxpy. NOTE:
                      * saxpy has been previously defined as an
                      * accelerator function.
                      */
                     #pragma acc host_data use_device(x, y)
                     {
                       saxpy(n, 2.0, x, 1, y, 1);
                     }
                   }

                   fprintf(stdout, "y[0] = %f\n", y[0]);
                   return 0;
                 }


     C and C++ front-ends

         Definitions for use by C and C++ were added to identify the
         host_data construct pragma and its' only valid clause: use_device.

         New functionality was added to do the parsing of the host_data
         pragma and validate the sole clause valid clause: use_device.
         As the host_data construct has associated with it a structured
         block, new functionality was added to build the compound
         statement to represent the block.

     Middle-end

         A gimple definition: GOVD_USE_DEVICE, has been added to indicate
         the use of the use_device clause. This flag is asserted as part
         of installing mappings into a omp context. The flag is subsequently
         reacted to during the gimplying of the host_data region's body.
         When this flag is encountered, an GOACC_deviceptr builtin call
         is inserted at the appropriate place.

     libgomp

         A new function has been added to handle pointer lookup for host
         data regions. As the comment in the code describes, this function
         will return the appropriate address based on whether it is called
         for the host or the target. This function is used in response to
         usage of the use_device clause.

     Tests

         New compile and runtime tests have been added.

     All of the code is in the gomp-4_0-branch.

     Regtested on x86_64-linux.

     Thanks!
     Jim
2015-10-23  Julian Brown  <julian@codesourcery.com>
	    James Norris  <jnorris@codesourcery.com>

	gcc/c-family/	
	* c-pragma.c (oacc_pragmas): Add host_data pragma definition.
	* c-pragma.h (enum pragma_kind): Add PRAGMA_OACC_HOST_DATA.
	(enum pragma_omp_clause): Add PRAGMA_OACC_CLAUSE_USE_DEVICE.

	gcc/c/
	* c-parser.c (c_parser_omp_clause_name): Add handling of use_device
	clause.
	(c_parser_oacc_clause_use_device): New function.
	(c_parser_oacc_all_clauses): Handle PRAGMA_OACC_CLAUSE_USE_DEVICE.
	(OACC_HOST_DATA_CLAUSE_MASK): New definition.
	(c_parser_oacc_host_data): New function.
        (c_parser_omp_construct): Handle PRAGMA_OACC_HOST_DATA.
	* c-tree.h: Add definition for c_finish_oacc_host_data.
	* c-typeck.c (c_finish_oacc_host_data): New function.

	gcc/cp/
	* cp-tree.h (finish_oacc_host_data): New function.
	* parser.c (cp_parser_omp_clause_name): Add handling of use_device
	clause.
	(cp_parser_oacc_all_clauses): Handle PRAGMA_OACC_CLAUSE_USE_DEVICE.
	(OACC_HOST_DATA_CLAUSE_MASK): New definition.
	(cp_parser_oacc_host_data): New function.
	(cp_parser_omp_construct): Handle PRAGMA_OACC_HOST_DATA.
	(cp_parser_pragma): Handle PRAGMA_OACC_HOST_DATA.
	* semantics.c (finish_omp_clauses): Hnadle OMP_CLAUSE_USE_DEVICE.
	(finish_oacc_host_data): New function.

	gcc/
	* gimplify.c (enum gimplify_omp_var_data): Add GOVD_USE_DEVICE.
	(enum omp_region_type): Add ORT_HOST_DATA.
	(gimplify_scan_omp_clauses): Adjust handling of OMP_CLAUSE_USE_DEVICE.
	(gimpify_host_data, gimplify_host_data_1): New functions.
	(gimplify_expr): Handle OACC_HOST_DATA.
	* omp-builtins.def (BUILT_IN_GOACC_DEVICEPTR): New builtin.

	gcc/testsuite/
	* c-c++-common/goacc/host_data-1.c: New test.
	* c-c++-common/goacc/host_data-2.c: Likewise.
	* c-c++-common/goacc/host_data-3.c: Likewise.
	* c-c++-common/goacc/host_data-4.c: Likewise.

	libgomp/
	* libgomp.map (GOACC_2.0): Add GOACC_deviceptr.
	* oacc-mem.c (GOACC_deviceptr): New function.
	* testsuite/libgomp.oacc-c-c++-common/host_data-1.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/host_data-2.c: Likewise.

Comments

Jakub Jelinek Oct. 26, 2015, 6:34 p.m. UTC | #1
On Fri, Oct 23, 2015 at 10:51:42AM -0500, James Norris wrote:
> @@ -12942,6 +12961,7 @@ c_finish_omp_clauses (tree clauses, bool is_omp, bool declare_simd)
>  	case OMP_CLAUSE_GANG:
>  	case OMP_CLAUSE_WORKER:
>  	case OMP_CLAUSE_VECTOR:
> +	case OMP_CLAUSE_USE_DEVICE:
>  	  pc = &OMP_CLAUSE_CHAIN (c);
>  	  continue;
>  

Are there any restrictions on whether you can specify the same var multiple
times in use_device clause?
#pragma acc host_data use_device (x) use_device (x) use_device (y, y, y)
?
If not, have you verified that the gimplifier doesn't ICE on it?  Generally
it doesn't like the same var being mentioned multiple times.
If yes, you can use e.g. the generic_head bitmap for that and in any case,
cover that with sufficient testsuite coverage.

> diff --git a/gcc/gimplify.c b/gcc/gimplify.c
> index ab9e540..0c32219 100644
> --- a/gcc/gimplify.c
> +++ b/gcc/gimplify.c
> @@ -93,6 +93,8 @@ enum gimplify_omp_var_data
>  
>    GOVD_MAP_0LEN_ARRAY = 32768,
>  
> +  GOVD_USE_DEVICE = 65536,
> +
>    GOVD_DATA_SHARE_CLASS = (GOVD_SHARED | GOVD_PRIVATE | GOVD_FIRSTPRIVATE
>  			   | GOVD_LASTPRIVATE | GOVD_REDUCTION | GOVD_LINEAR
>  			   | GOVD_LOCAL)
> @@ -116,7 +118,9 @@ enum omp_region_type
>    ORT_COMBINED_TARGET = 33,
>    /* Dummy OpenMP region, used to disable expansion of
>       DECL_VALUE_EXPRs in taskloop pre body.  */
> -  ORT_NONE = 64
> +  ORT_NONE = 64,
> +  /* An OpenACC host-data region.  */
> +  ORT_HOST_DATA = 128

I'd prefer ORT_NONE to be the last one, can you just renumber it and put
ORT_HOST_DATA before it?

> +static tree
> +gimplify_oacc_host_data_1 (tree *tp, int *walk_subtrees,
> +			   void *data ATTRIBUTE_UNUSED)
> +{

Your use_device sounds very similar to use_device_ptr clause in OpenMP,
which is allowed on #pragma omp target data construct and is implemented
quite a bit differently from this; it is unclear if the OpenACC standard
requires this kind of implementation, or you just chose to implement it this
way.  In particular, the GOMP_target_data call puts the variables mentioned
in the use_device_ptr clauses into the mapping structures (similarly how
map clause appears) and the corresponding vars are privatized within the
target data region (which is a host region, basically a fancy { } braces),
where the private variables contain the offloading device's pointers.

> +  splay_tree_node n = NULL;
> +  location_t loc = EXPR_LOCATION (*tp);
> +
> +  switch (TREE_CODE (*tp))
> +    {
> +    case ADDR_EXPR:
> +      {
> +	tree decl = TREE_OPERAND (*tp, 0);
> +
> +	switch (TREE_CODE (decl))
> +	  {
> +	  case ARRAY_REF:
> +	  case ARRAY_RANGE_REF:
> +	  case COMPONENT_REF:
> +	  case VIEW_CONVERT_EXPR:
> +	  case REALPART_EXPR:
> +	  case IMAGPART_EXPR:
> +	    if (TREE_CODE (TREE_OPERAND (decl, 0)) == VAR_DECL)
> +	      n = splay_tree_lookup (gimplify_omp_ctxp->variables,
> +				     (splay_tree_key) TREE_OPERAND (decl, 0));
> +	    break;

I must say this looks really strange, you throw away all the offsets
embedded in the component codes (fixed or variable).
Where comes the above list?  What about other components (say bit field refs,
etc.)?

> +    case VAR_DECL:

What is so special about VAR_DECLs?  Shouldn't PARM_DECLs / RESULT_DECLs
be treated the same way?
> --- a/libgomp/libgomp.map
> +++ b/libgomp/libgomp.map
> @@ -378,6 +378,7 @@ GOACC_2.0 {
>  	GOACC_wait;
>  	GOACC_get_thread_num;
>  	GOACC_get_num_threads;
> +	GOACC_deviceptr;
>  };
>  
>  GOACC_2.0.1 {

You shouldn't be adding new symbols into a symbol version that appeared in a
compiler that shipped already (GCC 5 already had GOACC_2.0 symbols).
So it should go into GOACC_2.0.1.

> diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
> index af067d6..497ab92 100644
> --- a/libgomp/oacc-mem.c
> +++ b/libgomp/oacc-mem.c
> @@ -204,6 +204,38 @@ acc_deviceptr (void *h)
>    return d;
>  }
>  
> +/* This function is used as a helper in generated code to implement pointer
> +   lookup in host_data regions.  Unlike acc_deviceptr, it returns its argument
> +   unchanged on a shared-memory system (e.g. the host).  */
> +
> +void *
> +GOACC_deviceptr (void *h)
> +{
> +  splay_tree_key n;
> +  void *d;
> +  void *offset;
> +
> +  goacc_lazy_initialize ();
> +
> +  struct goacc_thread *thr = goacc_thread ();
> +
> +  if ((thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) == 0)
> +    {
> +      n = lookup_host (thr->dev, h, 1);

What is supposed to be the behavior when the h pointer points at object
boundary, rather than into the middle of existing mapped object?

Say you have:
  char a[16], b[0], c[16]; // b is GCC extension
Now, char *p = &a[5]; is unambiguous, either a is mapped, or not.
But, if p = &a[16];, then it could be either the one-past-last byte in a,
or it could be the start of b (== one-past-last byte in b) or it could be
the pointer to start of c.

In OpenMP 4.5, I had endless discussions about this and the end result is
that one-past-last byte addresses are unspecified behavior

	Jakub
Cesar Philippidis Oct. 27, 2015, 3:45 p.m. UTC | #2
On 10/26/2015 11:34 AM, Jakub Jelinek wrote:
> On Fri, Oct 23, 2015 at 10:51:42AM -0500, James Norris wrote:
>> @@ -12942,6 +12961,7 @@ c_finish_omp_clauses (tree clauses, bool is_omp, bool declare_simd)
>>  	case OMP_CLAUSE_GANG:
>>  	case OMP_CLAUSE_WORKER:
>>  	case OMP_CLAUSE_VECTOR:
>> +	case OMP_CLAUSE_USE_DEVICE:
>>  	  pc = &OMP_CLAUSE_CHAIN (c);
>>  	  continue;
>>  
> 
> Are there any restrictions on whether you can specify the same var multiple
> times in use_device clause?
> #pragma acc host_data use_device (x) use_device (x) use_device (y, y, y)
> ?
> If not, have you verified that the gimplifier doesn't ICE on it?  Generally
> it doesn't like the same var being mentioned multiple times.
> If yes, you can use e.g. the generic_head bitmap for that and in any case,
> cover that with sufficient testsuite coverage.

Generally variables cannot appear in multiple clauses. I'll add more
testing for this.

>> diff --git a/gcc/gimplify.c b/gcc/gimplify.c
>> index ab9e540..0c32219 100644
>> --- a/gcc/gimplify.c
>> +++ b/gcc/gimplify.c
>> @@ -93,6 +93,8 @@ enum gimplify_omp_var_data
>>  
>>    GOVD_MAP_0LEN_ARRAY = 32768,
>>  
>> +  GOVD_USE_DEVICE = 65536,
>> +
>>    GOVD_DATA_SHARE_CLASS = (GOVD_SHARED | GOVD_PRIVATE | GOVD_FIRSTPRIVATE
>>  			   | GOVD_LASTPRIVATE | GOVD_REDUCTION | GOVD_LINEAR
>>  			   | GOVD_LOCAL)
>> @@ -116,7 +118,9 @@ enum omp_region_type
>>    ORT_COMBINED_TARGET = 33,
>>    /* Dummy OpenMP region, used to disable expansion of
>>       DECL_VALUE_EXPRs in taskloop pre body.  */
>> -  ORT_NONE = 64
>> +  ORT_NONE = 64,
>> +  /* An OpenACC host-data region.  */
>> +  ORT_HOST_DATA = 128
> 
> I'd prefer ORT_NONE to be the last one, can you just renumber it and put
> ORT_HOST_DATA before it?

OK.

>> +static tree
>> +gimplify_oacc_host_data_1 (tree *tp, int *walk_subtrees,
>> +			   void *data ATTRIBUTE_UNUSED)
>> +{
> 
> Your use_device sounds very similar to use_device_ptr clause in OpenMP,
> which is allowed on #pragma omp target data construct and is implemented
> quite a bit differently from this; it is unclear if the OpenACC standard
> requires this kind of implementation, or you just chose to implement it this
> way.  In particular, the GOMP_target_data call puts the variables mentioned
> in the use_device_ptr clauses into the mapping structures (similarly how
> map clause appears) and the corresponding vars are privatized within the
> target data region (which is a host region, basically a fancy { } braces),
> where the private variables contain the offloading device's pointers.

Is this a new OpenMP 4.5 feature? I'll take a closer look and see if
they are similar enough. I also noticed that OpenMP 4.5 has something
similar to OpenACC's enter/exit data construct now.

>> +  splay_tree_node n = NULL;
>> +  location_t loc = EXPR_LOCATION (*tp);
>> +
>> +  switch (TREE_CODE (*tp))
>> +    {
>> +    case ADDR_EXPR:
>> +      {
>> +	tree decl = TREE_OPERAND (*tp, 0);
>> +
>> +	switch (TREE_CODE (decl))
>> +	  {
>> +	  case ARRAY_REF:
>> +	  case ARRAY_RANGE_REF:
>> +	  case COMPONENT_REF:
>> +	  case VIEW_CONVERT_EXPR:
>> +	  case REALPART_EXPR:
>> +	  case IMAGPART_EXPR:
>> +	    if (TREE_CODE (TREE_OPERAND (decl, 0)) == VAR_DECL)
>> +	      n = splay_tree_lookup (gimplify_omp_ctxp->variables,
>> +				     (splay_tree_key) TREE_OPERAND (decl, 0));
>> +	    break;
> 
> I must say this looks really strange, you throw away all the offsets
> embedded in the component codes (fixed or variable).
> Where comes the above list?  What about other components (say bit field refs,
> etc.)?

I'm not sure. This is one of those things where multiple developers
worked on it, and the history got lost. I'll investigate it.

>> +    case VAR_DECL:
> 
> What is so special about VAR_DECLs?  Shouldn't PARM_DECLs / RESULT_DECLs
> be treated the same way?
>> --- a/libgomp/libgomp.map
>> +++ b/libgomp/libgomp.map
>> @@ -378,6 +378,7 @@ GOACC_2.0 {
>>  	GOACC_wait;
>>  	GOACC_get_thread_num;
>>  	GOACC_get_num_threads;
>> +	GOACC_deviceptr;
>>  };
>>  
>>  GOACC_2.0.1 {
> 
> You shouldn't be adding new symbols into a symbol version that appeared in a
> compiler that shipped already (GCC 5 already had GOACC_2.0 symbols).
> So it should go into GOACC_2.0.1.

OK.

>> diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
>> index af067d6..497ab92 100644
>> --- a/libgomp/oacc-mem.c
>> +++ b/libgomp/oacc-mem.c
>> @@ -204,6 +204,38 @@ acc_deviceptr (void *h)
>>    return d;
>>  }
>>  
>> +/* This function is used as a helper in generated code to implement pointer
>> +   lookup in host_data regions.  Unlike acc_deviceptr, it returns its argument
>> +   unchanged on a shared-memory system (e.g. the host).  */
>> +
>> +void *
>> +GOACC_deviceptr (void *h)
>> +{
>> +  splay_tree_key n;
>> +  void *d;
>> +  void *offset;
>> +
>> +  goacc_lazy_initialize ();
>> +
>> +  struct goacc_thread *thr = goacc_thread ();
>> +
>> +  if ((thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) == 0)
>> +    {
>> +      n = lookup_host (thr->dev, h, 1);
> 
> What is supposed to be the behavior when the h pointer points at object
> boundary, rather than into the middle of existing mapped object?

Probably undefined with the way that OpenACC is defined.

> Say you have:
>   char a[16], b[0], c[16]; // b is GCC extension
> Now, char *p = &a[5]; is unambiguous, either a is mapped, or not.
> But, if p = &a[16];, then it could be either the one-past-last byte in a,
> or it could be the start of b (== one-past-last byte in b) or it could be
> the pointer to start of c.
> 
> In OpenMP 4.5, I had endless discussions about this and the end result is
> that one-past-last byte addresses are unspecified behavior

OK.

Thanks for you feedback.

Cesar
Julian Brown Nov. 2, 2015, 6:33 p.m. UTC | #3
On Mon, 26 Oct 2015 19:34:22 +0100
Jakub Jelinek <jakub@redhat.com> wrote:

> Your use_device sounds very similar to use_device_ptr clause in
> OpenMP, which is allowed on #pragma omp target data construct and is
> implemented quite a bit differently from this; it is unclear if the
> OpenACC standard requires this kind of implementation, or you just
> chose to implement it this way.  In particular, the GOMP_target_data
> call puts the variables mentioned in the use_device_ptr clauses into
> the mapping structures (similarly how map clause appears) and the
> corresponding vars are privatized within the target data region
> (which is a host region, basically a fancy { } braces), where the
> private variables contain the offloading device's pointers.

As the author of the original patch, I have to say using the mapping
structures seems like a far better approach, but I've hit some trouble
with the details of adapting OpenACC to use that method.

Firstly, on trunk at least, use_device_ptr variables are restricted to
pointer or array types: that restriction doesn't exist in OpenACC, nor
actually could I find it in the OpenMP 4.1 document (my guess is the
standards are supposed to match in this regard). I think that a program
such as this should work:

void target_fn (int *targ_data);

int
main (int argc, char *argv[])
{
  char out;
  int myvar;
#pragma omp target enter data map(to: myvar)

#pragma omp target data use_device_ptr(myvar) map(from:out)
  {
    target_fn (&myvar);
    out = 5;
  }

  return 0;
}

"myvar" would have its address taken in the use_device_ptr region, and
places where the corresponding mapped variable has its address taken
would be replaced by a direct use of the mapped pointer. (Or is that
not a well-formed thing to do, in general?). This fails with "error:
'use_device_ptr' variable is neither a pointer nor an array".

Secondly, attempts to use use_device_ptr on (e.g.
dynamically-allocated) arrays accessed through a pointer cause an ICE
with the existing trunk OpenMP code:

#include <stdlib.h>

void target_fn (char *targ_data);

int
main (int argc, char *argv[])
{
  char *myarr, out;

  myarr = malloc (1024);

#pragma omp target data map(to: myarr[0:1024])
  {
#pragma omp target data use_device_ptr(myarr) map(from:out)
    {
      target_fn (myarr);
      out = 5;
    }
  }

  return 0;
}

udp3.c: In function 'main':
udp3.c:6:1: internal compiler error: in make_decl_rtl, at varasm.c:1298
 main (int argc, char *argv[])
 ^
0x111256b make_decl_rtl(tree_node*)
        /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/varasm.c:1294
0x9ea005 expand_expr_real_1(tree_node*, rtx_def*, machine_mode, expand_modifier, rtx_def**, bool)
        /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/expr.c:9559
0x9e31c2 expand_expr_real(tree_node*, rtx_def*, machine_mode, expand_modifier, rtx_def**, bool)
        /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/expr.c:7892
0x9cb4ae expand_expr
        /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/expr.h:255
0x9d907d expand_assignment(tree_node*, tree_node*, bool)
        /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/expr.c:5089
0x89e219 expand_gimple_stmt_1
        /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/cfgexpand.c:3576
0x89e60d expand_gimple_stmt
        /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/cfgexpand.c:3672
0x8a5773 expand_gimple_basic_block
        /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/cfgexpand.c:5676
0x8a72d4 execute
        /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/cfgexpand.c:6288

Furthermore, this looks strange to me (006t.omplower):

  .omp_data_arr.5.out = &out;
  myarr.8 = myarr;
  .omp_data_arr.5.myarr = myarr.8;
  #pragma omp target data map(from:out [len: 1]) use_device_ptr(myarr)
    {
      D.2436 = .omp_data_arr.5.myarr;
      myarr = D.2436;

That's clobbering the original myarr variable, right?

Any clues on these two? The omp-low.c code is rather opaque to me...

Thanks,

Julian
Jakub Jelinek Nov. 2, 2015, 7:29 p.m. UTC | #4
On Mon, Nov 02, 2015 at 06:33:39PM +0000, Julian Brown wrote:
> As the author of the original patch, I have to say using the mapping
> structures seems like a far better approach, but I've hit some trouble
> with the details of adapting OpenACC to use that method.
> 
> Firstly, on trunk at least, use_device_ptr variables are restricted to
> pointer or array types: that restriction doesn't exist in OpenACC, nor
> actually could I find it in the OpenMP 4.1 document (my guess is the

^^ 4.5 ;)

> standards are supposed to match in this regard). I think that a program
> such as this should work:
> 
> void target_fn (int *targ_data);
> 
> int
> main (int argc, char *argv[])
> {
>   char out;
>   int myvar;
> #pragma omp target enter data map(to: myvar)
> 
> #pragma omp target data use_device_ptr(myvar) map(from:out)
>   {
>     target_fn (&myvar);
>     out = 5;
>   }
> 
>   return 0;
> }

You are right, I've been misreading the standard (the only testcases that
have been provided with the change were using arrays and so it was
non-obvious that it relies on array to pointer decay).
I'll work on changing the implementation accordingly tomorrow.

	Jakub
Jakub Jelinek Nov. 13, 2015, 3:31 p.m. UTC | #5
On Mon, Nov 02, 2015 at 06:33:39PM +0000, Julian Brown wrote:
> Firstly, on trunk at least, use_device_ptr variables are restricted to
> pointer or array types: that restriction doesn't exist in OpenACC, nor
> actually could I find it in the OpenMP 4.1 document (my guess is the
> standards are supposed to match in this regard). I think that a program
> such as this should work:

So, after talking about this on omp-lang, it seems there is agreement
that only arrays and pointer types (or reference to arrays or pointers)
should be allowed in use_device_ptr clause and that for pointers/reference
to pointers it should probably act the way I've coded it up, i.e. that
for them it translates the pointer to point to corresponding object to the
one to which it points on the host.  It is too late to change the standard
now, but will be changed soon, and hopefully clarified in examples.

> void target_fn (int *targ_data);
> 
> int
> main (int argc, char *argv[])
> {
>   char out;
>   int myvar;
> #pragma omp target enter data map(to: myvar)
> 
> #pragma omp target data use_device_ptr(myvar) map(from:out)
>   {
>     target_fn (&myvar);
>     out = 5;
>   }
> 
>   return 0;
> }

That would make the above non-conforming for OpenMP.

> Secondly, attempts to use use_device_ptr on (e.g.
> dynamically-allocated) arrays accessed through a pointer cause an ICE
> with the existing trunk OpenMP code:
> 
> #include <stdlib.h>
> 
> void target_fn (char *targ_data);
> 
> int
> main (int argc, char *argv[])
> {
>   char *myarr, out;
> 
>   myarr = malloc (1024);
> 
> #pragma omp target data map(to: myarr[0:1024])
>   {
> #pragma omp target data use_device_ptr(myarr) map(from:out)
>     {
>       target_fn (myarr);
>       out = 5;
>     }
>   }
> 
>   return 0;
> }

Can't reproduce this ICE (at least not on gomp-4_5-branch, but there
aren't significant changes from the trunk there).

> Furthermore, this looks strange to me (006t.omplower):
> 
>   .omp_data_arr.5.out = &out;
>   myarr.8 = myarr;
>   .omp_data_arr.5.myarr = myarr.8;
>   #pragma omp target data map(from:out [len: 1]) use_device_ptr(myarr)
>     {
>       D.2436 = .omp_data_arr.5.myarr;
>       myarr = D.2436;
> 
> That's clobbering the original myarr variable, right?

Just use -fdump-tree-omplower-uid to see that it is a different variable.
Basically, for OpenMP use_device_ptr creates a private copy of the
pointer for the body of the target data construct, and that pointer is
assigned the target device's address.  For arrays the implementation
creates an artificial pointer variable (holding the start of the array
initially) and replaces all references to the array in the target data
body with dereference of the pointer.

	Jakub
diff mbox

Patch

diff --git a/gcc/c-family/c-pragma.c b/gcc/c-family/c-pragma.c
index 834a916..b748e2f 100644
--- a/gcc/c-family/c-pragma.c
+++ b/gcc/c-family/c-pragma.c
@@ -1214,6 +1214,7 @@  static const struct omp_pragma_def oacc_pragmas[] = {
   { "data", PRAGMA_OACC_DATA },
   { "enter", PRAGMA_OACC_ENTER_DATA },
   { "exit", PRAGMA_OACC_EXIT_DATA },
+  { "host_data", PRAGMA_OACC_HOST_DATA },
   { "kernels", PRAGMA_OACC_KERNELS },
   { "loop", PRAGMA_OACC_LOOP },
   { "parallel", PRAGMA_OACC_PARALLEL },
diff --git a/gcc/c-family/c-pragma.h b/gcc/c-family/c-pragma.h
index cec920f..23a72a3 100644
--- a/gcc/c-family/c-pragma.h
+++ b/gcc/c-family/c-pragma.h
@@ -31,6 +31,7 @@  enum pragma_kind {
   PRAGMA_OACC_DATA,
   PRAGMA_OACC_ENTER_DATA,
   PRAGMA_OACC_EXIT_DATA,
+  PRAGMA_OACC_HOST_DATA,
   PRAGMA_OACC_KERNELS,
   PRAGMA_OACC_LOOP,
   PRAGMA_OACC_PARALLEL,
@@ -161,6 +162,7 @@  enum pragma_omp_clause {
   PRAGMA_OACC_CLAUSE_PRESENT_OR_CREATE,
   PRAGMA_OACC_CLAUSE_SELF,
   PRAGMA_OACC_CLAUSE_SEQ,
+  PRAGMA_OACC_CLAUSE_USE_DEVICE,
   PRAGMA_OACC_CLAUSE_VECTOR,
   PRAGMA_OACC_CLAUSE_VECTOR_LENGTH,
   PRAGMA_OACC_CLAUSE_WAIT,
diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 704ebc6..ead98b9 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -10116,6 +10116,8 @@  c_parser_omp_clause_name (c_parser *parser)
 	    result = PRAGMA_OMP_CLAUSE_UNTIED;
 	  else if (!strcmp ("use_device_ptr", p))
 	    result = PRAGMA_OMP_CLAUSE_USE_DEVICE_PTR;
+	  else if (!strcmp ("use_device", p))
+	    result = PRAGMA_OACC_CLAUSE_USE_DEVICE;
 	  break;
 	case 'v':
 	  if (!strcmp ("vector", p))
@@ -11219,6 +11221,15 @@  c_parser_oacc_clause_async (c_parser *parser, tree list)
   return list;
 }
 
+/* OpenACC 2.0:
+   use_device ( variable-list ) */
+
+static tree
+c_parser_oacc_clause_use_device (c_parser *parser, tree list)
+{
+  return c_parser_omp_var_list_parens (parser, OMP_CLAUSE_USE_DEVICE, list);
+}
+
 /* OpenACC:
    wait ( int-expr-list ) */
 
@@ -12474,6 +12485,10 @@  c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask,
 	  clauses = c_parser_oacc_data_clause (parser, c_kind, clauses);
 	  c_name = "self";
 	  break;
+	case PRAGMA_OACC_CLAUSE_USE_DEVICE:
+	  clauses = c_parser_oacc_clause_use_device (parser, clauses);
+	  c_name = "use_device";
+	  break;
 	case PRAGMA_OACC_CLAUSE_VECTOR_LENGTH:
 	  clauses = c_parser_omp_clause_vector_length (parser, clauses);
 	  c_name = "vector_length";
@@ -13003,6 +13018,29 @@  c_parser_oacc_enter_exit_data (c_parser *parser, bool enter)
 
 
 /* OpenACC 2.0:
+   # pragma acc host_data oacc-data-clause[optseq] new-line
+     structured-block
+*/
+
+#define OACC_HOST_DATA_CLAUSE_MASK					\
+	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_USE_DEVICE) )
+
+static tree
+c_parser_oacc_host_data (location_t loc, c_parser *parser)
+{
+  tree stmt, clauses, block;
+
+  clauses = c_parser_oacc_all_clauses (parser, OACC_HOST_DATA_CLAUSE_MASK,
+				       "#pragma acc host_data");
+
+  block = c_begin_omp_parallel ();
+  add_stmt (c_parser_omp_structured_block (parser));
+  stmt = c_finish_oacc_host_data (loc, clauses, block);
+  return stmt;
+}
+
+
+/* OpenACC 2.0:
 
    # pragma acc loop oacc-loop-clause[optseq] new-line
      structured-block
@@ -16075,6 +16113,9 @@  c_parser_omp_construct (c_parser *parser)
     case PRAGMA_OACC_DATA:
       stmt = c_parser_oacc_data (loc, parser);
       break;
+    case PRAGMA_OACC_HOST_DATA:
+      stmt = c_parser_oacc_host_data (loc, parser);
+      break;
     case PRAGMA_OACC_KERNELS:
       strcpy (p_name, "#pragma acc");
       stmt = c_parser_oacc_kernels (loc, parser, p_name);
diff --git a/gcc/c/c-tree.h b/gcc/c/c-tree.h
index bee03d3..a9c5975 100644
--- a/gcc/c/c-tree.h
+++ b/gcc/c/c-tree.h
@@ -643,6 +643,7 @@  extern tree c_expr_to_decl (tree, bool *, bool *);
 extern tree c_finish_oacc_parallel (location_t, tree, tree);
 extern tree c_finish_oacc_kernels (location_t, tree, tree);
 extern tree c_finish_oacc_data (location_t, tree, tree);
+extern tree c_finish_oacc_host_data (location_t, tree, tree);
 extern tree c_begin_omp_parallel (void);
 extern tree c_finish_omp_parallel (location_t, tree, tree);
 extern tree c_begin_omp_task (void);
diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c
index bc43602..a5e2a4a 100644
--- a/gcc/c/c-typeck.c
+++ b/gcc/c/c-typeck.c
@@ -11510,6 +11510,25 @@  c_finish_oacc_data (location_t loc, tree clauses, tree block)
   return add_stmt (stmt);
 }
 
+/* Generate OACC_HOST_DATA, with CLAUSES and BLOCK as its compound
+   statement.  LOC is the location of the OACC_HOST_DATA.  */
+
+tree
+c_finish_oacc_host_data (location_t loc, tree clauses, tree block)
+{
+  tree stmt;
+
+  block = c_end_compound_stmt (loc, block, true);
+
+  stmt = make_node (OACC_HOST_DATA);
+  TREE_TYPE (stmt) = void_type_node;
+  OACC_HOST_DATA_CLAUSES (stmt) = clauses;
+  OACC_HOST_DATA_BODY (stmt) = block;
+  SET_EXPR_LOCATION (stmt, loc);
+
+  return add_stmt (stmt);
+}
+
 /* Like c_begin_compound_stmt, except force the retention of the BLOCK.  */
 
 tree
@@ -12942,6 +12961,7 @@  c_finish_omp_clauses (tree clauses, bool is_omp, bool declare_simd)
 	case OMP_CLAUSE_GANG:
 	case OMP_CLAUSE_WORKER:
 	case OMP_CLAUSE_VECTOR:
+	case OMP_CLAUSE_USE_DEVICE:
 	  pc = &OMP_CLAUSE_CHAIN (c);
 	  continue;
 
diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index 16db41f..76ece42 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -6318,6 +6318,7 @@  extern void finish_omp_threadprivate		(tree);
 extern tree begin_omp_structured_block		(void);
 extern tree finish_omp_structured_block		(tree);
 extern tree finish_oacc_data			(tree, tree);
+extern tree finish_oacc_host_data		(tree, tree);
 extern tree finish_oacc_kernels			(tree, tree);
 extern tree finish_oacc_parallel		(tree, tree);
 extern tree begin_omp_parallel			(void);
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index f07a5e4..714e69c 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -29235,6 +29235,8 @@  cp_parser_omp_clause_name (cp_parser *parser)
 	    result = PRAGMA_OMP_CLAUSE_UNTIED;
 	  else if (!strcmp ("use_device_ptr", p))
 	    result = PRAGMA_OMP_CLAUSE_USE_DEVICE_PTR;
+	  else if (!strcmp ("use_device", p))
+	    result = PRAGMA_OACC_CLAUSE_USE_DEVICE;
 	  break;
 	case 'v':
 	  if (!strcmp ("vector_length", p))
@@ -31381,6 +31383,11 @@  cp_parser_oacc_all_clauses (cp_parser *parser, omp_clause_mask mask,
 	  clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses);
 	  c_name = "self";
 	  break;
+	case PRAGMA_OACC_CLAUSE_USE_DEVICE:
+	  clauses = cp_parser_omp_var_list (parser, OMP_CLAUSE_USE_DEVICE,
+					    clauses);
+	  c_name = "use_device";
+	  break;
 	case PRAGMA_OACC_CLAUSE_VECTOR_LENGTH:
 	  clauses = cp_parser_oacc_clause_vector_length (parser, clauses);
 	  c_name = "vector_length";
@@ -34221,6 +34228,30 @@  cp_parser_oacc_data (cp_parser *parser, cp_token *pragma_tok)
   return stmt;
 }
 
+#define OACC_HOST_DATA_CLAUSE_MASK					\
+  ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_USE_DEVICE) )
+
+/* OpenACC 2.0:
+  # pragma acc host_data <clauses> new-line
+  structured-block  */
+
+static tree
+cp_parser_oacc_host_data (cp_parser *parser, cp_token *pragma_tok)
+{
+  tree stmt, clauses, block;
+  unsigned int save;
+
+  clauses = cp_parser_oacc_all_clauses (parser, OACC_HOST_DATA_CLAUSE_MASK,
+					"#pragma acc host_data", pragma_tok);
+
+  block = begin_omp_parallel ();
+  save = cp_parser_begin_omp_structured_block (parser);
+  cp_parser_statement (parser, NULL_TREE, false, NULL);
+  cp_parser_end_omp_structured_block (parser, save);
+  stmt = finish_oacc_host_data (clauses, block);
+  return stmt;
+}
+
 /* OpenACC 2.0:
    # pragma acc enter data oacc-enter-data-clause[optseq] new-line
 
@@ -35288,6 +35319,9 @@  cp_parser_omp_construct (cp_parser *parser, cp_token *pragma_tok)
     case PRAGMA_OACC_EXIT_DATA:
       stmt = cp_parser_oacc_enter_exit_data (parser, pragma_tok, false);
       break;
+    case PRAGMA_OACC_HOST_DATA:
+      stmt = cp_parser_oacc_host_data (parser, pragma_tok);
+      break;
     case PRAGMA_OACC_KERNELS:
       stmt = cp_parser_oacc_kernels (parser, pragma_tok);
       break;
@@ -35856,6 +35890,7 @@  cp_parser_pragma (cp_parser *parser, enum pragma_context context)
     case PRAGMA_OACC_DATA:
     case PRAGMA_OACC_ENTER_DATA:
     case PRAGMA_OACC_EXIT_DATA:
+    case PRAGMA_OACC_HOST_DATA:
     case PRAGMA_OACC_KERNELS:
     case PRAGMA_OACC_PARALLEL:
     case PRAGMA_OACC_LOOP:
diff --git a/gcc/cp/semantics.c b/gcc/cp/semantics.c
index c0a8b32..25482e7 100644
--- a/gcc/cp/semantics.c
+++ b/gcc/cp/semantics.c
@@ -6689,6 +6689,7 @@  finish_omp_clauses (tree clauses, bool allow_fields, bool declare_simd)
 	case OMP_CLAUSE_SIMD:
 	case OMP_CLAUSE_DEFAULTMAP:
 	case OMP_CLAUSE__CILK_FOR_COUNT_:
+	case OMP_CLAUSE_USE_DEVICE:
 	  break;
 
 	case OMP_CLAUSE_INBRANCH:
@@ -7119,6 +7120,24 @@  finish_oacc_data (tree clauses, tree block)
   return add_stmt (stmt);
 }
 
+/* Generate OACC_HOST_DATA, with CLAUSES and BLOCK as its compound
+   statement.  */
+
+tree
+finish_oacc_host_data (tree clauses, tree block)
+{
+  tree stmt;
+
+  block = finish_omp_structured_block (block);
+
+  stmt = make_node (OACC_HOST_DATA);
+  TREE_TYPE (stmt) = void_type_node;
+  OACC_HOST_DATA_CLAUSES (stmt) = clauses;
+  OACC_HOST_DATA_BODY (stmt) = block;
+
+  return add_stmt (stmt);
+}
+
 /* Generate OACC_KERNELS, with CLAUSES and BLOCK as its compound
    statement.  LOC is the location of the OACC_KERNELS.  */
 
diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index ab9e540..0c32219 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -93,6 +93,8 @@  enum gimplify_omp_var_data
 
   GOVD_MAP_0LEN_ARRAY = 32768,
 
+  GOVD_USE_DEVICE = 65536,
+
   GOVD_DATA_SHARE_CLASS = (GOVD_SHARED | GOVD_PRIVATE | GOVD_FIRSTPRIVATE
 			   | GOVD_LASTPRIVATE | GOVD_REDUCTION | GOVD_LINEAR
 			   | GOVD_LOCAL)
@@ -116,7 +118,9 @@  enum omp_region_type
   ORT_COMBINED_TARGET = 33,
   /* Dummy OpenMP region, used to disable expansion of
      DECL_VALUE_EXPRs in taskloop pre body.  */
-  ORT_NONE = 64
+  ORT_NONE = 64,
+  /* An OpenACC host-data region.  */
+  ORT_HOST_DATA = 128
 };
 
 /* Gimplify hashtable helper.  */
@@ -6338,6 +6342,10 @@  gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p,
 		decl = TREE_OPERAND (decl, 0);
 	    }
 	  goto do_add_decl;
+	case OMP_CLAUSE_USE_DEVICE:
+	  flags = GOVD_USE_DEVICE | GOVD_EXPLICIT;
+	  check_non_private = "use_device";
+	  goto do_add;
 	case OMP_CLAUSE_LINEAR:
 	  if (gimplify_expr (&OMP_CLAUSE_LINEAR_STEP (c), pre_p, NULL,
 			     is_gimple_val, fb_rvalue) == GS_ERROR)
@@ -7005,7 +7013,6 @@  gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p,
 	  break;
 
 	case OMP_CLAUSE_DEVICE_RESIDENT:
-	case OMP_CLAUSE_USE_DEVICE:
 	case OMP_CLAUSE_INDEPENDENT:
 	  remove = true;
 	  break;
@@ -7529,6 +7536,127 @@  gimplify_oacc_cache (tree *expr_p, gimple_seq *pre_p)
   *expr_p = NULL_TREE;
 }
 
+static tree
+gimplify_oacc_host_data_1 (tree *tp, int *walk_subtrees,
+			   void *data ATTRIBUTE_UNUSED)
+{
+  splay_tree_node n = NULL;
+  location_t loc = EXPR_LOCATION (*tp);
+
+  switch (TREE_CODE (*tp))
+    {
+    case ADDR_EXPR:
+      {
+	tree decl = TREE_OPERAND (*tp, 0);
+
+	switch (TREE_CODE (decl))
+	  {
+	  case ARRAY_REF:
+	  case ARRAY_RANGE_REF:
+	  case COMPONENT_REF:
+	  case VIEW_CONVERT_EXPR:
+	  case REALPART_EXPR:
+	  case IMAGPART_EXPR:
+	    if (TREE_CODE (TREE_OPERAND (decl, 0)) == VAR_DECL)
+	      n = splay_tree_lookup (gimplify_omp_ctxp->variables,
+				     (splay_tree_key) TREE_OPERAND (decl, 0));
+	    break;
+
+	  case VAR_DECL:
+	    n = splay_tree_lookup (gimplify_omp_ctxp->variables,
+				   (splay_tree_key) decl);
+	    break;
+
+	  default:
+	    ;
+	  }
+
+	if (n != NULL && (n->value & GOVD_USE_DEVICE) != 0)
+	  {
+	    tree t = builtin_decl_explicit (BUILT_IN_GOACC_DEVICEPTR);
+	    *tp = build_call_expr_loc (loc, t, 1, *tp);
+	  }
+
+	*walk_subtrees = 0;
+      }
+      break;
+
+    case VAR_DECL:
+      {
+	tree decl = *tp;
+
+	n = splay_tree_lookup (gimplify_omp_ctxp->variables,
+			       (splay_tree_key) decl);
+
+	if (n != NULL && (n->value & GOVD_USE_DEVICE) != 0)
+	  {
+	    if (!POINTER_TYPE_P (TREE_TYPE (decl)))
+	      return decl;
+
+	    tree t = builtin_decl_explicit (BUILT_IN_GOACC_DEVICEPTR);
+	    *tp = build_call_expr_loc (loc, t, 1, *tp);
+	    *walk_subtrees = 0;
+	  }
+      }
+      break;
+
+    case OACC_PARALLEL:
+    case OACC_KERNELS:
+    case OACC_LOOP:
+      *walk_subtrees = 0;
+      break;
+
+    default:
+      ;
+    }
+
+  return NULL_TREE;
+}
+
+static enum gimplify_status
+gimplify_oacc_host_data (tree *expr_p, gimple_seq *pre_p)
+{
+  tree expr = *expr_p, orig_body;
+  gimple_seq body = NULL;
+
+  gimplify_scan_omp_clauses (&OACC_HOST_DATA_CLAUSES (expr), pre_p,
+			     ORT_HOST_DATA, OACC_HOST_DATA);
+
+  orig_body = OACC_HOST_DATA_BODY (expr);
+
+  /* Perform a pre-pass over the host_data region's body, inserting calls to
+     GOACC_deviceptr where appropriate.  */
+
+  tree ret = walk_tree_without_duplicates (&orig_body,
+					   &gimplify_oacc_host_data_1, 0);
+
+  if (ret)
+    {
+      error_at (EXPR_LOCATION (expr),
+		"undefined use of variable %qE in host_data region",
+		DECL_NAME (ret));
+      gimplify_adjust_omp_clauses (pre_p, &OACC_HOST_DATA_CLAUSES (expr),
+				   OACC_HOST_DATA);
+      return GS_ERROR;
+    }
+
+  push_gimplify_context ();
+
+  gimple *g = gimplify_and_return_first (orig_body, &body);
+
+  if (gimple_code (g) == GIMPLE_BIND)
+    pop_gimplify_context (g);
+  else
+    pop_gimplify_context (NULL);
+
+  gimplify_adjust_omp_clauses (pre_p, &OACC_HOST_DATA_CLAUSES (expr),
+			       OACC_HOST_DATA);
+
+  gimplify_seq_add_stmt (pre_p, g);
+
+  return GS_ALL_DONE;
+}
+
 /* Gimplify the contents of an OMP_PARALLEL statement.  This involves
    gimplification of the body, as well as scanning the body for used
    variables.  We need to do this scan now, because variable-sized
@@ -9595,6 +9723,9 @@  gimplify_expr (tree *expr_p, gimple_seq *pre_p, gimple_seq *post_p,
 	  break;
 
 	case OACC_HOST_DATA:
+	  ret = gimplify_oacc_host_data (expr_p, pre_p);
+	  break;
+
 	case OACC_DECLARE:
 	  sorry ("directive not yet implemented");
 	  ret = GS_ALL_DONE;
diff --git a/gcc/omp-builtins.def b/gcc/omp-builtins.def
index ea9cf0d..9ed075f 100644
--- a/gcc/omp-builtins.def
+++ b/gcc/omp-builtins.def
@@ -47,6 +47,8 @@  DEF_GOACC_BUILTIN (BUILT_IN_GOACC_UPDATE, "GOACC_update",
 DEF_GOACC_BUILTIN (BUILT_IN_GOACC_WAIT, "GOACC_wait",
 		   BT_FN_VOID_INT_INT_VAR,
 		   ATTR_NOTHROW_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_DEVICEPTR, "GOACC_deviceptr",
+		   BT_FN_PTR_PTR, ATTR_CONST_NOTHROW_LEAF_LIST)
 DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_THREAD_NUM, "GOACC_get_thread_num",
 		   BT_FN_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
 DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_NUM_THREADS, "GOACC_get_num_threads",
diff --git a/gcc/testsuite/c-c++-common/goacc/host_data-1.c b/gcc/testsuite/c-c++-common/goacc/host_data-1.c
new file mode 100644
index 0000000..521c854
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/host_data-1.c
@@ -0,0 +1,13 @@ 
+/* Test valid use of host_data directive.  */
+/* { dg-do compile } */
+
+int v0;
+int v1[3][3];
+
+void
+f (void)
+{
+  int v2 = 3;
+#pragma acc host_data use_device(v2, v0, v1)
+  ;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/host_data-2.c b/gcc/testsuite/c-c++-common/goacc/host_data-2.c
new file mode 100644
index 0000000..e5213a0
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/host_data-2.c
@@ -0,0 +1,13 @@ 
+/* Test invalid use of host_data directive.  */
+/* { dg-do compile } */
+
+int v0;
+#pragma acc host_data use_device(v0) /* { dg-error "expected" } */
+
+void
+f (void)
+{
+  int v2 = 3;
+#pragma acc host_data copy(v2) /* { dg-error "not valid for" } */
+  ;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/host_data-3.c b/gcc/testsuite/c-c++-common/goacc/host_data-3.c
new file mode 100644
index 0000000..f9621c9
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/host_data-3.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+
+int main (int argc, char* argv[])
+{
+  int x = 5, y;
+
+  #pragma acc enter data copyin (x)
+  /* It's not clear what attempts to use non-pointer variables "directly"
+     (rather than merely taking their address) should do in host_data regions. 
+     We choose to make it an error.  */
+  #pragma acc host_data use_device (x) /* TODO { dg-error "" } */
+  {
+    y = x;
+  }
+  #pragma acc exit data delete (x)
+
+  return y - 5;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/host_data-4.c b/gcc/testsuite/c-c++-common/goacc/host_data-4.c
new file mode 100644
index 0000000..3dac5f3
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/host_data-4.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+
+int main (int argc, char* argv[])
+{
+  int x[100];
+
+  #pragma acc enter data copyin (x)
+  /* Specifying an array index is not valid for host_data/use_device.  */
+  #pragma acc host_data use_device (x[4]) /* { dg-error "expected '\\\)' before '\\\[' token" } */
+    ;
+  #pragma acc exit data delete (x)
+
+  return 0;
+}
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index 2153661..2a43a8c 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -378,6 +378,7 @@  GOACC_2.0 {
 	GOACC_wait;
 	GOACC_get_thread_num;
 	GOACC_get_num_threads;
+	GOACC_deviceptr;
 };
 
 GOACC_2.0.1 {
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index af067d6..497ab92 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -204,6 +204,38 @@  acc_deviceptr (void *h)
   return d;
 }
 
+/* This function is used as a helper in generated code to implement pointer
+   lookup in host_data regions.  Unlike acc_deviceptr, it returns its argument
+   unchanged on a shared-memory system (e.g. the host).  */
+
+void *
+GOACC_deviceptr (void *h)
+{
+  splay_tree_key n;
+  void *d;
+  void *offset;
+
+  goacc_lazy_initialize ();
+
+  struct goacc_thread *thr = goacc_thread ();
+
+  if ((thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) == 0)
+    {
+      n = lookup_host (thr->dev, h, 1);
+
+      if (!n)
+	return NULL;
+
+      offset = h - n->host_start;
+
+      d = n->tgt->tgt_start + n->tgt_offset + offset;
+
+      return d;
+    }
+  else
+    return h;
+}
+
 /* Return the host pointer that corresponds to device data D.  Or NULL
    if no mapping.  */
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c
new file mode 100644
index 0000000..15ccb27
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c
@@ -0,0 +1,125 @@ 
+/* { dg-do run { target openacc_nvidia_accel_selected } } */
+/* { dg-additional-options "-lcuda -lcublas -lcudart" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openacc.h>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cublas_v2.h>
+
+void
+saxpy_host (int n, float a, float *x, float *y)
+{
+  int i;
+
+  for (i = 0; i < n; i++)
+    y[i] = y[i] + a * x[i];
+}
+
+#pragma acc routine
+void
+saxpy_target (int n, float a, float *x, float *y)
+{
+  int i;
+
+  for (i = 0; i < n; i++)
+    y[i] = y[i] + a * x[i];
+}
+
+int
+main(int argc, char **argv)
+{
+  const int N = 8;
+  int i;
+  float *x_ref, *y_ref;
+  float *x, *y;
+  cublasHandle_t h;
+  float a = 2.0;
+
+  x_ref = (float*) malloc (N * sizeof(float));
+  y_ref = (float*) malloc (N * sizeof(float));
+
+  x = (float*) malloc (N * sizeof(float));
+  y = (float*) malloc (N * sizeof(float));
+
+#pragma acc data copyin (x[0:N]) copy (y[0:N])
+  {
+    float *xp, *yp;
+#pragma acc host_data use_device (x, y)
+    {
+#pragma acc parallel pcopy (xp, yp) present (x, y)
+      {
+        xp = x;
+	yp = y;
+      }
+    }
+
+    if (xp != acc_deviceptr (x) || yp != acc_deviceptr (y))
+	abort ();
+  }
+
+  for (i = 0; i < N; i++)
+    {
+      x[i] = x_ref[i] = 4.0 + i;
+      y[i] = y_ref[i] = 3.0;
+    }
+
+  saxpy_host (N, a, x_ref, y_ref);
+
+  cublasCreate (&h);
+
+#pragma acc data copyin (x[0:N]) copy (y[0:N])
+  {
+#pragma acc host_data use_device (x, y)
+    {
+      cublasSaxpy (h, N, &a, x, 1, y, 1);
+    }
+  }
+
+  for (i = 0; i < N; i++)
+    {
+      if (y[i] != y_ref[i])
+        abort ();
+    }
+
+#pragma acc data create (x[0:N]) copyout (y[0:N])
+  {
+#pragma acc kernels
+    for (i = 0; i < N; i++)
+      y[i] = 3.0;
+
+#pragma acc host_data use_device (x, y)
+    {
+      cublasSaxpy (h, N, &a, x, 1, y, 1);
+    }
+  }
+
+  cublasDestroy (h);
+
+  for (i = 0; i < N; i++)
+    {
+      if (y[i] != y_ref[i])
+        abort ();
+    }
+
+  for (i = 0; i < N; i++)
+    y[i] = 3.0;
+
+#pragma acc data copyin (x[0:N]) copyin (a, N) copy (y[0:N])
+  {
+#pragma acc host_data use_device (x, y)
+    {
+#pragma acc parallel present (x[0:N]) pcopy (y[0:N]) present (a, N)
+      saxpy_target (N, a, x, y);
+    }
+  }
+
+  for (i = 0; i < N; i++)
+    {
+      if (y[i] != y_ref[i])
+        abort ();
+    }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-2.c
new file mode 100644
index 0000000..511ec64
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-2.c
@@ -0,0 +1,50 @@ 
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+struct by_lightning {
+  int a;
+  int b;
+  int c;
+};
+
+int main (int argc, char* argv[])
+{
+  int x;
+  void *q = NULL, *r = NULL, *p = NULL, *s = NULL, *t = NULL;
+  long u;
+  struct by_lightning on_the_head = {1, 2, 3};
+  int arr[10], *f = NULL;
+  _Complex float cf;
+  #pragma acc enter data copyin (x, arr, on_the_head, cf)
+  #pragma acc host_data use_device (x, arr, on_the_head, cf)
+  {
+    q = &x;
+    {
+      f = &arr[5];
+      r = f;
+      s = &__real__ cf;
+      t = &on_the_head.c;
+      u = (long) &__imag__ cf;
+      #pragma acc parallel copyout(p) present (x, arr, on_the_head, cf)
+      {
+        /* This will not (and must not) call GOACC_deviceptr, but '&x' will be
+	   the address on the device (if appropriate) regardless.  */
+	p = &x;
+      }
+    }
+  }
+  #pragma acc exit data delete (x)
+
+#if ACC_MEM_SHARED
+  if (q != &x || f != &arr[5] || r != f || s != &(__real__ cf)
+      || t != &on_the_head.c || u != (long) &(__imag__ cf) || p != &x)
+    abort ();
+#else
+  if (q == &x || f == &arr[5] || r != f || s == &(__real__ cf)
+      || t == &on_the_head.c || u == (long) &(__imag__ cf) || p == &x)
+    abort ();
+#endif
+
+  return 0;
+}