Message ID | 562A578E.4080907@codesourcery.com |
---|---|
State | New |
Headers | show |
On Fri, Oct 23, 2015 at 10:51:42AM -0500, James Norris wrote: > @@ -12942,6 +12961,7 @@ c_finish_omp_clauses (tree clauses, bool is_omp, bool declare_simd) > case OMP_CLAUSE_GANG: > case OMP_CLAUSE_WORKER: > case OMP_CLAUSE_VECTOR: > + case OMP_CLAUSE_USE_DEVICE: > pc = &OMP_CLAUSE_CHAIN (c); > continue; > Are there any restrictions on whether you can specify the same var multiple times in use_device clause? #pragma acc host_data use_device (x) use_device (x) use_device (y, y, y) ? If not, have you verified that the gimplifier doesn't ICE on it? Generally it doesn't like the same var being mentioned multiple times. If yes, you can use e.g. the generic_head bitmap for that and in any case, cover that with sufficient testsuite coverage. > diff --git a/gcc/gimplify.c b/gcc/gimplify.c > index ab9e540..0c32219 100644 > --- a/gcc/gimplify.c > +++ b/gcc/gimplify.c > @@ -93,6 +93,8 @@ enum gimplify_omp_var_data > > GOVD_MAP_0LEN_ARRAY = 32768, > > + GOVD_USE_DEVICE = 65536, > + > GOVD_DATA_SHARE_CLASS = (GOVD_SHARED | GOVD_PRIVATE | GOVD_FIRSTPRIVATE > | GOVD_LASTPRIVATE | GOVD_REDUCTION | GOVD_LINEAR > | GOVD_LOCAL) > @@ -116,7 +118,9 @@ enum omp_region_type > ORT_COMBINED_TARGET = 33, > /* Dummy OpenMP region, used to disable expansion of > DECL_VALUE_EXPRs in taskloop pre body. */ > - ORT_NONE = 64 > + ORT_NONE = 64, > + /* An OpenACC host-data region. */ > + ORT_HOST_DATA = 128 I'd prefer ORT_NONE to be the last one, can you just renumber it and put ORT_HOST_DATA before it? > +static tree > +gimplify_oacc_host_data_1 (tree *tp, int *walk_subtrees, > + void *data ATTRIBUTE_UNUSED) > +{ Your use_device sounds very similar to use_device_ptr clause in OpenMP, which is allowed on #pragma omp target data construct and is implemented quite a bit differently from this; it is unclear if the OpenACC standard requires this kind of implementation, or you just chose to implement it this way. In particular, the GOMP_target_data call puts the variables mentioned in the use_device_ptr clauses into the mapping structures (similarly how map clause appears) and the corresponding vars are privatized within the target data region (which is a host region, basically a fancy { } braces), where the private variables contain the offloading device's pointers. > + splay_tree_node n = NULL; > + location_t loc = EXPR_LOCATION (*tp); > + > + switch (TREE_CODE (*tp)) > + { > + case ADDR_EXPR: > + { > + tree decl = TREE_OPERAND (*tp, 0); > + > + switch (TREE_CODE (decl)) > + { > + case ARRAY_REF: > + case ARRAY_RANGE_REF: > + case COMPONENT_REF: > + case VIEW_CONVERT_EXPR: > + case REALPART_EXPR: > + case IMAGPART_EXPR: > + if (TREE_CODE (TREE_OPERAND (decl, 0)) == VAR_DECL) > + n = splay_tree_lookup (gimplify_omp_ctxp->variables, > + (splay_tree_key) TREE_OPERAND (decl, 0)); > + break; I must say this looks really strange, you throw away all the offsets embedded in the component codes (fixed or variable). Where comes the above list? What about other components (say bit field refs, etc.)? > + case VAR_DECL: What is so special about VAR_DECLs? Shouldn't PARM_DECLs / RESULT_DECLs be treated the same way? > --- a/libgomp/libgomp.map > +++ b/libgomp/libgomp.map > @@ -378,6 +378,7 @@ GOACC_2.0 { > GOACC_wait; > GOACC_get_thread_num; > GOACC_get_num_threads; > + GOACC_deviceptr; > }; > > GOACC_2.0.1 { You shouldn't be adding new symbols into a symbol version that appeared in a compiler that shipped already (GCC 5 already had GOACC_2.0 symbols). So it should go into GOACC_2.0.1. > diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c > index af067d6..497ab92 100644 > --- a/libgomp/oacc-mem.c > +++ b/libgomp/oacc-mem.c > @@ -204,6 +204,38 @@ acc_deviceptr (void *h) > return d; > } > > +/* This function is used as a helper in generated code to implement pointer > + lookup in host_data regions. Unlike acc_deviceptr, it returns its argument > + unchanged on a shared-memory system (e.g. the host). */ > + > +void * > +GOACC_deviceptr (void *h) > +{ > + splay_tree_key n; > + void *d; > + void *offset; > + > + goacc_lazy_initialize (); > + > + struct goacc_thread *thr = goacc_thread (); > + > + if ((thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) == 0) > + { > + n = lookup_host (thr->dev, h, 1); What is supposed to be the behavior when the h pointer points at object boundary, rather than into the middle of existing mapped object? Say you have: char a[16], b[0], c[16]; // b is GCC extension Now, char *p = &a[5]; is unambiguous, either a is mapped, or not. But, if p = &a[16];, then it could be either the one-past-last byte in a, or it could be the start of b (== one-past-last byte in b) or it could be the pointer to start of c. In OpenMP 4.5, I had endless discussions about this and the end result is that one-past-last byte addresses are unspecified behavior Jakub
On 10/26/2015 11:34 AM, Jakub Jelinek wrote: > On Fri, Oct 23, 2015 at 10:51:42AM -0500, James Norris wrote: >> @@ -12942,6 +12961,7 @@ c_finish_omp_clauses (tree clauses, bool is_omp, bool declare_simd) >> case OMP_CLAUSE_GANG: >> case OMP_CLAUSE_WORKER: >> case OMP_CLAUSE_VECTOR: >> + case OMP_CLAUSE_USE_DEVICE: >> pc = &OMP_CLAUSE_CHAIN (c); >> continue; >> > > Are there any restrictions on whether you can specify the same var multiple > times in use_device clause? > #pragma acc host_data use_device (x) use_device (x) use_device (y, y, y) > ? > If not, have you verified that the gimplifier doesn't ICE on it? Generally > it doesn't like the same var being mentioned multiple times. > If yes, you can use e.g. the generic_head bitmap for that and in any case, > cover that with sufficient testsuite coverage. Generally variables cannot appear in multiple clauses. I'll add more testing for this. >> diff --git a/gcc/gimplify.c b/gcc/gimplify.c >> index ab9e540..0c32219 100644 >> --- a/gcc/gimplify.c >> +++ b/gcc/gimplify.c >> @@ -93,6 +93,8 @@ enum gimplify_omp_var_data >> >> GOVD_MAP_0LEN_ARRAY = 32768, >> >> + GOVD_USE_DEVICE = 65536, >> + >> GOVD_DATA_SHARE_CLASS = (GOVD_SHARED | GOVD_PRIVATE | GOVD_FIRSTPRIVATE >> | GOVD_LASTPRIVATE | GOVD_REDUCTION | GOVD_LINEAR >> | GOVD_LOCAL) >> @@ -116,7 +118,9 @@ enum omp_region_type >> ORT_COMBINED_TARGET = 33, >> /* Dummy OpenMP region, used to disable expansion of >> DECL_VALUE_EXPRs in taskloop pre body. */ >> - ORT_NONE = 64 >> + ORT_NONE = 64, >> + /* An OpenACC host-data region. */ >> + ORT_HOST_DATA = 128 > > I'd prefer ORT_NONE to be the last one, can you just renumber it and put > ORT_HOST_DATA before it? OK. >> +static tree >> +gimplify_oacc_host_data_1 (tree *tp, int *walk_subtrees, >> + void *data ATTRIBUTE_UNUSED) >> +{ > > Your use_device sounds very similar to use_device_ptr clause in OpenMP, > which is allowed on #pragma omp target data construct and is implemented > quite a bit differently from this; it is unclear if the OpenACC standard > requires this kind of implementation, or you just chose to implement it this > way. In particular, the GOMP_target_data call puts the variables mentioned > in the use_device_ptr clauses into the mapping structures (similarly how > map clause appears) and the corresponding vars are privatized within the > target data region (which is a host region, basically a fancy { } braces), > where the private variables contain the offloading device's pointers. Is this a new OpenMP 4.5 feature? I'll take a closer look and see if they are similar enough. I also noticed that OpenMP 4.5 has something similar to OpenACC's enter/exit data construct now. >> + splay_tree_node n = NULL; >> + location_t loc = EXPR_LOCATION (*tp); >> + >> + switch (TREE_CODE (*tp)) >> + { >> + case ADDR_EXPR: >> + { >> + tree decl = TREE_OPERAND (*tp, 0); >> + >> + switch (TREE_CODE (decl)) >> + { >> + case ARRAY_REF: >> + case ARRAY_RANGE_REF: >> + case COMPONENT_REF: >> + case VIEW_CONVERT_EXPR: >> + case REALPART_EXPR: >> + case IMAGPART_EXPR: >> + if (TREE_CODE (TREE_OPERAND (decl, 0)) == VAR_DECL) >> + n = splay_tree_lookup (gimplify_omp_ctxp->variables, >> + (splay_tree_key) TREE_OPERAND (decl, 0)); >> + break; > > I must say this looks really strange, you throw away all the offsets > embedded in the component codes (fixed or variable). > Where comes the above list? What about other components (say bit field refs, > etc.)? I'm not sure. This is one of those things where multiple developers worked on it, and the history got lost. I'll investigate it. >> + case VAR_DECL: > > What is so special about VAR_DECLs? Shouldn't PARM_DECLs / RESULT_DECLs > be treated the same way? >> --- a/libgomp/libgomp.map >> +++ b/libgomp/libgomp.map >> @@ -378,6 +378,7 @@ GOACC_2.0 { >> GOACC_wait; >> GOACC_get_thread_num; >> GOACC_get_num_threads; >> + GOACC_deviceptr; >> }; >> >> GOACC_2.0.1 { > > You shouldn't be adding new symbols into a symbol version that appeared in a > compiler that shipped already (GCC 5 already had GOACC_2.0 symbols). > So it should go into GOACC_2.0.1. OK. >> diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c >> index af067d6..497ab92 100644 >> --- a/libgomp/oacc-mem.c >> +++ b/libgomp/oacc-mem.c >> @@ -204,6 +204,38 @@ acc_deviceptr (void *h) >> return d; >> } >> >> +/* This function is used as a helper in generated code to implement pointer >> + lookup in host_data regions. Unlike acc_deviceptr, it returns its argument >> + unchanged on a shared-memory system (e.g. the host). */ >> + >> +void * >> +GOACC_deviceptr (void *h) >> +{ >> + splay_tree_key n; >> + void *d; >> + void *offset; >> + >> + goacc_lazy_initialize (); >> + >> + struct goacc_thread *thr = goacc_thread (); >> + >> + if ((thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) == 0) >> + { >> + n = lookup_host (thr->dev, h, 1); > > What is supposed to be the behavior when the h pointer points at object > boundary, rather than into the middle of existing mapped object? Probably undefined with the way that OpenACC is defined. > Say you have: > char a[16], b[0], c[16]; // b is GCC extension > Now, char *p = &a[5]; is unambiguous, either a is mapped, or not. > But, if p = &a[16];, then it could be either the one-past-last byte in a, > or it could be the start of b (== one-past-last byte in b) or it could be > the pointer to start of c. > > In OpenMP 4.5, I had endless discussions about this and the end result is > that one-past-last byte addresses are unspecified behavior OK. Thanks for you feedback. Cesar
On Mon, 26 Oct 2015 19:34:22 +0100 Jakub Jelinek <jakub@redhat.com> wrote: > Your use_device sounds very similar to use_device_ptr clause in > OpenMP, which is allowed on #pragma omp target data construct and is > implemented quite a bit differently from this; it is unclear if the > OpenACC standard requires this kind of implementation, or you just > chose to implement it this way. In particular, the GOMP_target_data > call puts the variables mentioned in the use_device_ptr clauses into > the mapping structures (similarly how map clause appears) and the > corresponding vars are privatized within the target data region > (which is a host region, basically a fancy { } braces), where the > private variables contain the offloading device's pointers. As the author of the original patch, I have to say using the mapping structures seems like a far better approach, but I've hit some trouble with the details of adapting OpenACC to use that method. Firstly, on trunk at least, use_device_ptr variables are restricted to pointer or array types: that restriction doesn't exist in OpenACC, nor actually could I find it in the OpenMP 4.1 document (my guess is the standards are supposed to match in this regard). I think that a program such as this should work: void target_fn (int *targ_data); int main (int argc, char *argv[]) { char out; int myvar; #pragma omp target enter data map(to: myvar) #pragma omp target data use_device_ptr(myvar) map(from:out) { target_fn (&myvar); out = 5; } return 0; } "myvar" would have its address taken in the use_device_ptr region, and places where the corresponding mapped variable has its address taken would be replaced by a direct use of the mapped pointer. (Or is that not a well-formed thing to do, in general?). This fails with "error: 'use_device_ptr' variable is neither a pointer nor an array". Secondly, attempts to use use_device_ptr on (e.g. dynamically-allocated) arrays accessed through a pointer cause an ICE with the existing trunk OpenMP code: #include <stdlib.h> void target_fn (char *targ_data); int main (int argc, char *argv[]) { char *myarr, out; myarr = malloc (1024); #pragma omp target data map(to: myarr[0:1024]) { #pragma omp target data use_device_ptr(myarr) map(from:out) { target_fn (myarr); out = 5; } } return 0; } udp3.c: In function 'main': udp3.c:6:1: internal compiler error: in make_decl_rtl, at varasm.c:1298 main (int argc, char *argv[]) ^ 0x111256b make_decl_rtl(tree_node*) /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/varasm.c:1294 0x9ea005 expand_expr_real_1(tree_node*, rtx_def*, machine_mode, expand_modifier, rtx_def**, bool) /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/expr.c:9559 0x9e31c2 expand_expr_real(tree_node*, rtx_def*, machine_mode, expand_modifier, rtx_def**, bool) /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/expr.c:7892 0x9cb4ae expand_expr /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/expr.h:255 0x9d907d expand_assignment(tree_node*, tree_node*, bool) /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/expr.c:5089 0x89e219 expand_gimple_stmt_1 /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/cfgexpand.c:3576 0x89e60d expand_gimple_stmt /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/cfgexpand.c:3672 0x8a5773 expand_gimple_basic_block /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/cfgexpand.c:5676 0x8a72d4 execute /scratch/jbrown/openacc-trunk/src/gcc-mainline/gcc/cfgexpand.c:6288 Furthermore, this looks strange to me (006t.omplower): .omp_data_arr.5.out = &out; myarr.8 = myarr; .omp_data_arr.5.myarr = myarr.8; #pragma omp target data map(from:out [len: 1]) use_device_ptr(myarr) { D.2436 = .omp_data_arr.5.myarr; myarr = D.2436; That's clobbering the original myarr variable, right? Any clues on these two? The omp-low.c code is rather opaque to me... Thanks, Julian
On Mon, Nov 02, 2015 at 06:33:39PM +0000, Julian Brown wrote: > As the author of the original patch, I have to say using the mapping > structures seems like a far better approach, but I've hit some trouble > with the details of adapting OpenACC to use that method. > > Firstly, on trunk at least, use_device_ptr variables are restricted to > pointer or array types: that restriction doesn't exist in OpenACC, nor > actually could I find it in the OpenMP 4.1 document (my guess is the ^^ 4.5 ;) > standards are supposed to match in this regard). I think that a program > such as this should work: > > void target_fn (int *targ_data); > > int > main (int argc, char *argv[]) > { > char out; > int myvar; > #pragma omp target enter data map(to: myvar) > > #pragma omp target data use_device_ptr(myvar) map(from:out) > { > target_fn (&myvar); > out = 5; > } > > return 0; > } You are right, I've been misreading the standard (the only testcases that have been provided with the change were using arrays and so it was non-obvious that it relies on array to pointer decay). I'll work on changing the implementation accordingly tomorrow. Jakub
On Mon, Nov 02, 2015 at 06:33:39PM +0000, Julian Brown wrote: > Firstly, on trunk at least, use_device_ptr variables are restricted to > pointer or array types: that restriction doesn't exist in OpenACC, nor > actually could I find it in the OpenMP 4.1 document (my guess is the > standards are supposed to match in this regard). I think that a program > such as this should work: So, after talking about this on omp-lang, it seems there is agreement that only arrays and pointer types (or reference to arrays or pointers) should be allowed in use_device_ptr clause and that for pointers/reference to pointers it should probably act the way I've coded it up, i.e. that for them it translates the pointer to point to corresponding object to the one to which it points on the host. It is too late to change the standard now, but will be changed soon, and hopefully clarified in examples. > void target_fn (int *targ_data); > > int > main (int argc, char *argv[]) > { > char out; > int myvar; > #pragma omp target enter data map(to: myvar) > > #pragma omp target data use_device_ptr(myvar) map(from:out) > { > target_fn (&myvar); > out = 5; > } > > return 0; > } That would make the above non-conforming for OpenMP. > Secondly, attempts to use use_device_ptr on (e.g. > dynamically-allocated) arrays accessed through a pointer cause an ICE > with the existing trunk OpenMP code: > > #include <stdlib.h> > > void target_fn (char *targ_data); > > int > main (int argc, char *argv[]) > { > char *myarr, out; > > myarr = malloc (1024); > > #pragma omp target data map(to: myarr[0:1024]) > { > #pragma omp target data use_device_ptr(myarr) map(from:out) > { > target_fn (myarr); > out = 5; > } > } > > return 0; > } Can't reproduce this ICE (at least not on gomp-4_5-branch, but there aren't significant changes from the trunk there). > Furthermore, this looks strange to me (006t.omplower): > > .omp_data_arr.5.out = &out; > myarr.8 = myarr; > .omp_data_arr.5.myarr = myarr.8; > #pragma omp target data map(from:out [len: 1]) use_device_ptr(myarr) > { > D.2436 = .omp_data_arr.5.myarr; > myarr = D.2436; > > That's clobbering the original myarr variable, right? Just use -fdump-tree-omplower-uid to see that it is a different variable. Basically, for OpenMP use_device_ptr creates a private copy of the pointer for the body of the target data construct, and that pointer is assigned the target device's address. For arrays the implementation creates an artificial pointer variable (holding the start of the array initially) and replaces all references to the array in the target data body with dereference of the pointer. Jakub
diff --git a/gcc/c-family/c-pragma.c b/gcc/c-family/c-pragma.c index 834a916..b748e2f 100644 --- a/gcc/c-family/c-pragma.c +++ b/gcc/c-family/c-pragma.c @@ -1214,6 +1214,7 @@ static const struct omp_pragma_def oacc_pragmas[] = { { "data", PRAGMA_OACC_DATA }, { "enter", PRAGMA_OACC_ENTER_DATA }, { "exit", PRAGMA_OACC_EXIT_DATA }, + { "host_data", PRAGMA_OACC_HOST_DATA }, { "kernels", PRAGMA_OACC_KERNELS }, { "loop", PRAGMA_OACC_LOOP }, { "parallel", PRAGMA_OACC_PARALLEL }, diff --git a/gcc/c-family/c-pragma.h b/gcc/c-family/c-pragma.h index cec920f..23a72a3 100644 --- a/gcc/c-family/c-pragma.h +++ b/gcc/c-family/c-pragma.h @@ -31,6 +31,7 @@ enum pragma_kind { PRAGMA_OACC_DATA, PRAGMA_OACC_ENTER_DATA, PRAGMA_OACC_EXIT_DATA, + PRAGMA_OACC_HOST_DATA, PRAGMA_OACC_KERNELS, PRAGMA_OACC_LOOP, PRAGMA_OACC_PARALLEL, @@ -161,6 +162,7 @@ enum pragma_omp_clause { PRAGMA_OACC_CLAUSE_PRESENT_OR_CREATE, PRAGMA_OACC_CLAUSE_SELF, PRAGMA_OACC_CLAUSE_SEQ, + PRAGMA_OACC_CLAUSE_USE_DEVICE, PRAGMA_OACC_CLAUSE_VECTOR, PRAGMA_OACC_CLAUSE_VECTOR_LENGTH, PRAGMA_OACC_CLAUSE_WAIT, diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 704ebc6..ead98b9 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -10116,6 +10116,8 @@ c_parser_omp_clause_name (c_parser *parser) result = PRAGMA_OMP_CLAUSE_UNTIED; else if (!strcmp ("use_device_ptr", p)) result = PRAGMA_OMP_CLAUSE_USE_DEVICE_PTR; + else if (!strcmp ("use_device", p)) + result = PRAGMA_OACC_CLAUSE_USE_DEVICE; break; case 'v': if (!strcmp ("vector", p)) @@ -11219,6 +11221,15 @@ c_parser_oacc_clause_async (c_parser *parser, tree list) return list; } +/* OpenACC 2.0: + use_device ( variable-list ) */ + +static tree +c_parser_oacc_clause_use_device (c_parser *parser, tree list) +{ + return c_parser_omp_var_list_parens (parser, OMP_CLAUSE_USE_DEVICE, list); +} + /* OpenACC: wait ( int-expr-list ) */ @@ -12474,6 +12485,10 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask, clauses = c_parser_oacc_data_clause (parser, c_kind, clauses); c_name = "self"; break; + case PRAGMA_OACC_CLAUSE_USE_DEVICE: + clauses = c_parser_oacc_clause_use_device (parser, clauses); + c_name = "use_device"; + break; case PRAGMA_OACC_CLAUSE_VECTOR_LENGTH: clauses = c_parser_omp_clause_vector_length (parser, clauses); c_name = "vector_length"; @@ -13003,6 +13018,29 @@ c_parser_oacc_enter_exit_data (c_parser *parser, bool enter) /* OpenACC 2.0: + # pragma acc host_data oacc-data-clause[optseq] new-line + structured-block +*/ + +#define OACC_HOST_DATA_CLAUSE_MASK \ + ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_USE_DEVICE) ) + +static tree +c_parser_oacc_host_data (location_t loc, c_parser *parser) +{ + tree stmt, clauses, block; + + clauses = c_parser_oacc_all_clauses (parser, OACC_HOST_DATA_CLAUSE_MASK, + "#pragma acc host_data"); + + block = c_begin_omp_parallel (); + add_stmt (c_parser_omp_structured_block (parser)); + stmt = c_finish_oacc_host_data (loc, clauses, block); + return stmt; +} + + +/* OpenACC 2.0: # pragma acc loop oacc-loop-clause[optseq] new-line structured-block @@ -16075,6 +16113,9 @@ c_parser_omp_construct (c_parser *parser) case PRAGMA_OACC_DATA: stmt = c_parser_oacc_data (loc, parser); break; + case PRAGMA_OACC_HOST_DATA: + stmt = c_parser_oacc_host_data (loc, parser); + break; case PRAGMA_OACC_KERNELS: strcpy (p_name, "#pragma acc"); stmt = c_parser_oacc_kernels (loc, parser, p_name); diff --git a/gcc/c/c-tree.h b/gcc/c/c-tree.h index bee03d3..a9c5975 100644 --- a/gcc/c/c-tree.h +++ b/gcc/c/c-tree.h @@ -643,6 +643,7 @@ extern tree c_expr_to_decl (tree, bool *, bool *); extern tree c_finish_oacc_parallel (location_t, tree, tree); extern tree c_finish_oacc_kernels (location_t, tree, tree); extern tree c_finish_oacc_data (location_t, tree, tree); +extern tree c_finish_oacc_host_data (location_t, tree, tree); extern tree c_begin_omp_parallel (void); extern tree c_finish_omp_parallel (location_t, tree, tree); extern tree c_begin_omp_task (void); diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c index bc43602..a5e2a4a 100644 --- a/gcc/c/c-typeck.c +++ b/gcc/c/c-typeck.c @@ -11510,6 +11510,25 @@ c_finish_oacc_data (location_t loc, tree clauses, tree block) return add_stmt (stmt); } +/* Generate OACC_HOST_DATA, with CLAUSES and BLOCK as its compound + statement. LOC is the location of the OACC_HOST_DATA. */ + +tree +c_finish_oacc_host_data (location_t loc, tree clauses, tree block) +{ + tree stmt; + + block = c_end_compound_stmt (loc, block, true); + + stmt = make_node (OACC_HOST_DATA); + TREE_TYPE (stmt) = void_type_node; + OACC_HOST_DATA_CLAUSES (stmt) = clauses; + OACC_HOST_DATA_BODY (stmt) = block; + SET_EXPR_LOCATION (stmt, loc); + + return add_stmt (stmt); +} + /* Like c_begin_compound_stmt, except force the retention of the BLOCK. */ tree @@ -12942,6 +12961,7 @@ c_finish_omp_clauses (tree clauses, bool is_omp, bool declare_simd) case OMP_CLAUSE_GANG: case OMP_CLAUSE_WORKER: case OMP_CLAUSE_VECTOR: + case OMP_CLAUSE_USE_DEVICE: pc = &OMP_CLAUSE_CHAIN (c); continue; diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h index 16db41f..76ece42 100644 --- a/gcc/cp/cp-tree.h +++ b/gcc/cp/cp-tree.h @@ -6318,6 +6318,7 @@ extern void finish_omp_threadprivate (tree); extern tree begin_omp_structured_block (void); extern tree finish_omp_structured_block (tree); extern tree finish_oacc_data (tree, tree); +extern tree finish_oacc_host_data (tree, tree); extern tree finish_oacc_kernels (tree, tree); extern tree finish_oacc_parallel (tree, tree); extern tree begin_omp_parallel (void); diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index f07a5e4..714e69c 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -29235,6 +29235,8 @@ cp_parser_omp_clause_name (cp_parser *parser) result = PRAGMA_OMP_CLAUSE_UNTIED; else if (!strcmp ("use_device_ptr", p)) result = PRAGMA_OMP_CLAUSE_USE_DEVICE_PTR; + else if (!strcmp ("use_device", p)) + result = PRAGMA_OACC_CLAUSE_USE_DEVICE; break; case 'v': if (!strcmp ("vector_length", p)) @@ -31381,6 +31383,11 @@ cp_parser_oacc_all_clauses (cp_parser *parser, omp_clause_mask mask, clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses); c_name = "self"; break; + case PRAGMA_OACC_CLAUSE_USE_DEVICE: + clauses = cp_parser_omp_var_list (parser, OMP_CLAUSE_USE_DEVICE, + clauses); + c_name = "use_device"; + break; case PRAGMA_OACC_CLAUSE_VECTOR_LENGTH: clauses = cp_parser_oacc_clause_vector_length (parser, clauses); c_name = "vector_length"; @@ -34221,6 +34228,30 @@ cp_parser_oacc_data (cp_parser *parser, cp_token *pragma_tok) return stmt; } +#define OACC_HOST_DATA_CLAUSE_MASK \ + ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_USE_DEVICE) ) + +/* OpenACC 2.0: + # pragma acc host_data <clauses> new-line + structured-block */ + +static tree +cp_parser_oacc_host_data (cp_parser *parser, cp_token *pragma_tok) +{ + tree stmt, clauses, block; + unsigned int save; + + clauses = cp_parser_oacc_all_clauses (parser, OACC_HOST_DATA_CLAUSE_MASK, + "#pragma acc host_data", pragma_tok); + + block = begin_omp_parallel (); + save = cp_parser_begin_omp_structured_block (parser); + cp_parser_statement (parser, NULL_TREE, false, NULL); + cp_parser_end_omp_structured_block (parser, save); + stmt = finish_oacc_host_data (clauses, block); + return stmt; +} + /* OpenACC 2.0: # pragma acc enter data oacc-enter-data-clause[optseq] new-line @@ -35288,6 +35319,9 @@ cp_parser_omp_construct (cp_parser *parser, cp_token *pragma_tok) case PRAGMA_OACC_EXIT_DATA: stmt = cp_parser_oacc_enter_exit_data (parser, pragma_tok, false); break; + case PRAGMA_OACC_HOST_DATA: + stmt = cp_parser_oacc_host_data (parser, pragma_tok); + break; case PRAGMA_OACC_KERNELS: stmt = cp_parser_oacc_kernels (parser, pragma_tok); break; @@ -35856,6 +35890,7 @@ cp_parser_pragma (cp_parser *parser, enum pragma_context context) case PRAGMA_OACC_DATA: case PRAGMA_OACC_ENTER_DATA: case PRAGMA_OACC_EXIT_DATA: + case PRAGMA_OACC_HOST_DATA: case PRAGMA_OACC_KERNELS: case PRAGMA_OACC_PARALLEL: case PRAGMA_OACC_LOOP: diff --git a/gcc/cp/semantics.c b/gcc/cp/semantics.c index c0a8b32..25482e7 100644 --- a/gcc/cp/semantics.c +++ b/gcc/cp/semantics.c @@ -6689,6 +6689,7 @@ finish_omp_clauses (tree clauses, bool allow_fields, bool declare_simd) case OMP_CLAUSE_SIMD: case OMP_CLAUSE_DEFAULTMAP: case OMP_CLAUSE__CILK_FOR_COUNT_: + case OMP_CLAUSE_USE_DEVICE: break; case OMP_CLAUSE_INBRANCH: @@ -7119,6 +7120,24 @@ finish_oacc_data (tree clauses, tree block) return add_stmt (stmt); } +/* Generate OACC_HOST_DATA, with CLAUSES and BLOCK as its compound + statement. */ + +tree +finish_oacc_host_data (tree clauses, tree block) +{ + tree stmt; + + block = finish_omp_structured_block (block); + + stmt = make_node (OACC_HOST_DATA); + TREE_TYPE (stmt) = void_type_node; + OACC_HOST_DATA_CLAUSES (stmt) = clauses; + OACC_HOST_DATA_BODY (stmt) = block; + + return add_stmt (stmt); +} + /* Generate OACC_KERNELS, with CLAUSES and BLOCK as its compound statement. LOC is the location of the OACC_KERNELS. */ diff --git a/gcc/gimplify.c b/gcc/gimplify.c index ab9e540..0c32219 100644 --- a/gcc/gimplify.c +++ b/gcc/gimplify.c @@ -93,6 +93,8 @@ enum gimplify_omp_var_data GOVD_MAP_0LEN_ARRAY = 32768, + GOVD_USE_DEVICE = 65536, + GOVD_DATA_SHARE_CLASS = (GOVD_SHARED | GOVD_PRIVATE | GOVD_FIRSTPRIVATE | GOVD_LASTPRIVATE | GOVD_REDUCTION | GOVD_LINEAR | GOVD_LOCAL) @@ -116,7 +118,9 @@ enum omp_region_type ORT_COMBINED_TARGET = 33, /* Dummy OpenMP region, used to disable expansion of DECL_VALUE_EXPRs in taskloop pre body. */ - ORT_NONE = 64 + ORT_NONE = 64, + /* An OpenACC host-data region. */ + ORT_HOST_DATA = 128 }; /* Gimplify hashtable helper. */ @@ -6338,6 +6342,10 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p, decl = TREE_OPERAND (decl, 0); } goto do_add_decl; + case OMP_CLAUSE_USE_DEVICE: + flags = GOVD_USE_DEVICE | GOVD_EXPLICIT; + check_non_private = "use_device"; + goto do_add; case OMP_CLAUSE_LINEAR: if (gimplify_expr (&OMP_CLAUSE_LINEAR_STEP (c), pre_p, NULL, is_gimple_val, fb_rvalue) == GS_ERROR) @@ -7005,7 +7013,6 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p, break; case OMP_CLAUSE_DEVICE_RESIDENT: - case OMP_CLAUSE_USE_DEVICE: case OMP_CLAUSE_INDEPENDENT: remove = true; break; @@ -7529,6 +7536,127 @@ gimplify_oacc_cache (tree *expr_p, gimple_seq *pre_p) *expr_p = NULL_TREE; } +static tree +gimplify_oacc_host_data_1 (tree *tp, int *walk_subtrees, + void *data ATTRIBUTE_UNUSED) +{ + splay_tree_node n = NULL; + location_t loc = EXPR_LOCATION (*tp); + + switch (TREE_CODE (*tp)) + { + case ADDR_EXPR: + { + tree decl = TREE_OPERAND (*tp, 0); + + switch (TREE_CODE (decl)) + { + case ARRAY_REF: + case ARRAY_RANGE_REF: + case COMPONENT_REF: + case VIEW_CONVERT_EXPR: + case REALPART_EXPR: + case IMAGPART_EXPR: + if (TREE_CODE (TREE_OPERAND (decl, 0)) == VAR_DECL) + n = splay_tree_lookup (gimplify_omp_ctxp->variables, + (splay_tree_key) TREE_OPERAND (decl, 0)); + break; + + case VAR_DECL: + n = splay_tree_lookup (gimplify_omp_ctxp->variables, + (splay_tree_key) decl); + break; + + default: + ; + } + + if (n != NULL && (n->value & GOVD_USE_DEVICE) != 0) + { + tree t = builtin_decl_explicit (BUILT_IN_GOACC_DEVICEPTR); + *tp = build_call_expr_loc (loc, t, 1, *tp); + } + + *walk_subtrees = 0; + } + break; + + case VAR_DECL: + { + tree decl = *tp; + + n = splay_tree_lookup (gimplify_omp_ctxp->variables, + (splay_tree_key) decl); + + if (n != NULL && (n->value & GOVD_USE_DEVICE) != 0) + { + if (!POINTER_TYPE_P (TREE_TYPE (decl))) + return decl; + + tree t = builtin_decl_explicit (BUILT_IN_GOACC_DEVICEPTR); + *tp = build_call_expr_loc (loc, t, 1, *tp); + *walk_subtrees = 0; + } + } + break; + + case OACC_PARALLEL: + case OACC_KERNELS: + case OACC_LOOP: + *walk_subtrees = 0; + break; + + default: + ; + } + + return NULL_TREE; +} + +static enum gimplify_status +gimplify_oacc_host_data (tree *expr_p, gimple_seq *pre_p) +{ + tree expr = *expr_p, orig_body; + gimple_seq body = NULL; + + gimplify_scan_omp_clauses (&OACC_HOST_DATA_CLAUSES (expr), pre_p, + ORT_HOST_DATA, OACC_HOST_DATA); + + orig_body = OACC_HOST_DATA_BODY (expr); + + /* Perform a pre-pass over the host_data region's body, inserting calls to + GOACC_deviceptr where appropriate. */ + + tree ret = walk_tree_without_duplicates (&orig_body, + &gimplify_oacc_host_data_1, 0); + + if (ret) + { + error_at (EXPR_LOCATION (expr), + "undefined use of variable %qE in host_data region", + DECL_NAME (ret)); + gimplify_adjust_omp_clauses (pre_p, &OACC_HOST_DATA_CLAUSES (expr), + OACC_HOST_DATA); + return GS_ERROR; + } + + push_gimplify_context (); + + gimple *g = gimplify_and_return_first (orig_body, &body); + + if (gimple_code (g) == GIMPLE_BIND) + pop_gimplify_context (g); + else + pop_gimplify_context (NULL); + + gimplify_adjust_omp_clauses (pre_p, &OACC_HOST_DATA_CLAUSES (expr), + OACC_HOST_DATA); + + gimplify_seq_add_stmt (pre_p, g); + + return GS_ALL_DONE; +} + /* Gimplify the contents of an OMP_PARALLEL statement. This involves gimplification of the body, as well as scanning the body for used variables. We need to do this scan now, because variable-sized @@ -9595,6 +9723,9 @@ gimplify_expr (tree *expr_p, gimple_seq *pre_p, gimple_seq *post_p, break; case OACC_HOST_DATA: + ret = gimplify_oacc_host_data (expr_p, pre_p); + break; + case OACC_DECLARE: sorry ("directive not yet implemented"); ret = GS_ALL_DONE; diff --git a/gcc/omp-builtins.def b/gcc/omp-builtins.def index ea9cf0d..9ed075f 100644 --- a/gcc/omp-builtins.def +++ b/gcc/omp-builtins.def @@ -47,6 +47,8 @@ DEF_GOACC_BUILTIN (BUILT_IN_GOACC_UPDATE, "GOACC_update", DEF_GOACC_BUILTIN (BUILT_IN_GOACC_WAIT, "GOACC_wait", BT_FN_VOID_INT_INT_VAR, ATTR_NOTHROW_LIST) +DEF_GOACC_BUILTIN (BUILT_IN_GOACC_DEVICEPTR, "GOACC_deviceptr", + BT_FN_PTR_PTR, ATTR_CONST_NOTHROW_LEAF_LIST) DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_THREAD_NUM, "GOACC_get_thread_num", BT_FN_INT, ATTR_CONST_NOTHROW_LEAF_LIST) DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_NUM_THREADS, "GOACC_get_num_threads", diff --git a/gcc/testsuite/c-c++-common/goacc/host_data-1.c b/gcc/testsuite/c-c++-common/goacc/host_data-1.c new file mode 100644 index 0000000..521c854 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/host_data-1.c @@ -0,0 +1,13 @@ +/* Test valid use of host_data directive. */ +/* { dg-do compile } */ + +int v0; +int v1[3][3]; + +void +f (void) +{ + int v2 = 3; +#pragma acc host_data use_device(v2, v0, v1) + ; +} diff --git a/gcc/testsuite/c-c++-common/goacc/host_data-2.c b/gcc/testsuite/c-c++-common/goacc/host_data-2.c new file mode 100644 index 0000000..e5213a0 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/host_data-2.c @@ -0,0 +1,13 @@ +/* Test invalid use of host_data directive. */ +/* { dg-do compile } */ + +int v0; +#pragma acc host_data use_device(v0) /* { dg-error "expected" } */ + +void +f (void) +{ + int v2 = 3; +#pragma acc host_data copy(v2) /* { dg-error "not valid for" } */ + ; +} diff --git a/gcc/testsuite/c-c++-common/goacc/host_data-3.c b/gcc/testsuite/c-c++-common/goacc/host_data-3.c new file mode 100644 index 0000000..f9621c9 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/host_data-3.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ + +int main (int argc, char* argv[]) +{ + int x = 5, y; + + #pragma acc enter data copyin (x) + /* It's not clear what attempts to use non-pointer variables "directly" + (rather than merely taking their address) should do in host_data regions. + We choose to make it an error. */ + #pragma acc host_data use_device (x) /* TODO { dg-error "" } */ + { + y = x; + } + #pragma acc exit data delete (x) + + return y - 5; +} diff --git a/gcc/testsuite/c-c++-common/goacc/host_data-4.c b/gcc/testsuite/c-c++-common/goacc/host_data-4.c new file mode 100644 index 0000000..3dac5f3 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/host_data-4.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ + +int main (int argc, char* argv[]) +{ + int x[100]; + + #pragma acc enter data copyin (x) + /* Specifying an array index is not valid for host_data/use_device. */ + #pragma acc host_data use_device (x[4]) /* { dg-error "expected '\\\)' before '\\\[' token" } */ + ; + #pragma acc exit data delete (x) + + return 0; +} diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map index 2153661..2a43a8c 100644 --- a/libgomp/libgomp.map +++ b/libgomp/libgomp.map @@ -378,6 +378,7 @@ GOACC_2.0 { GOACC_wait; GOACC_get_thread_num; GOACC_get_num_threads; + GOACC_deviceptr; }; GOACC_2.0.1 { diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c index af067d6..497ab92 100644 --- a/libgomp/oacc-mem.c +++ b/libgomp/oacc-mem.c @@ -204,6 +204,38 @@ acc_deviceptr (void *h) return d; } +/* This function is used as a helper in generated code to implement pointer + lookup in host_data regions. Unlike acc_deviceptr, it returns its argument + unchanged on a shared-memory system (e.g. the host). */ + +void * +GOACC_deviceptr (void *h) +{ + splay_tree_key n; + void *d; + void *offset; + + goacc_lazy_initialize (); + + struct goacc_thread *thr = goacc_thread (); + + if ((thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) == 0) + { + n = lookup_host (thr->dev, h, 1); + + if (!n) + return NULL; + + offset = h - n->host_start; + + d = n->tgt->tgt_start + n->tgt_offset + offset; + + return d; + } + else + return h; +} + /* Return the host pointer that corresponds to device data D. Or NULL if no mapping. */ diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c new file mode 100644 index 0000000..15ccb27 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c @@ -0,0 +1,125 @@ +/* { dg-do run { target openacc_nvidia_accel_selected } } */ +/* { dg-additional-options "-lcuda -lcublas -lcudart" } */ + +#include <stdio.h> +#include <stdlib.h> +#include <openacc.h> +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <cublas_v2.h> + +void +saxpy_host (int n, float a, float *x, float *y) +{ + int i; + + for (i = 0; i < n; i++) + y[i] = y[i] + a * x[i]; +} + +#pragma acc routine +void +saxpy_target (int n, float a, float *x, float *y) +{ + int i; + + for (i = 0; i < n; i++) + y[i] = y[i] + a * x[i]; +} + +int +main(int argc, char **argv) +{ + const int N = 8; + int i; + float *x_ref, *y_ref; + float *x, *y; + cublasHandle_t h; + float a = 2.0; + + x_ref = (float*) malloc (N * sizeof(float)); + y_ref = (float*) malloc (N * sizeof(float)); + + x = (float*) malloc (N * sizeof(float)); + y = (float*) malloc (N * sizeof(float)); + +#pragma acc data copyin (x[0:N]) copy (y[0:N]) + { + float *xp, *yp; +#pragma acc host_data use_device (x, y) + { +#pragma acc parallel pcopy (xp, yp) present (x, y) + { + xp = x; + yp = y; + } + } + + if (xp != acc_deviceptr (x) || yp != acc_deviceptr (y)) + abort (); + } + + for (i = 0; i < N; i++) + { + x[i] = x_ref[i] = 4.0 + i; + y[i] = y_ref[i] = 3.0; + } + + saxpy_host (N, a, x_ref, y_ref); + + cublasCreate (&h); + +#pragma acc data copyin (x[0:N]) copy (y[0:N]) + { +#pragma acc host_data use_device (x, y) + { + cublasSaxpy (h, N, &a, x, 1, y, 1); + } + } + + for (i = 0; i < N; i++) + { + if (y[i] != y_ref[i]) + abort (); + } + +#pragma acc data create (x[0:N]) copyout (y[0:N]) + { +#pragma acc kernels + for (i = 0; i < N; i++) + y[i] = 3.0; + +#pragma acc host_data use_device (x, y) + { + cublasSaxpy (h, N, &a, x, 1, y, 1); + } + } + + cublasDestroy (h); + + for (i = 0; i < N; i++) + { + if (y[i] != y_ref[i]) + abort (); + } + + for (i = 0; i < N; i++) + y[i] = 3.0; + +#pragma acc data copyin (x[0:N]) copyin (a, N) copy (y[0:N]) + { +#pragma acc host_data use_device (x, y) + { +#pragma acc parallel present (x[0:N]) pcopy (y[0:N]) present (a, N) + saxpy_target (N, a, x, y); + } + } + + for (i = 0; i < N; i++) + { + if (y[i] != y_ref[i]) + abort (); + } + + return 0; +} diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-2.c new file mode 100644 index 0000000..511ec64 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-2.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ + +#include <stdlib.h> + +struct by_lightning { + int a; + int b; + int c; +}; + +int main (int argc, char* argv[]) +{ + int x; + void *q = NULL, *r = NULL, *p = NULL, *s = NULL, *t = NULL; + long u; + struct by_lightning on_the_head = {1, 2, 3}; + int arr[10], *f = NULL; + _Complex float cf; + #pragma acc enter data copyin (x, arr, on_the_head, cf) + #pragma acc host_data use_device (x, arr, on_the_head, cf) + { + q = &x; + { + f = &arr[5]; + r = f; + s = &__real__ cf; + t = &on_the_head.c; + u = (long) &__imag__ cf; + #pragma acc parallel copyout(p) present (x, arr, on_the_head, cf) + { + /* This will not (and must not) call GOACC_deviceptr, but '&x' will be + the address on the device (if appropriate) regardless. */ + p = &x; + } + } + } + #pragma acc exit data delete (x) + +#if ACC_MEM_SHARED + if (q != &x || f != &arr[5] || r != f || s != &(__real__ cf) + || t != &on_the_head.c || u != (long) &(__imag__ cf) || p != &x) + abort (); +#else + if (q == &x || f == &arr[5] || r != f || s == &(__real__ cf) + || t == &on_the_head.c || u == (long) &(__imag__ cf) || p == &x) + abort (); +#endif + + return 0; +}