[gomp4.1] depend nowait support for target {update,{enter,exit} data}

Message ID	20150908092014.GA1847@tucnak.redhat.com
State	New
Headers	show Return-Path: <gcc-patches-return-406854-incoming=patchwork.ozlabs.org@gcc.gnu.org> DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:date :from:to:cc:subject:message-id:reply-to:mime-version :content-type; q=dns; s=default; b=vwzyt9uP0rJTk4DcL+8vYjvljSoAE KmSa+SboXg0WGHnFKTP5zO36l+8BhBCx3gxcpOzwqBVRcHwVUyQa0AddCFYxaMxq DO5ZhBdihzI6afvZtmUqpLS6U0jkyDTZtyg8HtYadpZqICrA52RO94TEzT/cgUqW Pj7b3KmX+6jTFw= Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org Date: Tue, 8 Sep 2015 11:20:14 +0200 From: Jakub Jelinek <jakub@redhat.com> To: Ilya Verbin <iverbin@gmail.com>, Aldy Hernandez <aldyh@redhat.com> Cc: gcc-patches@gcc.gnu.org Subject: [gomp4.1] depend nowait support for target {update, {enter, exit} data} Message-ID: <20150908092014.GA1847@tucnak.redhat.com> Reply-To: Jakub Jelinek <jakub@redhat.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline User-Agent: Mutt/1.5.23 (2014-03-12)

--- libgomp/target.c.jj 2015-09-03 16:51:06.000000000 +0200 +++ libgomp/target.c 2015-09-08 09:55:24.591484158 +0200 @@ -899,13 +899,6 @@ gomp_update (struct gomp_device_descr *d - n->host_start), cur_node.host_end - cur_node.host_start); } - else - { - gomp_mutex_unlock (&devicep->lock); - gomp_fatal ("Trying to update [%p..%p) object that is not mapped", - (void *) cur_node.host_start, - (void *) cur_node.host_end); - } } gomp_mutex_unlock (&devicep->lock); } @@ -1460,18 +1453,50 @@ GOMP_target_update_41 (int device, size_ /* If there are depend clauses, but nowait is not present, block the parent task until the dependencies are resolved and then just continue with the rest of the function as if it - is a merged task. */ + is a merged task. Until we are able to schedule task during + variable mapping or unmapping, ignore nowait if depend clauses + are not present. */ if (depend != NULL) { struct gomp_thread *thr = gomp_thread (); if (thr->task && thr->task->depend_hash) - gomp_task_maybe_wait_for_dependencies (depend); + { + if ((flags & GOMP_TARGET_FLAG_NOWAIT) + && thr->ts.team + && !thr->task->final_task) + { + gomp_create_target_task (devicep, (void (*) (void *)) NULL, + mapnum, hostaddrs, sizes, kinds, + flags | GOMP_TARGET_FLAG_UPDATE, + depend); + return; + } + + struct gomp_team *team = thr->ts.team; + /* If parallel or taskgroup has been cancelled, don't start new + tasks. */ + if (team + && (gomp_team_barrier_cancelled (&team->barrier) + || (thr->task->taskgroup + && thr->task->taskgroup->cancelled))) + return; + + gomp_task_maybe_wait_for_dependencies (depend); + } } if (devicep == NULL || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) return; + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + /* If parallel or taskgroup has been cancelled, don't start new tasks. */ + if (team + && (gomp_team_barrier_cancelled (&team->barrier) + || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) + return; + gomp_update (devicep, mapnum, hostaddrs, sizes, kinds, true); } @@ -1548,18 +1573,49 @@ GOMP_target_enter_exit_data (int device, /* If there are depend clauses, but nowait is not present, block the parent task until the dependencies are resolved and then just continue with the rest of the function as if it - is a merged task. */ + is a merged task. Until we are able to schedule task during + variable mapping or unmapping, ignore nowait if depend clauses + are not present. */ if (depend != NULL) { struct gomp_thread *thr = gomp_thread (); if (thr->task && thr->task->depend_hash) - gomp_task_maybe_wait_for_dependencies (depend); + { + if ((flags & GOMP_TARGET_FLAG_NOWAIT) + && thr->ts.team + && !thr->task->final_task) + { + gomp_create_target_task (devicep, (void (*) (void *)) NULL, + mapnum, hostaddrs, sizes, kinds, + flags, depend); + return; + } + + struct gomp_team *team = thr->ts.team; + /* If parallel or taskgroup has been cancelled, don't start new + tasks. */ + if (team + && (gomp_team_barrier_cancelled (&team->barrier) + || (thr->task->taskgroup + && thr->task->taskgroup->cancelled))) + return; + + gomp_task_maybe_wait_for_dependencies (depend); + } } if (devicep == NULL || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) return; + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + /* If parallel or taskgroup has been cancelled, don't start new tasks. */ + if (team + && (gomp_team_barrier_cancelled (&team->barrier) + || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) + return; + size_t i; if ((flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0) for (i = 0; i < mapnum; i++) @@ -1577,6 +1633,40 @@ GOMP_target_enter_exit_data (int device, } void +gomp_target_task_fn (void *data) +{ + struct gomp_target_task *ttask = (struct gomp_target_task *) data; + if (ttask->fn != NULL) + { + /* GOMP_target_41 */ + } + else if (ttask->devicep == NULL + || !(ttask->devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) + return; + + size_t i; + if (ttask->flags & GOMP_TARGET_FLAG_UPDATE) + gomp_update (ttask->devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes, + ttask->kinds, true); + else if ((ttask->flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0) + for (i = 0; i < ttask->mapnum; i++) + if ((ttask->kinds[i] & 0xff) == GOMP_MAP_STRUCT) + { + gomp_map_vars (ttask->devicep, ttask->sizes[i] + 1, + &ttask->hostaddrs[i], NULL, &ttask->sizes[i], + &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA); + i += ttask->sizes[i]; + } + else + gomp_map_vars (ttask->devicep, 1, &ttask->hostaddrs[i], NULL, + &ttask->sizes[i], &ttask->kinds[i], + true, GOMP_MAP_VARS_ENTER_DATA); + else + gomp_exit_data (ttask->devicep, ttask->mapnum, ttask->hostaddrs, + ttask->sizes, ttask->kinds); +} + +void GOMP_teams (unsigned int num_teams, unsigned int thread_limit) { if (thread_limit) --- libgomp/task.c.jj 2015-09-02 15:22:14.000000000 +0200 +++ libgomp/task.c 2015-09-08 10:20:54.163978966 +0200 @@ -108,6 +108,123 @@ gomp_clear_parent (struct gomp_task *chi while (task != children); } +/* Helper function for GOMP_task and gomp_create_target_task. Depend clause + handling for undeferred task creation. */ + +static void +gomp_task_handle_depend (struct gomp_task *task, struct gomp_task *parent, + void **depend) +{ + size_t ndepend = (uintptr_t) depend[0]; + size_t nout = (uintptr_t) depend[1]; + size_t i; + hash_entry_type ent; + + task->depend_count = ndepend; + task->num_dependees = 0; + if (parent->depend_hash == NULL) + parent->depend_hash = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12); + for (i = 0; i < ndepend; i++) + { + task->depend[i].addr = depend[2 + i]; + task->depend[i].next = NULL; + task->depend[i].prev = NULL; + task->depend[i].task = task; + task->depend[i].is_in = i >= nout; + task->depend[i].redundant = false; + task->depend[i].redundant_out = false; + + hash_entry_type *slot = htab_find_slot (&parent->depend_hash, + &task->depend[i], INSERT); + hash_entry_type out = NULL, last = NULL; + if (*slot) + { + /* If multiple depends on the same task are the same, all but the + first one are redundant. As inout/out come first, if any of them + is inout/out, it will win, which is the right semantics. */ + if ((*slot)->task == task) + { + task->depend[i].redundant = true; + continue; + } + for (ent = *slot; ent; ent = ent->next) + { + if (ent->redundant_out) + break; + + last = ent; + + /* depend(in:...) doesn't depend on earlier depend(in:...). */ + if (i >= nout && ent->is_in) + continue; + + if (!ent->is_in) + out = ent; + + struct gomp_task *tsk = ent->task; + if (tsk->dependers == NULL) + { + tsk->dependers + = gomp_malloc (sizeof (struct gomp_dependers_vec) + + 6 * sizeof (struct gomp_task *)); + tsk->dependers->n_elem = 1; + tsk->dependers->allocated = 6; + tsk->dependers->elem[0] = task; + task->num_dependees++; + continue; + } + /* We already have some other dependency on tsk from earlier + depend clause. */ + else if (tsk->dependers->n_elem + && (tsk->dependers->elem[tsk->dependers->n_elem - 1] + == task)) + continue; + else if (tsk->dependers->n_elem == tsk->dependers->allocated) + { + tsk->dependers->allocated + = tsk->dependers->allocated * 2 + 2; + tsk->dependers + = gomp_realloc (tsk->dependers, + sizeof (struct gomp_dependers_vec) + + (tsk->dependers->allocated + * sizeof (struct gomp_task *))); + } + tsk->dependers->elem[tsk->dependers->n_elem++] = task; + task->num_dependees++; + } + task->depend[i].next = *slot; + (*slot)->prev = &task->depend[i]; + } + *slot = &task->depend[i]; + + /* There is no need to store more than one depend({,in}out:) task per + address in the hash table chain for the purpose of creation of + deferred tasks, because each out depends on all earlier outs, thus it + is enough to record just the last depend({,in}out:). For depend(in:), + we need to keep all of the previous ones not terminated yet, because + a later depend({,in}out:) might need to depend on all of them. So, if + the new task's clause is depend({,in}out:), we know there is at most + one other depend({,in}out:) clause in the list (out). For + non-deferred tasks we want to see all outs, so they are moved to the + end of the chain, after first redundant_out entry all following + entries should be redundant_out. */ + if (!task->depend[i].is_in && out) + { + if (out != last) + { + out->next->prev = out->prev; + out->prev->next = out->next; + out->next = last->next; + out->prev = last; + last->next = out; + if (out->next) + out->next->prev = out; + } + out->redundant_out = true; + } + } +} + /* Called when encountering an explicit task directive. If IF_CLAUSE is false, then we must not delay in executing the task. If UNTIED is true, then the task may be executed by any member of the team. @@ -248,123 +365,7 @@ GOMP_task (void (*fn) (void *), void *da taskgroup->num_children++; if (depend_size) { - size_t ndepend = (uintptr_t) depend[0]; - size_t nout = (uintptr_t) depend[1]; - size_t i; - hash_entry_type ent; - - task->depend_count = ndepend; - task->num_dependees = 0; - if (parent->depend_hash == NULL) - parent->depend_hash - = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12); - for (i = 0; i < ndepend; i++) - { - task->depend[i].addr = depend[2 + i]; - task->depend[i].next = NULL; - task->depend[i].prev = NULL; - task->depend[i].task = task; - task->depend[i].is_in = i >= nout; - task->depend[i].redundant = false; - task->depend[i].redundant_out = false; - - hash_entry_type *slot - = htab_find_slot (&parent->depend_hash, &task->depend[i], - INSERT); - hash_entry_type out = NULL, last = NULL; - if (*slot) - { - /* If multiple depends on the same task are the - same, all but the first one are redundant. - As inout/out come first, if any of them is - inout/out, it will win, which is the right - semantics. */ - if ((*slot)->task == task) - { - task->depend[i].redundant = true; - continue; - } - for (ent = *slot; ent; ent = ent->next) - { - if (ent->redundant_out) - break; - - last = ent; - - /* depend(in:...) doesn't depend on earlier - depend(in:...). */ - if (i >= nout && ent->is_in) - continue; - - if (!ent->is_in) - out = ent; - - struct gomp_task *tsk = ent->task; - if (tsk->dependers == NULL) - { - tsk->dependers - = gomp_malloc (sizeof (struct gomp_dependers_vec) - + 6 * sizeof (struct gomp_task *)); - tsk->dependers->n_elem = 1; - tsk->dependers->allocated = 6; - tsk->dependers->elem[0] = task; - task->num_dependees++; - continue; - } - /* We already have some other dependency on tsk - from earlier depend clause. */ - else if (tsk->dependers->n_elem - && (tsk->dependers->elem[tsk->dependers->n_elem - - 1] - == task)) - continue; - else if (tsk->dependers->n_elem - == tsk->dependers->allocated) - { - tsk->dependers->allocated - = tsk->dependers->allocated * 2 + 2; - tsk->dependers - = gomp_realloc (tsk->dependers, - sizeof (struct gomp_dependers_vec) - + (tsk->dependers->allocated - * sizeof (struct gomp_task *))); - } - tsk->dependers->elem[tsk->dependers->n_elem++] = task; - task->num_dependees++; - } - task->depend[i].next = *slot; - (*slot)->prev = &task->depend[i]; - } - *slot = &task->depend[i]; - - /* There is no need to store more than one depend({,in}out:) - task per address in the hash table chain for the purpose - of creation of deferred tasks, because each out - depends on all earlier outs, thus it is enough to record - just the last depend({,in}out:). For depend(in:), we need - to keep all of the previous ones not terminated yet, because - a later depend({,in}out:) might need to depend on all of - them. So, if the new task's clause is depend({,in}out:), - we know there is at most one other depend({,in}out:) clause - in the list (out). For non-deferred tasks we want to see - all outs, so they are moved to the end of the chain, - after first redundant_out entry all following entries - should be redundant_out. */ - if (!task->depend[i].is_in && out) - { - if (out != last) - { - out->next->prev = out->prev; - out->prev->next = out->next; - out->next = last->next; - out->prev = last; - last->next = out; - if (out->next) - out->next->prev = out; - } - out->redundant_out = true; - } - } + gomp_task_handle_depend (task, parent, depend); if (task->num_dependees) { gomp_mutex_unlock (&team->task_lock); @@ -444,6 +445,128 @@ ialias (GOMP_taskgroup_end) #undef UTYPE #undef GOMP_taskloop +/* Called for nowait target tasks. */ + +void +gomp_create_target_task (struct gomp_device_descr *devicep, + void (*fn) (void *), size_t mapnum, void **hostaddrs, + size_t *sizes, unsigned short *kinds, + unsigned int flags, void **depend) +{ + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + + /* If parallel or taskgroup has been cancelled, don't start new tasks. */ + if (team + && (gomp_team_barrier_cancelled (&team->barrier) + || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) + return; + + struct gomp_target_task *ttask; + struct gomp_task *task; + struct gomp_task *parent = thr->task; + struct gomp_taskgroup *taskgroup = parent->taskgroup; + bool do_wake; + size_t depend_size = 0; + + if (depend != NULL) + depend_size = ((uintptr_t) depend[0] + * sizeof (struct gomp_task_depend_entry)); + task = gomp_malloc (sizeof (*task) + depend_size + + sizeof (*ttask) + + mapnum * (sizeof (void *) + sizeof (size_t) + + sizeof (unsigned short))); + gomp_init_task (task, parent, gomp_icv (false)); + task->kind = GOMP_TASK_WAITING; + task->in_tied_task = parent->in_tied_task; + task->taskgroup = taskgroup; + ttask = (struct gomp_target_task *) &task->depend[(uintptr_t) depend[0]]; + ttask->devicep = devicep; + ttask->fn = fn; + ttask->mapnum = mapnum; + memcpy (ttask->hostaddrs, hostaddrs, mapnum * sizeof (void *)); + ttask->sizes = (size_t *) &ttask->hostaddrs[mapnum]; + memcpy (ttask->sizes, sizes, mapnum * sizeof (size_t)); + ttask->kinds = (unsigned short *) &ttask->sizes[mapnum]; + memcpy (ttask->kinds, kinds, mapnum * sizeof (unsigned short)); + ttask->flags = flags; + task->fn = gomp_target_task_fn; + task->fn_data = ttask; + task->final_task = 0; + gomp_mutex_lock (&team->task_lock); + /* If parallel or taskgroup has been cancelled, don't start new tasks. */ + if (__builtin_expect (gomp_team_barrier_cancelled (&team->barrier) + || (taskgroup && taskgroup->cancelled), 0)) + { + gomp_mutex_unlock (&team->task_lock); + gomp_finish_task (task); + free (task); + return; + } + if (taskgroup) + taskgroup->num_children++; + if (depend_size) + { + gomp_task_handle_depend (task, parent, depend); + if (task->num_dependees) + { + gomp_mutex_unlock (&team->task_lock); + return; + } + } + if (parent->children) + { + task->next_child = parent->children; + task->prev_child = parent->children->prev_child; + task->next_child->prev_child = task; + task->prev_child->next_child = task; + } + else + { + task->next_child = task; + task->prev_child = task; + } + parent->children = task; + if (taskgroup) + { + /* If applicable, place task into its taskgroup. */ + if (taskgroup->children) + { + task->next_taskgroup = taskgroup->children; + task->prev_taskgroup = taskgroup->children->prev_taskgroup; + task->next_taskgroup->prev_taskgroup = task; + task->prev_taskgroup->next_taskgroup = task; + } + else + { + task->next_taskgroup = task; + task->prev_taskgroup = task; + } + taskgroup->children = task; + } + if (team->task_queue) + { + task->next_queue = team->task_queue; + task->prev_queue = team->task_queue->prev_queue; + task->next_queue->prev_queue = task; + task->prev_queue->next_queue = task; + } + else + { + task->next_queue = task; + task->prev_queue = task; + team->task_queue = task; + } + ++team->task_count; + ++team->task_queued_count; + gomp_team_barrier_set_task_pending (&team->barrier); + do_wake = team->task_running_count + !parent->in_tied_task + < team->nthreads; + gomp_mutex_unlock (&team->task_lock); + if (do_wake) + gomp_team_barrier_wake (&team->barrier, 1); +} + static inline bool gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent, struct gomp_taskgroup *taskgroup, struct gomp_team *team) --- libgomp/libgomp.h.jj 2015-09-03 16:33:42.000000000 +0200 +++ libgomp/libgomp.h 2015-09-08 10:18:44.164897688 +0200 @@ -374,6 +374,17 @@ struct gomp_taskgroup size_t num_children; }; +struct gomp_target_task +{ + struct gomp_device_descr *devicep; + void (*fn) (void *); + size_t mapnum; + size_t *sizes; + unsigned short *kinds; + unsigned int flags; + void *hostaddrs[]; +}; + /* This structure describes a "team" of threads. These are the threads that are spawned by a PARALLEL constructs, as well as the work sharing constructs that the team encounters. */ @@ -653,6 +664,10 @@ extern void gomp_init_task (struct gomp_ extern void gomp_end_task (void); extern void gomp_barrier_handle_tasks (gomp_barrier_state_t); extern void gomp_task_maybe_wait_for_dependencies (void **); +extern void gomp_create_target_task (struct gomp_device_descr *, + void (*) (void *), size_t, void **, + size_t *, unsigned short *, unsigned int, + void **); static void inline gomp_finish_task (struct gomp_task *task) @@ -673,6 +688,7 @@ extern void gomp_free_thread (void *); extern void gomp_init_targets_once (void); extern int gomp_get_num_devices (void); +extern void gomp_target_task_fn (void *); typedef struct splay_tree_node_s *splay_tree_node; typedef struct splay_tree_s *splay_tree; --- libgomp/testsuite/libgomp.c/target-27.c.jj 2015-09-07 17:56:17.503966807 +0200 +++ libgomp/testsuite/libgomp.c/target-27.c 2015-09-07 18:46:23.000000000 +0200 @@ -0,0 +1,67 @@ +#include <stdlib.h> +#include <unistd.h> + +int +main () +{ + int x = 0, y = 0, z = 0, err; + int shared_mem = 0; + #pragma omp target map(to: shared_mem) + shared_mem = 1; + #pragma omp parallel + #pragma omp single + { + #pragma omp task depend(in: x) + { + usleep (5000); + x = 1; + } + #pragma omp task depend(in: x) + { + usleep (6000); + y = 2; + } + #pragma omp task depend(out: z) + { + usleep (7000); + z = 3; + } + #pragma omp target enter data map(to: x, y, z) depend(inout: x, z) nowait + #pragma omp task depend(inout: x, z) + { + x++; y++; z++; + } + #pragma omp target update to(x, y) depend(inout: x) nowait + #pragma omp target enter data map(always, to: z) depend(inout: z) nowait + #pragma omp target map (alloc: x, y, z) map (from: err) depend(inout: x, z) + { + err = x != 2 || y != 3 || z != 4; + x = 5; y = 6; z = 7; + } + #pragma omp task depend(in: x) + { + usleep (5000); + if (!shared_mem) + x = 1; + } + #pragma omp task depend(in: x) + { + usleep (6000); + if (!shared_mem) + y = 2; + } + #pragma omp task depend(out: z) + { + usleep (3000); + if (!shared_mem) + z = 3; + } + #pragma omp target exit data map(release: z) depend(inout: z) nowait + #pragma omp target exit data map(from: x, y) depend(inout: x) nowait + #pragma omp target exit data map(from: z) depend(inout: z) nowait + #pragma omp taskwait + if (err || x != 5 || y != 6 || z != 7) + abort (); + } + return 0; +}

[gomp4.1] depend nowait support for target {update,{enter,exit} data}

Commit Message

Patch