From: Chung-Lin Tang Date: Thu, 26 May 2016 13:28:25 +0000 (+0000) Subject: oacc-plugin.h (GOMP_PLUGIN_async_unmap_vars): Add int parameter. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b4557008c45c0a44ae848c71d31ce4ed6316d043;p=gcc.git oacc-plugin.h (GOMP_PLUGIN_async_unmap_vars): Add int parameter. 2016-05-26 Chung-Lin Tang libgomp/ * oacc-plugin.h (GOMP_PLUGIN_async_unmap_vars): Add int parameter. * oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Add 'int async' parameter, use to set async stream around call to gomp_unmap_vars, call gomp_unmap_vars() with 'do_copyfrom' set to true. * plugin/plugin-nvptx.c (struct ptx_event): Add 'int val' field. (event_gc): Adjust event handling loop, collect PTX_EVT_ASYNC_CLEANUP events and call GOMP_PLUGIN_async_unmap_vars() for each of them. (event_add): Add int parameter, initialize 'val' field when adding new ptx_event struct. (nvptx_exec): Adjust event_add() call arguments. (nvptx_host2dev): Likewise. (nvptx_dev2host): Likewise. (nvptx_wait_async): Likewise. (nvptx_wait_all_async): Likewise. (GOMP_OFFLOAD_openacc_register_async_cleanup): Add async parameter, pass to event_add() call. * oacc-host.c (host_openacc_register_async_cleanup): Add 'int async' parameter. * oacc-mem.c (gomp_acc_remove_pointer): Adjust async case to call openacc.register_async_cleanup_func() hook. * oacc-parallel.c (GOACC_parallel_keyed): Likewise. * target.c (gomp_copy_from_async): Delete function. (gomp_map_vars): Remove async_refcount. (gomp_unmap_vars): Likewise. (gomp_load_image_to_device): Likewise. (omp_target_associate_ptr): Likewise. * libgomp.h (struct splay_tree_key_s): Remove async_refcount. (acc_dispatch_t.register_async_cleanup_func): Add int parameter. (gomp_copy_from_async): Remove. 
From-SVN: r236772 --- diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index e2496ff5833..b4ae304dabb 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,35 @@ +2016-05-26 Chung-Lin Tang + + * oacc-plugin.h (GOMP_PLUGIN_async_unmap_vars): Add int parameter. + * oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Add 'int async' + parameter, use to set async stream around call to gomp_unmap_vars, + call gomp_unmap_vars() with 'do_copyfrom' set to true. + * plugin/plugin-nvptx.c (struct ptx_event): Add 'int val' field. + (event_gc): Adjust event handling loop, collect PTX_EVT_ASYNC_CLEANUP + events and call GOMP_PLUGIN_async_unmap_vars() for each of them. + (event_add): Add int parameter, initialize 'val' field when + adding new ptx_event struct. + (nvptx_evec): Adjust event_add() call arguments. + (nvptx_host2dev): Likewise. + (nvptx_dev2host): Likewise. + (nvptx_wait_async): Likewise. + (nvptx_wait_all_async): Likewise. + (GOMP_OFFLOAD_openacc_register_async_cleanup): Add async parameter, + pass to event_add() call. + * oacc-host.c (host_openacc_register_async_cleanup): Add 'int async' + parameter. + * oacc-mem.c (gomp_acc_remove_pointer): Adjust async case to + call openacc.register_async_cleanup_func() hook. + * oacc-parallel.c (GOACC_parallel_keyed): Likewise. + * target.c (gomp_copy_from_async): Delete function. + (gomp_map_vars): Remove async_refcount. + (gomp_unmap_vars): Likewise. + (gomp_load_image_to_device): Likewise. + (omp_target_associate_ptr): Likewise. + * libgomp.h (struct splay_tree_key_s): Remove async_refcount. + (acc_dispatch_t.register_async_cleanup_func): Add int parameter. + (gomp_copy_from_async): Remove. + 2016-05-26 Chung-Lin Tang * target.c (gomp_device_copy): New function. diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index f0c048b151b..7b2671ba49d 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -835,8 +835,6 @@ struct splay_tree_key_s { uintptr_t tgt_offset; /* Reference count. 
*/ uintptr_t refcount; - /* Asynchronous reference count. */ - uintptr_t async_refcount; /* Pointer to the original mapping of "omp declare target link" object. */ splay_tree_key link_key; }; @@ -872,7 +870,7 @@ typedef struct acc_dispatch_t unsigned *, void *); /* Async cleanup callback registration. */ - void (*register_async_cleanup_func) (void *); + void (*register_async_cleanup_func) (void *, int); /* Asynchronous routines. */ int (*async_test_func) (int); @@ -977,7 +975,6 @@ extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *, size_t, void **, void **, size_t *, void *, bool, enum gomp_map_vars_kind); -extern void gomp_copy_from_async (struct target_mem_desc *); extern void gomp_unmap_vars (struct target_mem_desc *, bool); extern void gomp_init_device (struct gomp_device_descr *); extern void gomp_free_memmap (struct splay_tree_s *); diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c index a24899c7f60..fd3a672e4a5 100644 --- a/libgomp/oacc-host.c +++ b/libgomp/oacc-host.c @@ -148,7 +148,8 @@ host_openacc_exec (void (*fn) (void *), } static void -host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused))) +host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)), + int async __attribute__ ((unused))) { } diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c index 2aaa0d295cb..bd4b62b006e 100644 --- a/libgomp/oacc-mem.c +++ b/libgomp/oacc-mem.c @@ -704,10 +704,7 @@ gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum) if (async < acc_async_noval) gomp_unmap_vars (t, true); else - { - gomp_copy_from_async (t); - acc_dev->openacc.register_async_cleanup_func (t); - } + t->device_descr->openacc.register_async_cleanup_func (t, async); gomp_debug (0, " %s: mappings restored\n", __FUNCTION__); } diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c index 1fdb01d927b..ecdd75adcb8 100644 --- a/libgomp/oacc-parallel.c +++ b/libgomp/oacc-parallel.c @@ -186,10 +186,7 @@ 
GOACC_parallel_keyed (int device, void (*fn) (void *), if (async < acc_async_noval) gomp_unmap_vars (tgt, true); else - { - gomp_copy_from_async (tgt); - acc_dev->openacc.register_async_cleanup_func (tgt); - } + tgt->device_descr->openacc.register_async_cleanup_func (tgt, async); acc_dev->openacc.async_set_async_func (acc_async_sync); } diff --git a/libgomp/oacc-plugin.c b/libgomp/oacc-plugin.c index 54d8840b27d..889d86c8c7a 100644 --- a/libgomp/oacc-plugin.c +++ b/libgomp/oacc-plugin.c @@ -31,11 +31,14 @@ #include "oacc-int.h" void -GOMP_PLUGIN_async_unmap_vars (void *ptr) +GOMP_PLUGIN_async_unmap_vars (void *ptr, int async) { struct target_mem_desc *tgt = ptr; + struct gomp_device_descr *devicep = tgt->device_descr; - gomp_unmap_vars (tgt, false); + devicep->openacc.async_set_async_func (async); + gomp_unmap_vars (tgt, true); + devicep->openacc.async_set_async_func (acc_async_sync); } /* Return the target-specific part of the TLS data for the current thread. */ diff --git a/libgomp/oacc-plugin.h b/libgomp/oacc-plugin.h index d2e4fbff01f..57fced5a6e4 100644 --- a/libgomp/oacc-plugin.h +++ b/libgomp/oacc-plugin.h @@ -27,7 +27,7 @@ #ifndef OACC_PLUGIN_H #define OACC_PLUGIN_H 1 -extern void GOMP_PLUGIN_async_unmap_vars (void *); +extern void GOMP_PLUGIN_async_unmap_vars (void *, int); extern void *GOMP_PLUGIN_acc_thread (void); #endif diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 2b6a888cbd2..327500c01aa 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -329,6 +329,7 @@ struct ptx_event int type; void *addr; int ord; + int val; struct ptx_event *next; }; @@ -789,6 +790,7 @@ static void event_gc (bool memmap_lockable) { struct ptx_event *ptx_event = ptx_events; + struct ptx_event *async_cleanups = NULL; struct nvptx_thread *nvthd = nvptx_thread (); pthread_mutex_lock (&ptx_event_lock); @@ -806,6 +808,7 @@ event_gc (bool memmap_lockable) r = cuEventQuery (*e->evt); if (r == CUDA_SUCCESS) { + bool 
append_async = false; CUevent *te; te = e->evt; @@ -830,7 +833,7 @@ event_gc (bool memmap_lockable) if (!memmap_lockable) continue; - GOMP_PLUGIN_async_unmap_vars (e->addr); + append_async = true; } break; } @@ -838,6 +841,7 @@ event_gc (bool memmap_lockable) cuEventDestroy (*te); free ((void *)te); + /* Unlink 'e' from ptx_events list. */ if (ptx_events == e) ptx_events = ptx_events->next; else @@ -848,15 +852,31 @@ event_gc (bool memmap_lockable) e_->next = e_->next->next; } - free (e); + if (append_async) + { + e->next = async_cleanups; + async_cleanups = e; + } + else + free (e); } } pthread_mutex_unlock (&ptx_event_lock); + + /* We have to do these here, after ptx_event_lock is released. */ + while (async_cleanups) + { + struct ptx_event *e = async_cleanups; + async_cleanups = async_cleanups->next; + + GOMP_PLUGIN_async_unmap_vars (e->addr, e->val); + free (e); + } } static void -event_add (enum ptx_event_type type, CUevent *e, void *h) +event_add (enum ptx_event_type type, CUevent *e, void *h, int val) { struct ptx_event *ptx_event; struct nvptx_thread *nvthd = nvptx_thread (); @@ -869,6 +889,7 @@ event_add (enum ptx_event_type type, CUevent *e, void *h) ptx_event->evt = e; ptx_event->addr = h; ptx_event->ord = nvthd->ptx_dev->ord; + ptx_event->val = val; pthread_mutex_lock (&ptx_event_lock); @@ -975,7 +996,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream); - event_add (PTX_EVT_KNL, e, (void *)dev_str); + event_add (PTX_EVT_KNL, e, (void *)dev_str, 0); } #else r = cuCtxSynchronize (); @@ -1071,7 +1092,7 @@ nvptx_host2dev (void *d, const void *h, size_t s) CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) d, h, s, nvthd->current_stream->stream); CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream); - event_add (PTX_EVT_MEM, e, (void *)h); + event_add (PTX_EVT_MEM, e, (void *)h, 0); } else #endif @@ -1127,7 +1148,7 @@ nvptx_dev2host (void *h, const void *d, size_t s) CUDA_CALL 
(cuMemcpyDtoHAsync, h, (CUdeviceptr) d, s, nvthd->current_stream->stream); CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream); - event_add (PTX_EVT_MEM, e, (void *)h); + event_add (PTX_EVT_MEM, e, (void *)h, 0); } else #endif @@ -1240,7 +1261,7 @@ nvptx_wait_async (int async1, int async2) CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream); - event_add (PTX_EVT_SYNC, e, NULL); + event_add (PTX_EVT_SYNC, e, NULL, 0); CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0); } @@ -1313,7 +1334,7 @@ nvptx_wait_all_async (int async) /* Record an event on the waited-for stream. */ CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream); - event_add (PTX_EVT_SYNC, e, NULL); + event_add (PTX_EVT_SYNC, e, NULL, 0); CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0); } @@ -1646,14 +1667,14 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum, } void -GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc) +GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async) { struct nvptx_thread *nvthd = nvptx_thread (); CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream); - event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc); + event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async); } int diff --git a/libgomp/target.c b/libgomp/target.c index 5a86fc077e6..48b9ab8e076 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -707,7 +707,6 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, tgt->list[i].offset = 0; tgt->list[i].length = k->host_end - k->host_start; k->refcount = 1; - k->async_refcount = 0; tgt->refcount++; array->left = NULL; array->right = NULL; @@ -854,43 +853,9 @@ gomp_unmap_tgt (struct target_mem_desc *tgt) free (tgt); } -/* Decrease the refcount for a set of mapped variables, and queue asychronous - copies from the device back to the 
host after any work that has been issued. - Because the regions are still "live", increment an asynchronous reference - count to indicate that they should not be unmapped from host-side data - structures until the asynchronous copy has completed. */ - -attribute_hidden void -gomp_copy_from_async (struct target_mem_desc *tgt) -{ - struct gomp_device_descr *devicep = tgt->device_descr; - size_t i; - - gomp_mutex_lock (&devicep->lock); - - for (i = 0; i < tgt->list_count; i++) - if (tgt->list[i].key == NULL) - ; - else if (tgt->list[i].key->refcount > 1) - { - tgt->list[i].key->refcount--; - tgt->list[i].key->async_refcount++; - } - else - { - splay_tree_key k = tgt->list[i].key; - if (tgt->list[i].copy_from) - gomp_copy_dev2host (devicep, (void *) k->host_start, - (void *) (k->tgt->tgt_start + k->tgt_offset), - k->host_end - k->host_start); - } - - gomp_mutex_unlock (&devicep->lock); -} - /* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant variables back from device to host: if it is false, it is assumed that this - has been done already, i.e. by gomp_copy_from_async above. */ + has been done already. 
*/ attribute_hidden void gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom) @@ -924,13 +889,8 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom) k->refcount--; else if (k->refcount == 1) { - if (k->async_refcount > 0) - k->async_refcount--; - else - { - k->refcount--; - do_unmap = true; - } + k->refcount--; + do_unmap = true; } if ((do_unmap && do_copyfrom && tgt->list[i].copy_from) @@ -1076,7 +1036,6 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version, k->tgt = tgt; k->tgt_offset = target_table[i].start; k->refcount = REFCOUNT_INFINITY; - k->async_refcount = 0; k->link_key = NULL; array->left = NULL; array->right = NULL; @@ -1109,7 +1068,6 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version, k->tgt = tgt; k->tgt_offset = target_var->start; k->refcount = target_size & link_bit ? REFCOUNT_LINK : REFCOUNT_INFINITY; - k->async_refcount = 0; k->link_key = NULL; array->left = NULL; array->right = NULL; @@ -2332,7 +2290,6 @@ omp_target_associate_ptr (void *host_ptr, void *device_ptr, size_t size, k->tgt = tgt; k->tgt_offset = (uintptr_t) device_ptr + device_offset; k->refcount = REFCOUNT_INFINITY; - k->async_refcount = 0; array->left = NULL; array->right = NULL; splay_tree_insert (&devicep->mem_map, array);