From 1f4c5b9bb2eb81880e2bc725435d596fcd2bdfef Mon Sep 17 00:00:00 2001 From: Chung-Lin Tang Date: Mon, 13 May 2019 13:32:00 +0000 Subject: [PATCH] 2019-05-13 Chung-Lin Tang Reviewed-by: Thomas Schwinge libgomp/ * libgomp-plugin.h (struct goacc_asyncqueue): Declare. (struct goacc_asyncqueue_list): Likewise. (goacc_aq): Likewise. (goacc_aq_list): Likewise. (GOMP_OFFLOAD_openacc_register_async_cleanup): Remove. (GOMP_OFFLOAD_openacc_async_test): Remove. (GOMP_OFFLOAD_openacc_async_test_all): Remove. (GOMP_OFFLOAD_openacc_async_wait): Remove. (GOMP_OFFLOAD_openacc_async_wait_async): Remove. (GOMP_OFFLOAD_openacc_async_wait_all): Remove. (GOMP_OFFLOAD_openacc_async_wait_all_async): Remove. (GOMP_OFFLOAD_openacc_async_set_async): Remove. (GOMP_OFFLOAD_openacc_exec): Adjust declaration. (GOMP_OFFLOAD_openacc_cuda_get_stream): Likewise. (GOMP_OFFLOAD_openacc_cuda_set_stream): Likewise. (GOMP_OFFLOAD_openacc_async_exec): Declare. (GOMP_OFFLOAD_openacc_async_construct): Declare. (GOMP_OFFLOAD_openacc_async_destruct): Declare. (GOMP_OFFLOAD_openacc_async_test): Declare. (GOMP_OFFLOAD_openacc_async_synchronize): Declare. (GOMP_OFFLOAD_openacc_async_serialize): Declare. (GOMP_OFFLOAD_openacc_async_queue_callback): Declare. (GOMP_OFFLOAD_openacc_async_host2dev): Declare. (GOMP_OFFLOAD_openacc_async_dev2host): Declare. * libgomp.h (struct acc_dispatch_t): Define 'async' sub-struct. (gomp_acc_insert_pointer): Adjust declaration. (gomp_copy_host2dev): New declaration. (gomp_copy_dev2host): Likewise. (gomp_map_vars_async): Likewise. (gomp_unmap_tgt): Likewise. (gomp_unmap_vars_async): Likewise. (gomp_fini_device): Likewise. * oacc-async.c (get_goacc_thread): New function. (get_goacc_thread_device): New function. (lookup_goacc_asyncqueue): New function. (get_goacc_asyncqueue): New function. (acc_async_test): Adjust code to use new async design. (acc_async_test_all): Likewise. (acc_wait): Likewise. (acc_wait_async): Likewise. (acc_wait_all): Likewise. (acc_wait_all_async): Likewise. 
(goacc_async_free): New function. (goacc_init_asyncqueues): Likewise. (goacc_fini_asyncqueues): Likewise. * oacc-cuda.c (acc_get_cuda_stream): Adjust code to use new async design. (acc_set_cuda_stream): Likewise. * oacc-host.c (host_openacc_exec): Adjust parameters, remove 'async'. (host_openacc_register_async_cleanup): Remove. (host_openacc_async_exec): New function. (host_openacc_async_test): Adjust parameters. (host_openacc_async_test_all): Remove. (host_openacc_async_wait): Remove. (host_openacc_async_wait_async): Remove. (host_openacc_async_wait_all): Remove. (host_openacc_async_wait_all_async): Remove. (host_openacc_async_set_async): Remove. (host_openacc_async_synchronize): New function. (host_openacc_async_serialize): New function. (host_openacc_async_host2dev): New function. (host_openacc_async_dev2host): New function. (host_openacc_async_queue_callback): New function. (host_openacc_async_construct): New function. (host_openacc_async_destruct): New function. (struct gomp_device_descr host_dispatch): Remove initialization of old interface, add initialization of new async sub-struct. * oacc-init.c (acc_shutdown_1): Adjust to use gomp_fini_device. (goacc_attach_host_thread_to_device): Remove old async code usage. * oacc-int.h (goacc_init_asyncqueues): New declaration. (goacc_fini_asyncqueues): Likewise. (goacc_async_copyout_unmap_vars): Likewise. (goacc_async_free): Likewise. (get_goacc_asyncqueue): Likewise. (lookup_goacc_asyncqueue): Likewise. * oacc-mem.c (memcpy_tofrom_device): Adjust code to use new async design. (present_create_copy): Adjust code to use new async design. (delete_copyout): Likewise. (update_dev_host): Likewise. (gomp_acc_insert_pointer): Add async parameter, adjust code to use new async design. (gomp_acc_remove_pointer): Adjust code to use new async design. * oacc-parallel.c (GOACC_parallel_keyed): Adjust code to use new async design. (GOACC_enter_exit_data): Likewise. (goacc_wait): Likewise. (GOACC_update): Likewise. 
* oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Change to assert fail when called, warn as obsolete in comment. * target.c (goacc_device_copy_async): New function. (gomp_copy_host2dev): Remove 'static', add goacc_asyncqueue parameter, add goacc_device_copy_async case. (gomp_copy_dev2host): Likewise. (gomp_map_vars_existing): Add goacc_asyncqueue parameter, adjust code. (gomp_map_pointer): Likewise. (gomp_map_fields_existing): Likewise. (gomp_map_vars_internal): New always_inline function, renamed from gomp_map_vars. (gomp_map_vars): Implement by calling gomp_map_vars_internal. (gomp_map_vars_async): Implement by calling gomp_map_vars_internal, passing goacc_asyncqueue argument. (gomp_unmap_tgt): Remove static, add attribute_hidden. (gomp_unref_tgt): New function. (gomp_unmap_vars_internal): New always_inline function, renamed from gomp_unmap_vars. (gomp_unmap_vars): Implement by calling gomp_unmap_vars_internal. (gomp_unmap_vars_async): Implement by calling gomp_unmap_vars_internal, passing goacc_asyncqueue argument. (gomp_fini_device): New function. (gomp_exit_data): Adjust gomp_copy_dev2host call. (gomp_load_plugin_for_device): Remove old interface, adjust to load new async interface. (gomp_target_fini): Adjust code to call gomp_fini_device. * plugin/plugin-nvptx.c (struct cuda_map): Remove. (struct ptx_stream): Remove. (struct nvptx_thread): Remove current_stream field. (cuda_map_create): Remove. (cuda_map_destroy): Remove. (map_init): Remove. (map_fini): Remove. (map_pop): Remove. (map_push): Remove. (struct goacc_asyncqueue): Define. (struct nvptx_callback): Define. (struct ptx_free_block): Define. (struct ptx_device): Remove null_stream, active_streams, async_streams, stream_lock, and next fields. (enum ptx_event_type): Remove. (struct ptx_event): Remove. (ptx_event_lock): Remove. (ptx_events): Remove. (init_streams_for_device): Remove. (fini_streams_for_device): Remove. (select_stream_for_async): Remove. 
(nvptx_init): Remove ptx_events and ptx_event_lock references. (nvptx_attach_host_thread_to_device): Remove CUDA_ERROR_NOT_PERMITTED case. (nvptx_open_device): Add free_blocks initialization, remove init_streams_for_device call. (nvptx_close_device): Remove fini_streams_for_device call, add free_blocks destruct code. (event_gc): Remove. (event_add): Remove. (nvptx_exec): Adjust parameters and code. (nvptx_free): Likewise. (nvptx_host2dev): Remove. (nvptx_dev2host): Remove. (nvptx_set_async): Remove. (nvptx_async_test): Remove. (nvptx_async_test_all): Remove. (nvptx_wait): Remove. (nvptx_wait_async): Remove. (nvptx_wait_all): Remove. (nvptx_wait_all_async): Remove. (nvptx_get_cuda_stream): Remove. (nvptx_set_cuda_stream): Remove. (GOMP_OFFLOAD_alloc): Adjust code. (GOMP_OFFLOAD_free): Likewise. (GOMP_OFFLOAD_openacc_register_async_cleanup): Remove. (GOMP_OFFLOAD_openacc_exec): Adjust parameters and code. (GOMP_OFFLOAD_openacc_async_test_all): Remove. (GOMP_OFFLOAD_openacc_async_wait): Remove. (GOMP_OFFLOAD_openacc_async_wait_async): Remove. (GOMP_OFFLOAD_openacc_async_wait_all): Remove. (GOMP_OFFLOAD_openacc_async_wait_all_async): Remove. (GOMP_OFFLOAD_openacc_async_set_async): Remove. (cuda_free_argmem): New function. (GOMP_OFFLOAD_openacc_async_exec): New plugin hook function. (GOMP_OFFLOAD_openacc_create_thread_data): Adjust code. (GOMP_OFFLOAD_openacc_cuda_get_stream): Adjust code. (GOMP_OFFLOAD_openacc_cuda_set_stream): Adjust code. (GOMP_OFFLOAD_openacc_async_construct): New plugin hook function. (GOMP_OFFLOAD_openacc_async_destruct): New plugin hook function. (GOMP_OFFLOAD_openacc_async_test): Remove and re-implement. (GOMP_OFFLOAD_openacc_async_synchronize): New plugin hook function. (GOMP_OFFLOAD_openacc_async_serialize): New plugin hook function. (GOMP_OFFLOAD_openacc_async_queue_callback): New plugin hook function. (cuda_callback_wrapper): New function. (cuda_memcpy_sanity_check): New function. (GOMP_OFFLOAD_host2dev): Remove and re-implement. 
(GOMP_OFFLOAD_dev2host): Remove and re-implement. (GOMP_OFFLOAD_openacc_async_host2dev): New plugin hook function. (GOMP_OFFLOAD_openacc_async_dev2host): New plugin hook function. From-SVN: r271128 --- libgomp/ChangeLog | 190 +++++ libgomp/libgomp-plugin.h | 45 +- libgomp/libgomp.h | 53 +- libgomp/oacc-async.c | 250 +++++- libgomp/oacc-cuda.c | 28 +- libgomp/oacc-host.c | 92 ++- libgomp/oacc-init.c | 4 +- libgomp/oacc-int.h | 7 + libgomp/oacc-mem.c | 57 +- libgomp/oacc-parallel.c | 80 +- libgomp/oacc-plugin.c | 11 +- libgomp/plugin/cuda-lib.def | 1 + libgomp/plugin/cuda/cuda.h | 8 +- libgomp/plugin/plugin-nvptx.c | 1345 ++++++++------------------------- libgomp/target.c | 206 +++-- 15 files changed, 1107 insertions(+), 1270 deletions(-) diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index a8ce3c241fc..a16d5244f54 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,193 @@ +2019-05-13 Chung-Lin Tang + + * libgomp-plugin.h (struct goacc_asyncqueue): Declare. + (struct goacc_asyncqueue_list): Likewise. + (goacc_aq): Likewise. + (goacc_aq_list): Likewise. + (GOMP_OFFLOAD_openacc_register_async_cleanup): Remove. + (GOMP_OFFLOAD_openacc_async_test): Remove. + (GOMP_OFFLOAD_openacc_async_test_all): Remove. + (GOMP_OFFLOAD_openacc_async_wait): Remove. + (GOMP_OFFLOAD_openacc_async_wait_async): Remove. + (GOMP_OFFLOAD_openacc_async_wait_all): Remove. + (GOMP_OFFLOAD_openacc_async_wait_all_async): Remove. + (GOMP_OFFLOAD_openacc_async_set_async): Remove. + (GOMP_OFFLOAD_openacc_exec): Adjust declaration. + (GOMP_OFFLOAD_openacc_cuda_get_stream): Likewise. + (GOMP_OFFLOAD_openacc_cuda_set_stream): Likewise. + (GOMP_OFFLOAD_openacc_async_exec): Declare. + (GOMP_OFFLOAD_openacc_async_construct): Declare. + (GOMP_OFFLOAD_openacc_async_destruct): Declare. + (GOMP_OFFLOAD_openacc_async_test): Declare. + (GOMP_OFFLOAD_openacc_async_synchronize): Declare. + (GOMP_OFFLOAD_openacc_async_serialize): Declare. + (GOMP_OFFLOAD_openacc_async_queue_callback): Declare. 
+ (GOMP_OFFLOAD_openacc_async_host2dev): Declare. + (GOMP_OFFLOAD_openacc_async_dev2host): Declare. + + * libgomp.h (struct acc_dispatch_t): Define 'async' sub-struct. + (gomp_acc_insert_pointer): Adjust declaration. + (gomp_copy_host2dev): New declaration. + (gomp_copy_dev2host): Likewise. + (gomp_map_vars_async): Likewise. + (gomp_unmap_tgt): Likewise. + (gomp_unmap_vars_async): Likewise. + (gomp_fini_device): Likewise. + + * oacc-async.c (get_goacc_thread): New function. + (get_goacc_thread_device): New function. + (lookup_goacc_asyncqueue): New function. + (get_goacc_asyncqueue): New function. + (acc_async_test): Adjust code to use new async design. + (acc_async_test_all): Likewise. + (acc_wait): Likewise. + (acc_wait_async): Likewise. + (acc_wait_all): Likewise. + (acc_wait_all_async): Likewise. + (goacc_async_free): New function. + (goacc_init_asyncqueues): Likewise. + (goacc_fini_asyncqueues): Likewise. + * oacc-cuda.c (acc_get_cuda_stream): Adjust code to use new async + design. + (acc_set_cuda_stream): Likewise. + * oacc-host.c (host_openacc_exec): Adjust parameters, remove 'async'. + (host_openacc_register_async_cleanup): Remove. + (host_openacc_async_exec): New function. + (host_openacc_async_test): Adjust parameters. + (host_openacc_async_test_all): Remove. + (host_openacc_async_wait): Remove. + (host_openacc_async_wait_async): Remove. + (host_openacc_async_wait_all): Remove. + (host_openacc_async_wait_all_async): Remove. + (host_openacc_async_set_async): Remove. + (host_openacc_async_synchronize): New function. + (host_openacc_async_serialize): New function. + (host_openacc_async_host2dev): New function. + (host_openacc_async_dev2host): New function. + (host_openacc_async_queue_callback): New function. + (host_openacc_async_construct): New function. + (host_openacc_async_destruct): New function. + (struct gomp_device_descr host_dispatch): Remove initialization of old + interface, add initialization of new async sub-struct. 
+ * oacc-init.c (acc_shutdown_1): Adjust to use gomp_fini_device. + (goacc_attach_host_thread_to_device): Remove old async code usage. + * oacc-int.h (goacc_init_asyncqueues): New declaration. + (goacc_fini_asyncqueues): Likewise. + (goacc_async_copyout_unmap_vars): Likewise. + (goacc_async_free): Likewise. + (get_goacc_asyncqueue): Likewise. + (lookup_goacc_asyncqueue): Likewise. + * oacc-mem.c (memcpy_tofrom_device): Adjust code to use new async + design. + (present_create_copy): Adjust code to use new async design. + (delete_copyout): Likewise. + (update_dev_host): Likewise. + (gomp_acc_insert_pointer): Add async parameter, adjust code to use new + async design. + (gomp_acc_remove_pointer): Adjust code to use new async design. + * oacc-parallel.c (GOACC_parallel_keyed): Adjust code to use new async + design. + (GOACC_enter_exit_data): Likewise. + (goacc_wait): Likewise. + (GOACC_update): Likewise. + * oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Change to assert fail + when called, warn as obsolete in comment. + * target.c (goacc_device_copy_async): New function. + (gomp_copy_host2dev): Remove 'static', add goacc_asyncqueue parameter, + add goacc_device_copy_async case. + (gomp_copy_dev2host): Likewise. + (gomp_map_vars_existing): Add goacc_asyncqueue parameter, adjust code. + (gomp_map_pointer): Likewise. + (gomp_map_fields_existing): Likewise. + (gomp_map_vars_internal): New always_inline function, renamed from + gomp_map_vars. + (gomp_map_vars): Implement by calling gomp_map_vars_internal. + (gomp_map_vars_async): Implement by calling gomp_map_vars_internal, + passing goacc_asyncqueue argument. + (gomp_unmap_tgt): Remove static, add attribute_hidden. + (gomp_unref_tgt): New function. + (gomp_unmap_vars_internal): New always_inline function, renamed from + gomp_unmap_vars. + (gomp_unmap_vars): Implement by calling gomp_unmap_vars_internal. + (gomp_unmap_vars_async): Implement by calling + gomp_unmap_vars_internal, passing goacc_asyncqueue argument. 
+ (gomp_fini_device): New function. + (gomp_exit_data): Adjust gomp_copy_dev2host call. + (gomp_load_plugin_for_device): Remove old interface, adjust to load + new async interface. + (gomp_target_fini): Adjust code to call gomp_fini_device. + + * plugin/plugin-nvptx.c (struct cuda_map): Remove. + (struct ptx_stream): Remove. + (struct nvptx_thread): Remove current_stream field. + (cuda_map_create): Remove. + (cuda_map_destroy): Remove. + (map_init): Remove. + (map_fini): Remove. + (map_pop): Remove. + (map_push): Remove. + (struct goacc_asyncqueue): Define. + (struct nvptx_callback): Define. + (struct ptx_free_block): Define. + (struct ptx_device): Remove null_stream, active_streams, async_streams, + stream_lock, and next fields. + (enum ptx_event_type): Remove. + (struct ptx_event): Remove. + (ptx_event_lock): Remove. + (ptx_events): Remove. + (init_streams_for_device): Remove. + (fini_streams_for_device): Remove. + (select_stream_for_async): Remove. + (nvptx_init): Remove ptx_events and ptx_event_lock references. + (nvptx_attach_host_thread_to_device): Remove CUDA_ERROR_NOT_PERMITTED + case. + (nvptx_open_device): Add free_blocks initialization, remove + init_streams_for_device call. + (nvptx_close_device): Remove fini_streams_for_device call, add + free_blocks destruct code. + (event_gc): Remove. + (event_add): Remove. + (nvptx_exec): Adjust parameters and code. + (nvptx_free): Likewise. + (nvptx_host2dev): Remove. + (nvptx_dev2host): Remove. + (nvptx_set_async): Remove. + (nvptx_async_test): Remove. + (nvptx_async_test_all): Remove. + (nvptx_wait): Remove. + (nvptx_wait_async): Remove. + (nvptx_wait_all): Remove. + (nvptx_wait_all_async): Remove. + (nvptx_get_cuda_stream): Remove. + (nvptx_set_cuda_stream): Remove. + (GOMP_OFFLOAD_alloc): Adjust code. + (GOMP_OFFLOAD_free): Likewise. + (GOMP_OFFLOAD_openacc_register_async_cleanup): Remove. + (GOMP_OFFLOAD_openacc_exec): Adjust parameters and code. + (GOMP_OFFLOAD_openacc_async_test_all): Remove. 
+ (GOMP_OFFLOAD_openacc_async_wait): Remove. + (GOMP_OFFLOAD_openacc_async_wait_async): Remove. + (GOMP_OFFLOAD_openacc_async_wait_all): Remove. + (GOMP_OFFLOAD_openacc_async_wait_all_async): Remove. + (GOMP_OFFLOAD_openacc_async_set_async): Remove. + (cuda_free_argmem): New function. + (GOMP_OFFLOAD_openacc_async_exec): New plugin hook function. + (GOMP_OFFLOAD_openacc_create_thread_data): Adjust code. + (GOMP_OFFLOAD_openacc_cuda_get_stream): Adjust code. + (GOMP_OFFLOAD_openacc_cuda_set_stream): Adjust code. + (GOMP_OFFLOAD_openacc_async_construct): New plugin hook function. + (GOMP_OFFLOAD_openacc_async_destruct): New plugin hook function. + (GOMP_OFFLOAD_openacc_async_test): Remove and re-implement. + (GOMP_OFFLOAD_openacc_async_synchronize): New plugin hook function. + (GOMP_OFFLOAD_openacc_async_serialize): New plugin hook function. + (GOMP_OFFLOAD_openacc_async_queue_callback): New plugin hook function. + (cuda_callback_wrapper): New function. + (cuda_memcpy_sanity_check): New function. + (GOMP_OFFLOAD_host2dev): Remove and re-implement. + (GOMP_OFFLOAD_dev2host): Remove and re-implement. + (GOMP_OFFLOAD_openacc_async_host2dev): New plugin hook function. + (GOMP_OFFLOAD_openacc_async_dev2host): New plugin hook function. + 2019-05-07 Thomas Schwinge PR target/87835 diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h index 8960d809265..01483f27f4c 100644 --- a/libgomp/libgomp-plugin.h +++ b/libgomp/libgomp-plugin.h @@ -53,6 +53,20 @@ enum offload_target_type OFFLOAD_TARGET_TYPE_HSA = 7 }; +/* Opaque type to represent plugin-dependent implementation of an + OpenACC asynchronous queue. */ +struct goacc_asyncqueue; + +/* Used to keep a list of active asynchronous queues. 
*/ +struct goacc_asyncqueue_list +{ + struct goacc_asyncqueue *aq; + struct goacc_asyncqueue_list *next; +}; + +typedef struct goacc_asyncqueue *goacc_aq; +typedef struct goacc_asyncqueue_list *goacc_aq_list; + /* Auxiliary struct, used for transferring pairs of addresses from plugin to libgomp. */ struct addr_pair @@ -93,22 +107,31 @@ extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t); extern bool GOMP_OFFLOAD_can_run (void *); extern void GOMP_OFFLOAD_run (int, void *, void *, void **); extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *); + extern void GOMP_OFFLOAD_openacc_exec (void (*) (void *), size_t, void **, - void **, int, unsigned *, void *); -extern void GOMP_OFFLOAD_openacc_register_async_cleanup (void *, int); -extern int GOMP_OFFLOAD_openacc_async_test (int); -extern int GOMP_OFFLOAD_openacc_async_test_all (void); -extern void GOMP_OFFLOAD_openacc_async_wait (int); -extern void GOMP_OFFLOAD_openacc_async_wait_async (int, int); -extern void GOMP_OFFLOAD_openacc_async_wait_all (void); -extern void GOMP_OFFLOAD_openacc_async_wait_all_async (int); -extern void GOMP_OFFLOAD_openacc_async_set_async (int); + void **, unsigned *, void *); extern void *GOMP_OFFLOAD_openacc_create_thread_data (int); extern void GOMP_OFFLOAD_openacc_destroy_thread_data (void *); +extern struct goacc_asyncqueue *GOMP_OFFLOAD_openacc_async_construct (void); +extern bool GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *); +extern int GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *); +extern bool GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *); +extern bool GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *, + struct goacc_asyncqueue *); +extern void GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *, + void (*)(void *), void *); +extern void GOMP_OFFLOAD_openacc_async_exec (void (*) (void *), size_t, void **, + void **, unsigned *, void *, + struct goacc_asyncqueue *); 
+extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size_t, + struct goacc_asyncqueue *); +extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t, + struct goacc_asyncqueue *); extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void); extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void); -extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (int); -extern int GOMP_OFFLOAD_openacc_cuda_set_stream (int, void *); +extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *); +extern int GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *, + void *); #ifdef __cplusplus } diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index afea659445d..9f433160ab5 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -949,24 +949,31 @@ typedef struct acc_dispatch_t /* Execute. */ __typeof (GOMP_OFFLOAD_openacc_exec) *exec_func; - /* Async cleanup callback registration. */ - __typeof (GOMP_OFFLOAD_openacc_register_async_cleanup) - *register_async_cleanup_func; - - /* Asynchronous routines. */ - __typeof (GOMP_OFFLOAD_openacc_async_test) *async_test_func; - __typeof (GOMP_OFFLOAD_openacc_async_test_all) *async_test_all_func; - __typeof (GOMP_OFFLOAD_openacc_async_wait) *async_wait_func; - __typeof (GOMP_OFFLOAD_openacc_async_wait_async) *async_wait_async_func; - __typeof (GOMP_OFFLOAD_openacc_async_wait_all) *async_wait_all_func; - __typeof (GOMP_OFFLOAD_openacc_async_wait_all_async) - *async_wait_all_async_func; - __typeof (GOMP_OFFLOAD_openacc_async_set_async) *async_set_async_func; - /* Create/destroy TLS data. */ __typeof (GOMP_OFFLOAD_openacc_create_thread_data) *create_thread_data_func; __typeof (GOMP_OFFLOAD_openacc_destroy_thread_data) *destroy_thread_data_func; + + struct { + /* Once created and put into the "active" list, asyncqueues are then never + destructed and removed from the "active" list, other than if the TODO + device is shut down. 
*/ + gomp_mutex_t lock; + int nasyncqueue; + struct goacc_asyncqueue **asyncqueue; + struct goacc_asyncqueue_list *active; + + __typeof (GOMP_OFFLOAD_openacc_async_construct) *construct_func; + __typeof (GOMP_OFFLOAD_openacc_async_destruct) *destruct_func; + __typeof (GOMP_OFFLOAD_openacc_async_test) *test_func; + __typeof (GOMP_OFFLOAD_openacc_async_synchronize) *synchronize_func; + __typeof (GOMP_OFFLOAD_openacc_async_serialize) *serialize_func; + __typeof (GOMP_OFFLOAD_openacc_async_queue_callback) *queue_callback_func; + + __typeof (GOMP_OFFLOAD_openacc_async_exec) *exec_func; + __typeof (GOMP_OFFLOAD_openacc_async_dev2host) *dev2host_func; + __typeof (GOMP_OFFLOAD_openacc_async_host2dev) *host2dev_func; + } async; /* NVIDIA target specific routines. */ struct { @@ -1053,17 +1060,33 @@ enum gomp_map_vars_kind GOMP_MAP_VARS_ENTER_DATA }; -extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *); +extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *, int); extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int); extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *, unsigned short *); +struct gomp_coalesce_buf; +extern void gomp_copy_host2dev (struct gomp_device_descr *, + struct goacc_asyncqueue *, void *, const void *, + size_t, struct gomp_coalesce_buf *); +extern void gomp_copy_dev2host (struct gomp_device_descr *, + struct goacc_asyncqueue *, void *, const void *, + size_t); extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *, size_t, void **, void **, size_t *, void *, bool, enum gomp_map_vars_kind); +extern struct target_mem_desc *gomp_map_vars_async (struct gomp_device_descr *, + struct goacc_asyncqueue *, + size_t, void **, void **, + size_t *, void *, bool, + enum gomp_map_vars_kind); +extern void gomp_unmap_tgt (struct target_mem_desc *); extern void gomp_unmap_vars (struct target_mem_desc *, bool); +extern void gomp_unmap_vars_async (struct target_mem_desc *, 
bool, + struct goacc_asyncqueue *); extern void gomp_init_device (struct gomp_device_descr *); +extern bool gomp_fini_device (struct gomp_device_descr *); extern void gomp_free_memmap (struct splay_tree_s *); extern void gomp_unload_device (struct gomp_device_descr *); extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key); diff --git a/libgomp/oacc-async.c b/libgomp/oacc-async.c index 915284db20b..51bb676610c 100644 --- a/libgomp/oacc-async.c +++ b/libgomp/oacc-async.c @@ -27,47 +27,160 @@ . */ #include +#include #include "openacc.h" #include "libgomp.h" #include "oacc-int.h" -int -acc_async_test (int async) +static struct goacc_thread * +get_goacc_thread (void) { - if (!async_valid_p (async)) - gomp_fatal ("invalid async argument: %d", async); - struct goacc_thread *thr = goacc_thread (); if (!thr || !thr->dev) gomp_fatal ("no device active"); - return thr->dev->openacc.async_test_func (async); + return thr; } -int -acc_async_test_all (void) +static struct gomp_device_descr * +get_goacc_thread_device (void) { struct goacc_thread *thr = goacc_thread (); if (!thr || !thr->dev) gomp_fatal ("no device active"); - return thr->dev->openacc.async_test_all_func (); + return thr->dev; } -void -acc_wait (int async) +static int +validate_async_val (int async) { if (!async_valid_p (async)) - gomp_fatal ("invalid async argument: %d", async); + gomp_fatal ("invalid async-argument: %d", async); + + if (async == acc_async_sync) + return -1; + + if (async == acc_async_noval) + return 0; + + if (async >= 0) + /* TODO: we reserve 0 for acc_async_noval before we can clarify the + semantics of "default_async". */ + return 1 + async; + else + __builtin_unreachable (); +} + +/* Return the asyncqueue to be used for OpenACC async-argument ASYNC. This + might return NULL if no asyncqueue is to be used. Otherwise, if CREATE, + create the asyncqueue if it doesn't exist yet. 
*/ + +attribute_hidden struct goacc_asyncqueue * +lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async) +{ + async = validate_async_val (async); + if (async < 0) + return NULL; + + struct goacc_asyncqueue *ret_aq = NULL; + struct gomp_device_descr *dev = thr->dev; + + gomp_mutex_lock (&dev->openacc.async.lock); + if (!create + && (async >= dev->openacc.async.nasyncqueue + || !dev->openacc.async.asyncqueue[async])) + goto end; + + if (async >= dev->openacc.async.nasyncqueue) + { + int diff = async + 1 - dev->openacc.async.nasyncqueue; + dev->openacc.async.asyncqueue + = gomp_realloc (dev->openacc.async.asyncqueue, + sizeof (goacc_aq) * (async + 1)); + memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue, + 0, sizeof (goacc_aq) * diff); + dev->openacc.async.nasyncqueue = async + 1; + } + + if (!dev->openacc.async.asyncqueue[async]) + { + dev->openacc.async.asyncqueue[async] = dev->openacc.async.construct_func (); + + if (!dev->openacc.async.asyncqueue[async]) + { + gomp_mutex_unlock (&dev->openacc.async.lock); + gomp_fatal ("async %d creation failed", async); + } + + /* Link new async queue into active list. */ + goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list)); + n->aq = dev->openacc.async.asyncqueue[async]; + n->next = dev->openacc.async.active; + dev->openacc.async.active = n; + } + + ret_aq = dev->openacc.async.asyncqueue[async]; + + end: + gomp_mutex_unlock (&dev->openacc.async.lock); + return ret_aq; +} + +/* Return the asyncqueue to be used for OpenACC async-argument ASYNC. This + might return NULL if no asyncqueue is to be used. Otherwise, create the + asyncqueue if it doesn't exist yet. 
*/ + +attribute_hidden struct goacc_asyncqueue * +get_goacc_asyncqueue (int async) +{ + struct goacc_thread *thr = get_goacc_thread (); + return lookup_goacc_asyncqueue (thr, true, async); +} + +int +acc_async_test (int async) +{ struct goacc_thread *thr = goacc_thread (); if (!thr || !thr->dev) gomp_fatal ("no device active"); - thr->dev->openacc.async_wait_func (async); + goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async); + if (!aq) + return 1; + else + return thr->dev->openacc.async.test_func (aq); +} + +int +acc_async_test_all (void) +{ + struct goacc_thread *thr = get_goacc_thread (); + + int ret = 1; + gomp_mutex_lock (&thr->dev->openacc.async.lock); + for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next) + if (!thr->dev->openacc.async.test_func (l->aq)) + { + ret = 0; + break; + } + gomp_mutex_unlock (&thr->dev->openacc.async.lock); + return ret; +} + +void +acc_wait (int async) +{ + struct goacc_thread *thr = get_goacc_thread (); + + goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async); + if (aq && !thr->dev->openacc.async.synchronize_func (aq)) + gomp_fatal ("wait on %d failed", async); } /* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait. */ @@ -84,23 +197,46 @@ acc_async_wait (int async) void acc_wait_async (int async1, int async2) { - struct goacc_thread *thr = goacc_thread (); + struct goacc_thread *thr = get_goacc_thread (); - if (!thr || !thr->dev) - gomp_fatal ("no device active"); + goacc_aq aq1 = lookup_goacc_asyncqueue (thr, false, async1); + /* TODO: Is this also correct for acc_async_sync, assuming that in this case, + we'll always be synchronous anyways? */ + if (!aq1) + return; + + goacc_aq aq2 = lookup_goacc_asyncqueue (thr, true, async2); + /* An async queue is always synchronized with itself. 
*/ + if (aq1 == aq2) + return; - thr->dev->openacc.async_wait_async_func (async1, async2); + if (aq2) + { + if (!thr->dev->openacc.async.serialize_func (aq1, aq2)) + gomp_fatal ("ordering of async ids %d and %d failed", async1, async2); + } + else + { + /* TODO: Local thread synchronization. + Necessary for the "async2 == acc_async_sync" case, or can just skip? */ + if (!thr->dev->openacc.async.synchronize_func (aq1)) + gomp_fatal ("wait on %d failed", async1); + } } void acc_wait_all (void) { - struct goacc_thread *thr = goacc_thread (); + struct gomp_device_descr *dev = get_goacc_thread_device (); - if (!thr || !thr->dev) - gomp_fatal ("no device active"); + bool ret = true; + gomp_mutex_lock (&dev->openacc.async.lock); + for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next) + ret &= dev->openacc.async.synchronize_func (l->aq); + gomp_mutex_unlock (&dev->openacc.async.lock); - thr->dev->openacc.async_wait_all_func (); + if (!ret) + gomp_fatal ("wait all failed"); } /* acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all. */ @@ -117,13 +253,73 @@ acc_async_wait_all (void) void acc_wait_all_async (int async) { - if (!async_valid_p (async)) - gomp_fatal ("invalid async argument: %d", async); + struct goacc_thread *thr = get_goacc_thread (); - struct goacc_thread *thr = goacc_thread (); + goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async); - if (!thr || !thr->dev) - gomp_fatal ("no device active"); + bool ret = true; + gomp_mutex_lock (&thr->dev->openacc.async.lock); + for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next) + { + if (waiting_queue) + ret &= thr->dev->openacc.async.serialize_func (l->aq, waiting_queue); + else + /* TODO: Local thread synchronization. + Necessary for the "async2 == acc_async_sync" case, or can just skip? 
*/ + ret &= thr->dev->openacc.async.synchronize_func (l->aq); + } + gomp_mutex_unlock (&thr->dev->openacc.async.lock); + + if (!ret) + gomp_fatal ("wait all async(%d) failed", async); +} + +attribute_hidden void +goacc_async_free (struct gomp_device_descr *devicep, + struct goacc_asyncqueue *aq, void *ptr) +{ + if (!aq) + free (ptr); + else + devicep->openacc.async.queue_callback_func (aq, free, ptr); +} + +/* This function initializes the asyncqueues for the device specified by + DEVICEP. TODO DEVICEP must be locked on entry, and remains locked on + return. */ + +attribute_hidden void +goacc_init_asyncqueues (struct gomp_device_descr *devicep) +{ + devicep->openacc.async.nasyncqueue = 0; + devicep->openacc.async.asyncqueue = NULL; + devicep->openacc.async.active = NULL; + gomp_mutex_init (&devicep->openacc.async.lock); +} - thr->dev->openacc.async_wait_all_async_func (async); +/* This function finalizes the asyncqueues for the device specified by DEVICEP. + TODO DEVICEP must be locked on entry, and remains locked on return. 
*/ + +attribute_hidden bool +goacc_fini_asyncqueues (struct gomp_device_descr *devicep) +{ + bool ret = true; + gomp_mutex_lock (&devicep->openacc.async.lock); + if (devicep->openacc.async.nasyncqueue > 0) + { + goacc_aq_list next; + for (goacc_aq_list l = devicep->openacc.async.active; l; l = next) + { + ret &= devicep->openacc.async.destruct_func (l->aq); + next = l->next; + free (l); + } + free (devicep->openacc.async.asyncqueue); + devicep->openacc.async.nasyncqueue = 0; + devicep->openacc.async.asyncqueue = NULL; + devicep->openacc.async.active = NULL; + } + gomp_mutex_unlock (&devicep->openacc.async.lock); + gomp_mutex_destroy (&devicep->openacc.async.lock); + return ret; } diff --git a/libgomp/oacc-cuda.c b/libgomp/oacc-cuda.c index 16eb6c3effb..1a6946c5dda 100644 --- a/libgomp/oacc-cuda.c +++ b/libgomp/oacc-cuda.c @@ -30,6 +30,7 @@ #include "config.h" #include "libgomp.h" #include "oacc-int.h" +#include void * acc_get_current_cuda_device (void) @@ -62,7 +63,11 @@ acc_get_cuda_stream (int async) return NULL; if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func) - return thr->dev->openacc.cuda.get_stream_func (async); + { + goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async); + if (aq) + return thr->dev->openacc.cuda.get_stream_func (aq); + } return NULL; } @@ -79,8 +84,23 @@ acc_set_cuda_stream (int async, void *stream) thr = goacc_thread (); + int ret = -1; if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func) - return thr->dev->openacc.cuda.set_stream_func (async, stream); - - return -1; + { + goacc_aq aq = get_goacc_asyncqueue (async); + /* Due to not using an asyncqueue for "acc_async_sync", this cannot be + used to change the CUDA stream associated with "acc_async_sync". 
*/ + if (!aq) + { + assert (async == acc_async_sync); + gomp_debug (0, "Refusing request to set CUDA stream associated" + " with \"acc_async_sync\"\n"); + return 0; + } + gomp_mutex_lock (&thr->dev->openacc.async.lock); + ret = thr->dev->openacc.cuda.set_stream_func (aq, stream); + gomp_mutex_unlock (&thr->dev->openacc.async.lock); + } + + return ret; } diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c index 222bfb7d1a6..00484b9f6ed 100644 --- a/libgomp/oacc-host.c +++ b/libgomp/oacc-host.c @@ -140,55 +140,89 @@ host_openacc_exec (void (*fn) (void *), size_t mapnum __attribute__ ((unused)), void **hostaddrs, void **devaddrs __attribute__ ((unused)), - int async __attribute__ ((unused)), - unsigned *dims __attribute ((unused)), + unsigned *dims __attribute__ ((unused)), void *targ_mem_desc __attribute__ ((unused))) { fn (hostaddrs); } static void -host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)), - int async __attribute__ ((unused))) +host_openacc_async_exec (void (*fn) (void *), + size_t mapnum __attribute__ ((unused)), + void **hostaddrs, + void **devaddrs __attribute__ ((unused)), + unsigned *dims __attribute__ ((unused)), + void *targ_mem_desc __attribute__ ((unused)), + struct goacc_asyncqueue *aq __attribute__ ((unused))) { + fn (hostaddrs); } static int -host_openacc_async_test (int async __attribute__ ((unused))) +host_openacc_async_test (struct goacc_asyncqueue *aq __attribute__ ((unused))) { return 1; } -static int -host_openacc_async_test_all (void) +static bool +host_openacc_async_synchronize (struct goacc_asyncqueue *aq + __attribute__ ((unused))) { - return 1; + return true; } -static void -host_openacc_async_wait (int async __attribute__ ((unused))) +static bool +host_openacc_async_serialize (struct goacc_asyncqueue *aq1 + __attribute__ ((unused)), + struct goacc_asyncqueue *aq2 + __attribute__ ((unused))) { + return true; } -static void -host_openacc_async_wait_async (int async1 __attribute__ ((unused)), - int 
async2 __attribute__ ((unused))) +static bool +host_openacc_async_host2dev (int ord __attribute__ ((unused)), + void *dst __attribute__ ((unused)), + const void *src __attribute__ ((unused)), + size_t n __attribute__ ((unused)), + struct goacc_asyncqueue *aq + __attribute__ ((unused))) { + return true; } -static void -host_openacc_async_wait_all (void) +static bool +host_openacc_async_dev2host (int ord __attribute__ ((unused)), + void *dst __attribute__ ((unused)), + const void *src __attribute__ ((unused)), + size_t n __attribute__ ((unused)), + struct goacc_asyncqueue *aq + __attribute__ ((unused))) { + return true; } static void -host_openacc_async_wait_all_async (int async __attribute__ ((unused))) +host_openacc_async_queue_callback (struct goacc_asyncqueue *aq + __attribute__ ((unused)), + void (*callback_fn)(void *) + __attribute__ ((unused)), + void *userptr __attribute__ ((unused))) { } -static void -host_openacc_async_set_async (int async __attribute__ ((unused))) +static struct goacc_asyncqueue * +host_openacc_async_construct (void) { + /* Non-NULL 0xffff... value as opaque dummy. 
*/ + return (struct goacc_asyncqueue *) -1; +} + +static bool +host_openacc_async_destruct (struct goacc_asyncqueue *aq + __attribute__ ((unused))) +{ + return true; } static void * @@ -235,19 +269,21 @@ static struct gomp_device_descr host_dispatch = .exec_func = host_openacc_exec, - .register_async_cleanup_func = host_openacc_register_async_cleanup, - - .async_test_func = host_openacc_async_test, - .async_test_all_func = host_openacc_async_test_all, - .async_wait_func = host_openacc_async_wait, - .async_wait_async_func = host_openacc_async_wait_async, - .async_wait_all_func = host_openacc_async_wait_all, - .async_wait_all_async_func = host_openacc_async_wait_all_async, - .async_set_async_func = host_openacc_async_set_async, - .create_thread_data_func = host_openacc_create_thread_data, .destroy_thread_data_func = host_openacc_destroy_thread_data, + .async = { + .construct_func = host_openacc_async_construct, + .destruct_func = host_openacc_async_destruct, + .test_func = host_openacc_async_test, + .synchronize_func = host_openacc_async_synchronize, + .serialize_func = host_openacc_async_serialize, + .queue_callback_func = host_openacc_async_queue_callback, + .exec_func = host_openacc_async_exec, + .dev2host_func = host_openacc_async_dev2host, + .host2dev_func = host_openacc_async_host2dev, + }, + .cuda = { .get_current_device_func = NULL, .get_current_context_func = NULL, diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c index f30cf2f81d8..28471e40ba0 100644 --- a/libgomp/oacc-init.c +++ b/libgomp/oacc-init.c @@ -309,7 +309,7 @@ acc_shutdown_1 (acc_device_t d) if (acc_dev->state == GOMP_DEVICE_INITIALIZED) { devices_active = true; - ret &= acc_dev->fini_device_func (acc_dev->target_id); + ret &= gomp_fini_device (acc_dev); acc_dev->state = GOMP_DEVICE_UNINITIALIZED; } gomp_mutex_unlock (&acc_dev->lock); @@ -426,8 +426,6 @@ goacc_attach_host_thread_to_device (int ord) thr->target_tls = acc_dev->openacc.create_thread_data_func (ord); - - 
acc_dev->openacc.async_set_async_func (acc_async_sync); } /* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of diff --git a/libgomp/oacc-int.h b/libgomp/oacc-int.h index 940052b7936..e4b6ea6b7db 100644 --- a/libgomp/oacc-int.h +++ b/libgomp/oacc-int.h @@ -99,6 +99,13 @@ void goacc_restore_bind (void); void goacc_lazy_initialize (void); void goacc_host_init (void); +void goacc_init_asyncqueues (struct gomp_device_descr *); +bool goacc_fini_asyncqueues (struct gomp_device_descr *); +void goacc_async_free (struct gomp_device_descr *, struct goacc_asyncqueue *, + void *); +struct goacc_asyncqueue *get_goacc_asyncqueue (int); +struct goacc_asyncqueue *lookup_goacc_asyncqueue (struct goacc_thread *, bool, + int); static inline bool async_valid_stream_id_p (int async) { diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c index 26e1a7545df..03df0d4fbf6 100644 --- a/libgomp/oacc-mem.c +++ b/libgomp/oacc-mem.c @@ -172,18 +172,11 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, return; } - if (async > acc_async_sync) - thr->dev->openacc.async_set_async_func (async); - - bool ret = (from - ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s) - : thr->dev->host2dev_func (thr->dev->target_id, d, h, s)); - - if (async > acc_async_sync) - thr->dev->openacc.async_set_async_func (acc_async_sync); - - if (!ret) - gomp_fatal ("error in %s", libfnname); + goacc_aq aq = get_goacc_asyncqueue (async); + if (from) + gomp_copy_dev2host (thr->dev, aq, h, d, s); + else + gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? 
*/ NULL); } void @@ -509,17 +502,13 @@ present_create_copy (unsigned f, void *h, size_t s, int async) gomp_mutex_unlock (&acc_dev->lock); - if (async > acc_async_sync) - acc_dev->openacc.async_set_async_func (async); + goacc_aq aq = get_goacc_asyncqueue (async); - tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true, - GOMP_MAP_VARS_OPENACC); + tgt = gomp_map_vars_async (acc_dev, aq, mapnum, &hostaddrs, NULL, &s, + &kinds, true, GOMP_MAP_VARS_OPENACC); /* Initialize dynamic refcount. */ tgt->list[0].key->dynamic_refcount = 1; - if (async > acc_async_sync) - acc_dev->openacc.async_set_async_func (acc_async_sync); - gomp_mutex_lock (&acc_dev->lock); d = tgt->to_free; @@ -676,13 +665,9 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname) if (f & FLAG_COPYOUT) { - if (async > acc_async_sync) - acc_dev->openacc.async_set_async_func (async); - acc_dev->dev2host_func (acc_dev->target_id, h, d, s); - if (async > acc_async_sync) - acc_dev->openacc.async_set_async_func (acc_async_sync); + goacc_aq aq = get_goacc_asyncqueue (async); + gomp_copy_dev2host (acc_dev, aq, h, d, s); } - gomp_remove_var (acc_dev, n); } @@ -765,16 +750,12 @@ update_dev_host (int is_dev, void *h, size_t s, int async) d = (void *) (n->tgt->tgt_start + n->tgt_offset + (uintptr_t) h - n->host_start); - if (async > acc_async_sync) - acc_dev->openacc.async_set_async_func (async); + goacc_aq aq = get_goacc_asyncqueue (async); if (is_dev) - acc_dev->host2dev_func (acc_dev->target_id, d, h, s); + gomp_copy_host2dev (acc_dev, aq, d, h, s, /* TODO: cbuf? 
*/ NULL); else - acc_dev->dev2host_func (acc_dev->target_id, h, d, s); - - if (async > acc_async_sync) - acc_dev->openacc.async_set_async_func (acc_async_sync); + gomp_copy_dev2host (acc_dev, aq, h, d, s); gomp_mutex_unlock (&acc_dev->lock); } @@ -805,7 +786,7 @@ acc_update_self_async (void *h, size_t s, int async) void gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes, - void *kinds) + void *kinds, int async) { struct target_mem_desc *tgt; struct goacc_thread *thr = goacc_thread (); @@ -835,8 +816,9 @@ gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes, } gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__); - tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, - NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC); + goacc_aq aq = get_goacc_asyncqueue (async); + tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, + NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC); gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__); /* Initialize dynamic refcount. 
*/ @@ -930,7 +912,10 @@ gomp_acc_remove_pointer (void *h, size_t s, bool force_copyfrom, int async, if (async < acc_async_noval) gomp_unmap_vars (t, true); else - t->device_descr->openacc.register_async_cleanup_func (t, async); + { + goacc_aq aq = get_goacc_asyncqueue (async); + gomp_unmap_vars_async (t, true, aq); + } } gomp_mutex_unlock (&acc_dev->lock); diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c index f5fb63c5b5a..fa99a2ad1a9 100644 --- a/libgomp/oacc-parallel.c +++ b/libgomp/oacc-parallel.c @@ -217,8 +217,6 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *), } va_end (ap); - acc_dev->openacc.async_set_async_func (async); - if (!(acc_dev->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC)) { k.host_start = (uintptr_t) fn; @@ -235,44 +233,29 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (void *), else tgt_fn = (void (*)) fn; - tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true, - GOMP_MAP_VARS_OPENACC); + goacc_aq aq = get_goacc_asyncqueue (async); + tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds, + true, GOMP_MAP_VARS_OPENACC); + devaddrs = gomp_alloca (sizeof (void *) * mapnum); for (i = 0; i < mapnum; i++) devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start + tgt->list[i].key->tgt_offset + tgt->list[i].offset); - - acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, - async, dims, tgt); - - /* If running synchronously, unmap immediately. */ - bool copyfrom = true; - if (async_synchronous_p (async)) - gomp_unmap_vars (tgt, true); + if (aq == NULL) + { + acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, + dims, tgt); + /* If running synchronously, unmap immediately. 
*/ + gomp_unmap_vars (tgt, true); + } else { - bool async_unmap = false; - for (size_t i = 0; i < tgt->list_count; i++) - { - splay_tree_key k = tgt->list[i].key; - if (k && k->refcount == 1) - { - async_unmap = true; - break; - } - } - if (async_unmap) - tgt->device_descr->openacc.register_async_cleanup_func (tgt, async); - else - { - copyfrom = false; - gomp_unmap_vars (tgt, copyfrom); - } + acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, + dims, tgt, aq); + gomp_unmap_vars_async (tgt, true, aq); } - - acc_dev->openacc.async_set_async_func (acc_async_sync); } /* Legacy entry point (GCC 5). Only provide host fallback execution. */ @@ -383,8 +366,6 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum, finalize = true; } - acc_dev->openacc.async_set_async_func (async); - /* Determine if this is an "acc enter data". */ for (i = 0; i < mapnum; ++i) { @@ -437,11 +418,11 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum, { case GOMP_MAP_ALLOC: case GOMP_MAP_FORCE_ALLOC: - acc_create (hostaddrs[i], sizes[i]); + acc_create_async (hostaddrs[i], sizes[i], async); break; case GOMP_MAP_TO: case GOMP_MAP_FORCE_TO: - acc_copyin (hostaddrs[i], sizes[i]); + acc_copyin_async (hostaddrs[i], sizes[i], async); break; default: gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x", @@ -452,7 +433,7 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum, else { gomp_acc_insert_pointer (pointer, &hostaddrs[i], - &sizes[i], &kinds[i]); + &sizes[i], &kinds[i], async); /* Increment 'i' by two because OpenACC requires fortran arrays to be contiguous, so each PSET is associated with one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and @@ -477,17 +458,17 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum, if (acc_is_present (hostaddrs[i], sizes[i])) { if (finalize) - acc_delete_finalize (hostaddrs[i], sizes[i]); + acc_delete_finalize_async (hostaddrs[i], sizes[i], async); else - acc_delete (hostaddrs[i], sizes[i]); + acc_delete_async (hostaddrs[i], 
sizes[i], async); } break; case GOMP_MAP_FROM: case GOMP_MAP_FORCE_FROM: if (finalize) - acc_copyout_finalize (hostaddrs[i], sizes[i]); + acc_copyout_finalize_async (hostaddrs[i], sizes[i], async); else - acc_copyout (hostaddrs[i], sizes[i]); + acc_copyout_async (hostaddrs[i], sizes[i], async); break; default: gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x", @@ -505,8 +486,6 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum, i += pointer - 1; } } - - acc_dev->openacc.async_set_async_func (acc_async_sync); } static void @@ -532,9 +511,10 @@ goacc_wait (int async, int num_waits, va_list *ap) if (async == acc_async_sync) acc_wait (qid); else if (qid == async) - ;/* If we're waiting on the same asynchronous queue as we're - launching on, the queue itself will order work as - required, so there's no need to wait explicitly. */ + /* If we're waiting on the same asynchronous queue as we're + launching on, the queue itself will order work as + required, so there's no need to wait explicitly. */ + ; else acc_wait_async (qid, async); } @@ -567,8 +547,6 @@ GOACC_update (int flags_m, size_t mapnum, va_end (ap); } - acc_dev->openacc.async_set_async_func (async); - bool update_device = false; for (i = 0; i < mapnum; ++i) { @@ -591,6 +569,8 @@ GOACC_update (int flags_m, size_t mapnum, the value of the allocated device memory in the previous pointer. */ *(uintptr_t *) hostaddrs[i] = (uintptr_t)dptr; + /* TODO: verify that we really cannot use acc_update_device_async + here. */ acc_update_device (hostaddrs[i], sizeof (uintptr_t)); /* Restore the host pointer. 
*/ @@ -608,7 +588,7 @@ GOACC_update (int flags_m, size_t mapnum, /* Fallthru */ case GOMP_MAP_FORCE_TO: update_device = true; - acc_update_device (hostaddrs[i], sizes[i]); + acc_update_device_async (hostaddrs[i], sizes[i], async); break; case GOMP_MAP_FROM: @@ -620,7 +600,7 @@ GOACC_update (int flags_m, size_t mapnum, /* Fallthru */ case GOMP_MAP_FORCE_FROM: update_device = false; - acc_update_self (hostaddrs[i], sizes[i]); + acc_update_self_async (hostaddrs[i], sizes[i], async); break; default: @@ -628,8 +608,6 @@ GOACC_update (int flags_m, size_t mapnum, break; } } - - acc_dev->openacc.async_set_async_func (acc_async_sync); } void diff --git a/libgomp/oacc-plugin.c b/libgomp/oacc-plugin.c index 958cda97177..5480c1db56e 100644 --- a/libgomp/oacc-plugin.c +++ b/libgomp/oacc-plugin.c @@ -30,15 +30,12 @@ #include "oacc-plugin.h" #include "oacc-int.h" +/* This plugin function is now obsolete. */ void -GOMP_PLUGIN_async_unmap_vars (void *ptr, int async) +GOMP_PLUGIN_async_unmap_vars (void *ptr __attribute__((unused)), + int async __attribute__((unused))) { - struct target_mem_desc *tgt = ptr; - struct gomp_device_descr *devicep = tgt->device_descr; - - devicep->openacc.async_set_async_func (async); - gomp_unmap_vars (tgt, true); - devicep->openacc.async_set_async_func (acc_async_sync); + gomp_fatal ("invalid plugin function"); } /* Return the target-specific part of the TLS data for the current thread. 
*/ diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def index b2a4c2154eb..a16badcfa9d 100644 --- a/libgomp/plugin/cuda-lib.def +++ b/libgomp/plugin/cuda-lib.def @@ -42,6 +42,7 @@ CUDA_ONE_CALL (cuModuleLoad) CUDA_ONE_CALL (cuModuleLoadData) CUDA_ONE_CALL (cuModuleUnload) CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize) +CUDA_ONE_CALL (cuStreamAddCallback) CUDA_ONE_CALL (cuStreamCreate) CUDA_ONE_CALL (cuStreamDestroy) CUDA_ONE_CALL (cuStreamQuery) diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h index 7c0afafc730..e65f9725349 100644 --- a/libgomp/plugin/cuda/cuda.h +++ b/libgomp/plugin/cuda/cuda.h @@ -54,7 +54,11 @@ typedef enum { CUDA_ERROR_INVALID_CONTEXT = 201, CUDA_ERROR_NOT_FOUND = 500, CUDA_ERROR_NOT_READY = 600, - CUDA_ERROR_LAUNCH_FAILED = 719 + CUDA_ERROR_LAUNCH_FAILED = 719, + CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, + CUDA_ERROR_NOT_PERMITTED = 800, + CUDA_ERROR_NOT_SUPPORTED = 801, + CUDA_ERROR_UNKNOWN = 999 } CUresult; typedef enum { @@ -173,6 +177,8 @@ CUresult cuModuleLoadData (CUmodule *, const void *); CUresult cuModuleUnload (CUmodule); CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int); +typedef void (*CUstreamCallback)(CUstream, CUresult, void *); +CUresult cuStreamAddCallback(CUstream, CUstreamCallback, void *, unsigned int); CUresult cuStreamCreate (CUstream *, unsigned); #define cuStreamDestroy cuStreamDestroy_v2 CUresult cuStreamDestroy (CUstream); diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 387e7cc6dd3..8f71e69acb6 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -192,175 +192,30 @@ cuda_error (CUresult r) static unsigned int instantiated_devices = 0; static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; -struct cuda_map +/* NVPTX/CUDA specific definition of asynchronous queues. 
*/ +struct goacc_asyncqueue { - CUdeviceptr d; - size_t size; - bool active; - struct cuda_map *next; + CUstream cuda_stream; }; -struct ptx_stream +struct nvptx_callback { - CUstream stream; - pthread_t host_thread; - bool multithreaded; - struct cuda_map *map; - struct ptx_stream *next; + void (*fn) (void *); + void *ptr; + struct goacc_asyncqueue *aq; + struct nvptx_callback *next; }; /* Thread-specific data for PTX. */ struct nvptx_thread { - struct ptx_stream *current_stream; + /* We currently have this embedded inside the plugin because libgomp manages + devices through integer target_ids. This might be better if using an + opaque target-specific pointer directly from gomp_device_descr. */ struct ptx_device *ptx_dev; }; -static struct cuda_map * -cuda_map_create (size_t size) -{ - struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map)); - - assert (map); - - map->next = NULL; - map->size = size; - map->active = false; - - CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size); - assert (map->d); - - return map; -} - -static void -cuda_map_destroy (struct cuda_map *map) -{ - if (map->active) - /* Possible reasons for the map to be still active: - - the associated async kernel might still be running. - - the associated async kernel might have finished, but the - corresponding event that should trigger the pop_map has not been - processed by event_gc. - - the associated sync kernel might have aborted - - The async cases could happen if the user specified an async region - without adding a corresponding wait that is guaranteed to be executed - (before returning from main, or in an atexit handler). - We do not want to deallocate a device pointer that is still being - used, so skip it. - - In the sync case, the device pointer is no longer used, but deallocating - it using cuMemFree will not succeed, so skip it. - - TODO: Handle this in a more constructive way, by f.i. 
waiting for streams - to finish before de-allocating them (PR88981), or by ensuring the CUDA - lib atexit handler is called before rather than after the libgomp plugin - atexit handler (PR83795). */ - ; - else - CUDA_CALL_NOCHECK (cuMemFree, map->d); - - free (map); -} - -/* The following map_* routines manage the CUDA device memory that - contains the data mapping arguments for cuLaunchKernel. Each - asynchronous PTX stream may have multiple pending kernel - invocations, which are launched in a FIFO order. As such, the map - routines maintains a queue of cuLaunchKernel arguments. - - Calls to map_push and map_pop must be guarded by ptx_event_lock. - Likewise, calls to map_init and map_fini are guarded by - ptx_dev_lock inside GOMP_OFFLOAD_init_device and - GOMP_OFFLOAD_fini_device, respectively. */ - -static bool -map_init (struct ptx_stream *s) -{ - int size = getpagesize (); - - assert (s); - - s->map = cuda_map_create (size); - - return true; -} - -static bool -map_fini (struct ptx_stream *s) -{ - assert (s->map->next == NULL); - - cuda_map_destroy (s->map); - - return true; -} - -static void -map_pop (struct ptx_stream *s) -{ - struct cuda_map *next; - - assert (s != NULL); - - if (s->map->next == NULL) - { - s->map->active = false; - return; - } - - next = s->map->next; - cuda_map_destroy (s->map); - s->map = next; -} - -static CUdeviceptr -map_push (struct ptx_stream *s, size_t size) -{ - struct cuda_map *map = NULL; - struct cuda_map **t; - - assert (s); - assert (s->map); - - /* Select an element to push. */ - if (s->map->active) - map = cuda_map_create (size); - else - { - /* Pop the inactive front element. */ - struct cuda_map *pop = s->map; - s->map = pop->next; - pop->next = NULL; - - if (pop->size < size) - { - cuda_map_destroy (pop); - - map = cuda_map_create (size); - } - else - map = pop; - } - - /* Check that the element is as expected. */ - assert (map->next == NULL); - assert (!map->active); - - /* Mark the element active. 
*/ - map->active = true; - - /* Push the element to the back of the list. */ - for (t = &s->map; (*t) != NULL; t = &(*t)->next) - ; - assert (t != NULL && *t == NULL); - *t = map; - - return map->d; -} - /* Target data function launch information. */ struct targ_fn_launch @@ -412,22 +267,18 @@ struct ptx_image_data struct ptx_image_data *next; }; +struct ptx_free_block +{ + void *ptr; + struct ptx_free_block *next; +}; + struct ptx_device { CUcontext ctx; bool ctx_shared; CUdevice dev; - struct ptx_stream *null_stream; - /* All non-null streams associated with this device (actually context), - either created implicitly or passed in from the user (via - acc_set_cuda_stream). */ - struct ptx_stream *active_streams; - struct { - struct ptx_stream **arr; - int size; - } async_streams; - /* A lock for use when manipulating the above stream list and array. */ - pthread_mutex_t stream_lock; + int ord; bool overlap; bool map; @@ -445,32 +296,13 @@ struct ptx_device struct ptx_image_data *images; /* Images loaded on device. */ pthread_mutex_t image_lock; /* Lock for above list. 
*/ - - struct ptx_device *next; -}; -enum ptx_event_type -{ - PTX_EVT_MEM, - PTX_EVT_KNL, - PTX_EVT_SYNC, - PTX_EVT_ASYNC_CLEANUP -}; - -struct ptx_event -{ - CUevent *evt; - int type; - void *addr; - int ord; - int val; + struct ptx_free_block *free_blocks; + pthread_mutex_t free_blocks_lock; - struct ptx_event *next; + struct ptx_device *next; }; -static pthread_mutex_t ptx_event_lock; -static struct ptx_event *ptx_events; - static struct ptx_device **ptx_devices; static inline struct nvptx_thread * @@ -479,193 +311,6 @@ nvptx_thread (void) return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread (); } -static bool -init_streams_for_device (struct ptx_device *ptx_dev, int concurrency) -{ - int i; - struct ptx_stream *null_stream - = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream)); - - null_stream->stream = NULL; - null_stream->host_thread = pthread_self (); - null_stream->multithreaded = true; - if (!map_init (null_stream)) - return false; - - ptx_dev->null_stream = null_stream; - ptx_dev->active_streams = NULL; - pthread_mutex_init (&ptx_dev->stream_lock, NULL); - - if (concurrency < 1) - concurrency = 1; - - /* This is just a guess -- make space for as many async streams as the - current device is capable of concurrently executing. This can grow - later as necessary. No streams are created yet. 
*/ - ptx_dev->async_streams.arr - = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *)); - ptx_dev->async_streams.size = concurrency; - - for (i = 0; i < concurrency; i++) - ptx_dev->async_streams.arr[i] = NULL; - - return true; -} - -static bool -fini_streams_for_device (struct ptx_device *ptx_dev) -{ - free (ptx_dev->async_streams.arr); - - bool ret = true; - while (ptx_dev->active_streams != NULL) - { - struct ptx_stream *s = ptx_dev->active_streams; - ptx_dev->active_streams = ptx_dev->active_streams->next; - - ret &= map_fini (s); - - CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream); - if (r != CUDA_SUCCESS) - { - GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r)); - ret = false; - } - free (s); - } - - ret &= map_fini (ptx_dev->null_stream); - free (ptx_dev->null_stream); - return ret; -} - -/* Select a stream for (OpenACC-semantics) ASYNC argument for the current - thread THREAD (and also current device/context). If CREATE is true, create - the stream if it does not exist (or use EXISTING if it is non-NULL), and - associate the stream with the same thread argument. Returns stream to use - as result. */ - -static struct ptx_stream * -select_stream_for_async (int async, pthread_t thread, bool create, - CUstream existing) -{ - struct nvptx_thread *nvthd = nvptx_thread (); - /* Local copy of TLS variable. */ - struct ptx_device *ptx_dev = nvthd->ptx_dev; - struct ptx_stream *stream = NULL; - int orig_async = async; - - /* The special value acc_async_noval (-1) maps (for now) to an - implicitly-created stream, which is then handled the same as any other - numbered async stream. Other options are available, e.g. using the null - stream for anonymous async operations, or choosing an idle stream from an - active set. But, stick with this for now. 
*/ - if (async > acc_async_sync) - async++; - - if (create) - pthread_mutex_lock (&ptx_dev->stream_lock); - - /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the - null stream, and in fact better performance may be obtainable if it doesn't - (because the null stream enforces overly-strict synchronisation with - respect to other streams for legacy reasons, and that's probably not - needed with OpenACC). Maybe investigate later. */ - if (async == acc_async_sync) - stream = ptx_dev->null_stream; - else if (async >= 0 && async < ptx_dev->async_streams.size - && ptx_dev->async_streams.arr[async] && !(create && existing)) - stream = ptx_dev->async_streams.arr[async]; - else if (async >= 0 && create) - { - if (async >= ptx_dev->async_streams.size) - { - int i, newsize = ptx_dev->async_streams.size * 2; - - if (async >= newsize) - newsize = async + 1; - - ptx_dev->async_streams.arr - = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr, - newsize * sizeof (struct ptx_stream *)); - - for (i = ptx_dev->async_streams.size; i < newsize; i++) - ptx_dev->async_streams.arr[i] = NULL; - - ptx_dev->async_streams.size = newsize; - } - - /* Create a new stream on-demand if there isn't one already, or if we're - setting a particular async value to an existing (externally-provided) - stream. */ - if (!ptx_dev->async_streams.arr[async] || existing) - { - CUresult r; - struct ptx_stream *s - = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream)); - - if (existing) - s->stream = existing; - else - { - r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream, - CU_STREAM_DEFAULT); - if (r != CUDA_SUCCESS) - { - pthread_mutex_unlock (&ptx_dev->stream_lock); - GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", - cuda_error (r)); - } - } - - /* If CREATE is true, we're going to be queueing some work on this - stream. Associate it with the current host thread. 
*/ - s->host_thread = thread; - s->multithreaded = false; - - if (!map_init (s)) - { - pthread_mutex_unlock (&ptx_dev->stream_lock); - GOMP_PLUGIN_fatal ("map_init fail"); - } - - s->next = ptx_dev->active_streams; - ptx_dev->active_streams = s; - ptx_dev->async_streams.arr[async] = s; - } - - stream = ptx_dev->async_streams.arr[async]; - } - else if (async < 0) - { - if (create) - pthread_mutex_unlock (&ptx_dev->stream_lock); - GOMP_PLUGIN_fatal ("bad async %d", async); - } - - if (create) - { - assert (stream != NULL); - - /* If we're trying to use the same stream from different threads - simultaneously, set stream->multithreaded to true. This affects the - behaviour of acc_async_test_all and acc_wait_all, which are supposed to - only wait for asynchronous launches from the same host thread they are - invoked on. If multiple threads use the same async value, we make note - of that here and fall back to testing/waiting for all threads in those - functions. */ - if (thread != stream->host_thread) - stream->multithreaded = true; - - pthread_mutex_unlock (&ptx_dev->stream_lock); - } - else if (stream && !stream->multithreaded - && !pthread_equal (stream->host_thread, thread)) - GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async); - - return stream; -} - /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK should be locked on entry and remains locked on exit. */ @@ -677,9 +322,6 @@ nvptx_init (void) if (instantiated_devices != 0) return true; - ptx_events = NULL; - pthread_mutex_init (&ptx_event_lock, NULL); - if (!init_cuda_lib ()) return false; @@ -703,6 +345,11 @@ nvptx_attach_host_thread_to_device (int n) CUcontext thd_ctx; r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev); + if (r == CUDA_ERROR_NOT_PERMITTED) + { + /* Assume we're in a CUDA callback, just return true. 
*/ + return true; + } if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) { GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r)); @@ -847,8 +494,8 @@ nvptx_open_device (int n) ptx_dev->images = NULL; pthread_mutex_init (&ptx_dev->image_lock, NULL); - if (!init_streams_for_device (ptx_dev, async_engines)) - return NULL; + ptx_dev->free_blocks = NULL; + pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL); return ptx_dev; } @@ -859,9 +506,15 @@ nvptx_close_device (struct ptx_device *ptx_dev) if (!ptx_dev) return true; - if (!fini_streams_for_device (ptx_dev)) - return false; - + for (struct ptx_free_block *b = ptx_dev->free_blocks; b;) + { + struct ptx_free_block *b_next = b->next; + CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr); + free (b); + b = b_next; + } + + pthread_mutex_destroy (&ptx_dev->free_blocks_lock); pthread_mutex_destroy (&ptx_dev->image_lock); if (!ptx_dev->ctx_shared) @@ -1040,140 +693,20 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, return true; } -static void -event_gc (bool memmap_lockable) -{ - struct ptx_event *ptx_event = ptx_events; - struct ptx_event *async_cleanups = NULL; - struct nvptx_thread *nvthd = nvptx_thread (); - - pthread_mutex_lock (&ptx_event_lock); - - while (ptx_event != NULL) - { - CUresult r; - struct ptx_event *e = ptx_event; - - ptx_event = ptx_event->next; - - if (e->ord != nvthd->ptx_dev->ord) - continue; - - r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt); - if (r == CUDA_SUCCESS) - { - bool append_async = false; - CUevent *te; - - te = e->evt; - - switch (e->type) - { - case PTX_EVT_MEM: - case PTX_EVT_SYNC: - break; - - case PTX_EVT_KNL: - map_pop (e->addr); - break; - - case PTX_EVT_ASYNC_CLEANUP: - { - /* The function gomp_plugin_async_unmap_vars needs to claim the - memory-map splay tree lock for the current device, so we - can't call it when one of our callers has already claimed - the lock. In that case, just delay the GC for this event - until later. 
*/ - if (!memmap_lockable) - continue; - - append_async = true; - } - break; - } - - CUDA_CALL_NOCHECK (cuEventDestroy, *te); - free ((void *)te); - - /* Unlink 'e' from ptx_events list. */ - if (ptx_events == e) - ptx_events = ptx_events->next; - else - { - struct ptx_event *e_ = ptx_events; - while (e_->next != e) - e_ = e_->next; - e_->next = e_->next->next; - } - - if (append_async) - { - e->next = async_cleanups; - async_cleanups = e; - } - else - free (e); - } - } - - pthread_mutex_unlock (&ptx_event_lock); - - /* We have to do these here, after ptx_event_lock is released. */ - while (async_cleanups) - { - struct ptx_event *e = async_cleanups; - async_cleanups = async_cleanups->next; - - GOMP_PLUGIN_async_unmap_vars (e->addr, e->val); - free (e); - } -} - -static void -event_add (enum ptx_event_type type, CUevent *e, void *h, int val) -{ - struct ptx_event *ptx_event; - struct nvptx_thread *nvthd = nvptx_thread (); - - assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC - || type == PTX_EVT_ASYNC_CLEANUP); - - ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event)); - ptx_event->type = type; - ptx_event->evt = e; - ptx_event->addr = h; - ptx_event->ord = nvthd->ptx_dev->ord; - ptx_event->val = val; - - pthread_mutex_lock (&ptx_event_lock); - - ptx_event->next = ptx_events; - ptx_events = ptx_event; - - pthread_mutex_unlock (&ptx_event_lock); -} - static void nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, - int async, unsigned *dims, void *targ_mem_desc) + unsigned *dims, void *targ_mem_desc, + CUdeviceptr dp, CUstream stream) { struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn; CUfunction function; - CUresult r; int i; - struct ptx_stream *dev_str; void *kargs[1]; - void *hp; - CUdeviceptr dp = 0; struct nvptx_thread *nvthd = nvptx_thread (); int warp_size = nvthd->ptx_dev->warp_size; - const char *maybe_abort_msg = "(perhaps abort was called)"; function = targ_fn->fn; - dev_str = 
select_stream_for_async (async, pthread_self (), false, NULL); - assert (dev_str == nvthd->current_stream); - /* Initialize the launch dimensions. Typically this is constant, provided by the device compiler, but we must permit runtime values. */ @@ -1361,27 +894,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, dims[GOMP_DIM_VECTOR]); } - if (mapnum > 0) - { - /* This reserves a chunk of a pre-allocated page of memory mapped on both - the host and the device. HP is a host pointer to the new chunk, and DP is - the corresponding device pointer. */ - pthread_mutex_lock (&ptx_event_lock); - dp = map_push (dev_str, mapnum * sizeof (void *)); - pthread_mutex_unlock (&ptx_event_lock); - - GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); - - /* Copy the array of arguments to the mapped page. */ - hp = alloca(sizeof(void *) * mapnum); - for (i = 0; i < mapnum; i++) - ((void **) hp)[i] = devaddrs[i]; - - /* Copy the (device) pointers to arguments to the device */ - CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp, - mapnum * sizeof (void *)); - } - GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" " gangs=%u, workers=%u, vectors=%u\n", __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG], @@ -1392,62 +904,14 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, // num_gangs nctaid.x // num_workers ntid.y // vector length ntid.x - kargs[0] = &dp; CUDA_CALL_ASSERT (cuLaunchKernel, function, dims[GOMP_DIM_GANG], 1, 1, dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, - 0, dev_str->stream, kargs, 0); - -#ifndef DISABLE_ASYNC - if (async < acc_async_noval) - { - r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream); - if (r == CUDA_ERROR_LAUNCH_FAILED) - GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r), - maybe_abort_msg); - else if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); - } - else - { - CUevent *e; - - e = (CUevent *)GOMP_PLUGIN_malloc (sizeof 
(CUevent)); - - r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); - if (r == CUDA_ERROR_LAUNCH_FAILED) - GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r), - maybe_abort_msg); - else if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); - - event_gc (true); - - CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream); - - if (mapnum > 0) - event_add (PTX_EVT_KNL, e, (void *)dev_str, 0); - } -#else - r = CUDA_CALL_NOCHECK (cuCtxSynchronize, ); - if (r == CUDA_ERROR_LAUNCH_FAILED) - GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r), - maybe_abort_msg); - else if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r)); -#endif + 0, stream, kargs, 0); GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__, targ_fn->launch->fn); - -#ifndef DISABLE_ASYNC - if (async < acc_async_noval) -#endif - { - if (mapnum > 0) - map_pop (dev_str); - } } void * openacc_get_current_cuda_context (void); @@ -1462,8 +926,21 @@ nvptx_alloc (size_t s) } static bool -nvptx_free (void *p) +nvptx_free (void *p, struct ptx_device *ptx_dev) { + /* Assume callback context if this is null. 
*/ + if (GOMP_PLUGIN_acc_thread () == NULL) + { + struct ptx_free_block *n + = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block)); + n->ptr = p; + pthread_mutex_lock (&ptx_dev->free_blocks_lock); + n->next = ptx_dev->free_blocks; + ptx_dev->free_blocks = n; + pthread_mutex_unlock (&ptx_dev->free_blocks_lock); + return true; + } + CUdeviceptr pb; size_t ps; @@ -1478,318 +955,19 @@ nvptx_free (void *p) return true; } - -static bool -nvptx_host2dev (void *d, const void *h, size_t s) +static void * +nvptx_get_current_cuda_device (void) { - CUdeviceptr pb; - size_t ps; struct nvptx_thread *nvthd = nvptx_thread (); - if (!s) - return true; - if (!d) - { - GOMP_PLUGIN_error ("invalid device address"); - return false; - } - - CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d); - - if (!pb) - { - GOMP_PLUGIN_error ("invalid device address"); - return false; - } - if (!h) - { - GOMP_PLUGIN_error ("invalid host address"); - return false; - } - if (d == h) - { - GOMP_PLUGIN_error ("invalid host or device address"); - return false; - } - if ((void *)(d + s) > (void *)(pb + ps)) - { - GOMP_PLUGIN_error ("invalid size"); - return false; - } - -#ifndef DISABLE_ASYNC - if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream) - { - CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); - CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); - event_gc (false); - CUDA_CALL (cuMemcpyHtoDAsync, - (CUdeviceptr) d, h, s, nvthd->current_stream->stream); - CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream); - event_add (PTX_EVT_MEM, e, (void *)h, 0); - } - else -#endif - CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s); + if (!nvthd || !nvthd->ptx_dev) + return NULL; - return true; + return &nvthd->ptx_dev->dev; } -static bool -nvptx_dev2host (void *h, const void *d, size_t s) -{ - CUdeviceptr pb; - size_t ps; - struct nvptx_thread *nvthd = nvptx_thread (); - - if (!s) - return true; - if (!d) - { - GOMP_PLUGIN_error ("invalid device address"); - 
return false; - } - - CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d); - - if (!pb) - { - GOMP_PLUGIN_error ("invalid device address"); - return false; - } - if (!h) - { - GOMP_PLUGIN_error ("invalid host address"); - return false; - } - if (d == h) - { - GOMP_PLUGIN_error ("invalid host or device address"); - return false; - } - if ((void *)(d + s) > (void *)(pb + ps)) - { - GOMP_PLUGIN_error ("invalid size"); - return false; - } - -#ifndef DISABLE_ASYNC - if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream) - { - CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); - CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); - event_gc (false); - CUDA_CALL (cuMemcpyDtoHAsync, - h, (CUdeviceptr) d, s, nvthd->current_stream->stream); - CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream); - event_add (PTX_EVT_MEM, e, (void *)h, 0); - } - else -#endif - CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s); - - return true; -} - -static void -nvptx_set_async (int async) -{ - struct nvptx_thread *nvthd = nvptx_thread (); - nvthd->current_stream - = select_stream_for_async (async, pthread_self (), true, NULL); -} - -static int -nvptx_async_test (int async) -{ - CUresult r; - struct ptx_stream *s; - - s = select_stream_for_async (async, pthread_self (), false, NULL); - if (!s) - return 1; - - r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream); - if (r == CUDA_SUCCESS) - { - /* The oacc-parallel.c:goacc_wait function calls this hook to determine - whether all work has completed on this stream, and if so omits the call - to the wait hook. If that happens, event_gc might not get called - (which prevents variables from getting unmapped and their associated - device storage freed), so call it here. 
*/ - event_gc (true); - return 1; - } - else if (r == CUDA_ERROR_NOT_READY) - return 0; - - GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r)); - - return 0; -} - -static int -nvptx_async_test_all (void) -{ - struct ptx_stream *s; - pthread_t self = pthread_self (); - struct nvptx_thread *nvthd = nvptx_thread (); - - pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); - - for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next) - { - if ((s->multithreaded || pthread_equal (s->host_thread, self)) - && CUDA_CALL_NOCHECK (cuStreamQuery, - s->stream) == CUDA_ERROR_NOT_READY) - { - pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); - return 0; - } - } - - pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); - - event_gc (true); - - return 1; -} - -static void -nvptx_wait (int async) -{ - struct ptx_stream *s; - - s = select_stream_for_async (async, pthread_self (), false, NULL); - if (!s) - return; - - CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream); - - event_gc (true); -} - -static void -nvptx_wait_async (int async1, int async2) -{ - CUevent *e; - struct ptx_stream *s1, *s2; - pthread_t self = pthread_self (); - - s1 = select_stream_for_async (async1, self, false, NULL); - if (!s1) - return; - - /* The stream that is waiting (rather than being waited for) doesn't - necessarily have to exist already. */ - s2 = select_stream_for_async (async2, self, true, NULL); - - /* A stream is always synchronized with itself. 
*/ - if (s1 == s2) - return; - - e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); - - CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); - - event_gc (true); - - CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream); - - event_add (PTX_EVT_SYNC, e, NULL, 0); - - CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0); -} - -static void -nvptx_wait_all (void) -{ - CUresult r; - struct ptx_stream *s; - pthread_t self = pthread_self (); - struct nvptx_thread *nvthd = nvptx_thread (); - - pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); - - /* Wait for active streams initiated by this thread (or by multiple threads) - to complete. */ - for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next) - { - if (s->multithreaded || pthread_equal (s->host_thread, self)) - { - r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream); - if (r == CUDA_SUCCESS) - continue; - else if (r != CUDA_ERROR_NOT_READY) - GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r)); - - CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream); - } - } - - pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); - - event_gc (true); -} - -static void -nvptx_wait_all_async (int async) -{ - struct ptx_stream *waiting_stream, *other_stream; - CUevent *e; - struct nvptx_thread *nvthd = nvptx_thread (); - pthread_t self = pthread_self (); - - /* The stream doing the waiting. This could be the first mention of the - stream, so create it if necessary. */ - waiting_stream - = select_stream_for_async (async, pthread_self (), true, NULL); - - /* Launches on the null stream already block on other streams in the - context. 
*/ - if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream) - return; - - event_gc (true); - - pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); - - for (other_stream = nvthd->ptx_dev->active_streams; - other_stream != NULL; - other_stream = other_stream->next) - { - if (!other_stream->multithreaded - && !pthread_equal (other_stream->host_thread, self)) - continue; - - e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); - - CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); - - /* Record an event on the waited-for stream. */ - CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream); - - event_add (PTX_EVT_SYNC, e, NULL, 0); - - CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0); - } - - pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); -} - -static void * -nvptx_get_current_cuda_device (void) -{ - struct nvptx_thread *nvthd = nvptx_thread (); - - if (!nvthd || !nvthd->ptx_dev) - return NULL; - - return &nvthd->ptx_dev->dev; -} - -static void * -nvptx_get_current_cuda_context (void) +static void * +nvptx_get_current_cuda_context (void) { struct nvptx_thread *nvthd = nvptx_thread (); @@ -1799,75 +977,6 @@ nvptx_get_current_cuda_context (void) return nvthd->ptx_dev->ctx; } -static void * -nvptx_get_cuda_stream (int async) -{ - struct ptx_stream *s; - struct nvptx_thread *nvthd = nvptx_thread (); - - if (!nvthd || !nvthd->ptx_dev) - return NULL; - - s = select_stream_for_async (async, pthread_self (), false, NULL); - - return s ? s->stream : NULL; -} - -static int -nvptx_set_cuda_stream (int async, void *stream) -{ - struct ptx_stream *oldstream; - pthread_t self = pthread_self (); - struct nvptx_thread *nvthd = nvptx_thread (); - - /* Due to the "null_stream" usage for "acc_async_sync", this cannot be used - to change the stream handle associated with "acc_async_sync". 
*/ - if (async == acc_async_sync) - { - GOMP_PLUGIN_debug (0, "Refusing request to set CUDA stream associated" - " with \"acc_async_sync\"\n"); - return 0; - } - - pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); - - /* We have a list of active streams and an array mapping async values to - entries of that list. We need to take "ownership" of the passed-in stream, - and add it to our list, removing the previous entry also (if there was one) - in order to prevent resource leaks. Note the potential for surprise - here: maybe we should keep track of passed-in streams and leave it up to - the user to tidy those up, but that doesn't work for stream handles - returned from acc_get_cuda_stream above... */ - - oldstream = select_stream_for_async (async, self, false, NULL); - - if (oldstream) - { - if (nvthd->ptx_dev->active_streams == oldstream) - nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next; - else - { - struct ptx_stream *s = nvthd->ptx_dev->active_streams; - while (s->next != oldstream) - s = s->next; - s->next = s->next->next; - } - - CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream); - - if (!map_fini (oldstream)) - GOMP_PLUGIN_fatal ("error when freeing host memory"); - - free (oldstream); - } - - pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); - - (void) select_stream_for_async (async, self, true, (CUstream) stream); - - return 1; -} - /* Plugin entry points. 
*/ const char * @@ -2107,100 +1216,116 @@ GOMP_OFFLOAD_alloc (int ord, size_t size) { if (!nvptx_attach_host_thread_to_device (ord)) return NULL; - return nvptx_alloc (size); -} -bool -GOMP_OFFLOAD_free (int ord, void *ptr) -{ - return (nvptx_attach_host_thread_to_device (ord) - && nvptx_free (ptr)); -} + struct ptx_device *ptx_dev = ptx_devices[ord]; + struct ptx_free_block *blocks, *tmp; -bool -GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n) -{ - return (nvptx_attach_host_thread_to_device (ord) - && nvptx_dev2host (dst, src, n)); -} + pthread_mutex_lock (&ptx_dev->free_blocks_lock); + blocks = ptx_dev->free_blocks; + ptx_dev->free_blocks = NULL; + pthread_mutex_unlock (&ptx_dev->free_blocks_lock); -bool -GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n) -{ - return (nvptx_attach_host_thread_to_device (ord) - && nvptx_host2dev (dst, src, n)); + while (blocks) + { + tmp = blocks->next; + nvptx_free (blocks->ptr, ptx_dev); + free (blocks); + blocks = tmp; + } + + return nvptx_alloc (size); } bool -GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n) +GOMP_OFFLOAD_free (int ord, void *ptr) { - struct ptx_device *ptx_dev = ptx_devices[ord]; - CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, - ptx_dev->null_stream->stream); - return true; + return (nvptx_attach_host_thread_to_device (ord) + && nvptx_free (ptr, ptx_devices[ord])); } -void (*device_run) (int n, void *fn_ptr, void *vars) = NULL; - void GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum, void **hostaddrs, void **devaddrs, - int async, unsigned *dims, void *targ_mem_desc) + unsigned *dims, void *targ_mem_desc) { - nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc); -} + GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); -void -GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async) -{ - struct nvptx_thread *nvthd = nvptx_thread (); - CUevent *e = (CUevent *) 
GOMP_PLUGIN_malloc (sizeof (CUevent)); + void **hp = NULL; + CUdeviceptr dp = 0; - CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); - CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream); - event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async); -} + if (mapnum > 0) + { + hp = alloca (mapnum * sizeof (void *)); + for (int i = 0; i < mapnum; i++) + hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]); + CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *)); + } -int -GOMP_OFFLOAD_openacc_async_test (int async) -{ - return nvptx_async_test (async); -} + /* Copy the (device) pointers to arguments to the device (dp and hp might in + fact have the same value on a unified-memory system). */ + if (mapnum > 0) + CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp, + mapnum * sizeof (void *)); -int -GOMP_OFFLOAD_openacc_async_test_all (void) -{ - return nvptx_async_test_all (); -} + nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, + dp, NULL); -void -GOMP_OFFLOAD_openacc_async_wait (int async) -{ - nvptx_wait (async); + CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL); + const char *maybe_abort_msg = "(perhaps abort was called)"; + if (r == CUDA_ERROR_LAUNCH_FAILED) + GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r), + maybe_abort_msg); + else if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); + CUDA_CALL_ASSERT (cuMemFree, dp); } -void -GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2) +static void +cuda_free_argmem (void *ptr) { - nvptx_wait_async (async1, async2); + void **block = (void **) ptr; + nvptx_free (block[0], (struct ptx_device *) block[1]); + free (block); } void -GOMP_OFFLOAD_openacc_async_wait_all (void) +GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum, + void **hostaddrs, void **devaddrs, + unsigned *dims, void *targ_mem_desc, + struct goacc_asyncqueue *aq) { - nvptx_wait_all (); -} + 
GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); -void -GOMP_OFFLOAD_openacc_async_wait_all_async (int async) -{ - nvptx_wait_all_async (async); -} + void **hp = NULL; + CUdeviceptr dp = 0; + void **block = NULL; -void -GOMP_OFFLOAD_openacc_async_set_async (int async) -{ - nvptx_set_async (async); + if (mapnum > 0) + { + block = (void **) GOMP_PLUGIN_malloc ((mapnum + 2) * sizeof (void *)); + hp = block + 2; + for (int i = 0; i < mapnum; i++) + hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]); + CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *)); + } + + /* Copy the (device) pointers to arguments to the device (dp and hp might in + fact have the same value on a unified-memory system). */ + if (mapnum > 0) + { + CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp, + mapnum * sizeof (void *), aq->cuda_stream); + block[0] = (void *) dp; + + struct nvptx_thread *nvthd = + (struct nvptx_thread *) GOMP_PLUGIN_acc_thread (); + block[1] = (void *) nvthd->ptx_dev; + } + nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, + dp, aq->cuda_stream); + + if (mapnum > 0) + GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block); } void * @@ -2222,7 +1347,6 @@ GOMP_OFFLOAD_openacc_create_thread_data (int ord) if (!thd_ctx) CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx); - nvthd->current_stream = ptx_dev->null_stream; nvthd->ptx_dev = ptx_dev; return (void *) nvthd; @@ -2246,20 +1370,184 @@ GOMP_OFFLOAD_openacc_cuda_get_current_context (void) return nvptx_get_current_cuda_context (); } -/* NOTE: This returns a CUstream, not a ptx_stream pointer. */ - +/* This returns a CUstream. */ void * -GOMP_OFFLOAD_openacc_cuda_get_stream (int async) +GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq) +{ + return (void *) aq->cuda_stream; +} + +/* This takes a CUstream. 
*/ +int +GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream) +{ + if (aq->cuda_stream) + { + CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream); + CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream); + } + + aq->cuda_stream = (CUstream) stream; + return 1; +} + +struct goacc_asyncqueue * +GOMP_OFFLOAD_openacc_async_construct (void) { - return nvptx_get_cuda_stream (async); + CUstream stream = NULL; + CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT); + + struct goacc_asyncqueue *aq + = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue)); + aq->cuda_stream = stream; + return aq; } -/* NOTE: This takes a CUstream, not a ptx_stream pointer. */ +bool +GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq) +{ + CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream); + free (aq); + return true; +} int -GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream) +GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq) { - return nvptx_set_cuda_stream (async, stream); + CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream); + if (r == CUDA_SUCCESS) + return 1; + if (r == CUDA_ERROR_NOT_READY) + return 0; + + GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r)); + return -1; +} + +bool +GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq) +{ + CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream); + return true; +} + +bool +GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1, + struct goacc_asyncqueue *aq2) +{ + CUevent e; + CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING); + CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream); + CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0); + return true; +} + +static void +cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr) +{ + if (res != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res)); + struct nvptx_callback *cb = 
(struct nvptx_callback *) ptr; + cb->fn (cb->ptr); + free (ptr); +} + +void +GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq, + void (*callback_fn)(void *), + void *userptr) +{ + struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b)); + b->fn = callback_fn; + b->ptr = userptr; + b->aq = aq; + CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream, + cuda_callback_wrapper, (void *) b, 0); +} + +static bool +cuda_memcpy_sanity_check (const void *h, const void *d, size_t s) +{ + CUdeviceptr pb; + size_t ps; + if (!s) + return true; + if (!d) + { + GOMP_PLUGIN_error ("invalid device address"); + return false; + } + CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d); + if (!pb) + { + GOMP_PLUGIN_error ("invalid device address"); + return false; + } + if (!h) + { + GOMP_PLUGIN_error ("invalid host address"); + return false; + } + if (d == h) + { + GOMP_PLUGIN_error ("invalid host or device address"); + return false; + } + if ((void *)(d + s) > (void *)(pb + ps)) + { + GOMP_PLUGIN_error ("invalid size"); + return false; + } + return true; +} + +bool +GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n) +{ + if (!nvptx_attach_host_thread_to_device (ord) + || !cuda_memcpy_sanity_check (src, dst, n)) + return false; + CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n); + return true; +} + +bool +GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n) +{ + if (!nvptx_attach_host_thread_to_device (ord) + || !cuda_memcpy_sanity_check (dst, src, n)) + return false; + CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n); + return true; +} + +bool +GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n) +{ + CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL); + return true; +} + +bool +GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src, + size_t n, struct goacc_asyncqueue *aq) +{ + if (!nvptx_attach_host_thread_to_device (ord) + || 
!cuda_memcpy_sanity_check (src, dst, n)) + return false; + CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream); + return true; +} + +bool +GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src, + size_t n, struct goacc_asyncqueue *aq) +{ + if (!nvptx_attach_host_thread_to_device (ord) + || !cuda_memcpy_sanity_check (dst, src, n)) + return false; + CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream); + return true; } /* Adjust launch dimensions: pick good values for number of blocks and warps @@ -2360,8 +1648,7 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args) CU_LAUNCH_PARAM_END }; r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1, - 32, threads, 1, 0, ptx_dev->null_stream->stream, - NULL, config); + 32, threads, 1, 0, NULL, NULL, config); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); diff --git a/libgomp/target.c b/libgomp/target.c index 31148003d0a..2e0905effb3 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -177,6 +177,22 @@ gomp_device_copy (struct gomp_device_descr *devicep, } } +static inline void +goacc_device_copy_async (struct gomp_device_descr *devicep, + bool (*copy_func) (int, void *, const void *, size_t, + struct goacc_asyncqueue *), + const char *dst, void *dstaddr, + const char *src, const void *srcaddr, + size_t size, struct goacc_asyncqueue *aq) +{ + if (!copy_func (devicep->target_id, dstaddr, srcaddr, size, aq)) + { + gomp_mutex_unlock (&devicep->lock); + gomp_fatal ("Copying of %s object [%p..%p) to %s object [%p..%p) failed", + src, srcaddr, srcaddr + size, dst, dstaddr, dstaddr + size); + } +} + /* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses) host to device memory transfers. 
*/ @@ -269,8 +285,9 @@ gomp_to_device_kind_p (int kind) } } -static void +attribute_hidden void gomp_copy_host2dev (struct gomp_device_descr *devicep, + struct goacc_asyncqueue *aq, void *d, const void *h, size_t sz, struct gomp_coalesce_buf *cbuf) { @@ -299,14 +316,23 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep, } } } - gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz); + if (__builtin_expect (aq != NULL, 0)) + goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func, + "dev", d, "host", h, sz, aq); + else + gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz); } -static void +attribute_hidden void gomp_copy_dev2host (struct gomp_device_descr *devicep, + struct goacc_asyncqueue *aq, void *h, const void *d, size_t sz) { - gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz); + if (__builtin_expect (aq != NULL, 0)) + goacc_device_copy_async (devicep, devicep->openacc.async.dev2host_func, + "host", h, "dev", d, sz, aq); + else + gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz); } static void @@ -324,7 +350,8 @@ gomp_free_device_memory (struct gomp_device_descr *devicep, void *devptr) Helper function of gomp_map_vars. 
*/ static inline void -gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn, +gomp_map_vars_existing (struct gomp_device_descr *devicep, + struct goacc_asyncqueue *aq, splay_tree_key oldn, splay_tree_key newn, struct target_var_desc *tgt_var, unsigned char kind, struct gomp_coalesce_buf *cbuf) { @@ -346,7 +373,7 @@ gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn, } if (GOMP_MAP_ALWAYS_TO_P (kind)) - gomp_copy_host2dev (devicep, + gomp_copy_host2dev (devicep, aq, (void *) (oldn->tgt->tgt_start + oldn->tgt_offset + newn->host_start - oldn->host_start), (void *) newn->host_start, @@ -364,8 +391,8 @@ get_kind (bool short_mapkind, void *kinds, int idx) } static void -gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr, - uintptr_t target_offset, uintptr_t bias, +gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq, + uintptr_t host_ptr, uintptr_t target_offset, uintptr_t bias, struct gomp_coalesce_buf *cbuf) { struct gomp_device_descr *devicep = tgt->device_descr; @@ -376,7 +403,7 @@ gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr, if (cur_node.host_start == (uintptr_t) NULL) { cur_node.tgt_offset = (uintptr_t) NULL; - gomp_copy_host2dev (devicep, + gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset), (void *) &cur_node.tgt_offset, sizeof (void *), cbuf); @@ -398,12 +425,13 @@ gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr, array section. Now subtract bias to get what we want to initialize the pointer with. 
*/ cur_node.tgt_offset -= bias; - gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset), + gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset), (void *) &cur_node.tgt_offset, sizeof (void *), cbuf); } static void -gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n, +gomp_map_fields_existing (struct target_mem_desc *tgt, + struct goacc_asyncqueue *aq, splay_tree_key n, size_t first, size_t i, void **hostaddrs, size_t *sizes, void *kinds, struct gomp_coalesce_buf *cbuf) @@ -423,7 +451,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n, && n2->tgt == n->tgt && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset) { - gomp_map_vars_existing (devicep, n2, &cur_node, + gomp_map_vars_existing (devicep, aq, n2, &cur_node, &tgt->list[i], kind & typemask, cbuf); return; } @@ -439,8 +467,8 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n, && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset) { - gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i], - kind & typemask, cbuf); + gomp_map_vars_existing (devicep, aq, n2, &cur_node, + &tgt->list[i], kind & typemask, cbuf); return; } } @@ -451,7 +479,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n, && n2->tgt == n->tgt && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset) { - gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i], + gomp_map_vars_existing (devicep, aq, n2, &cur_node, &tgt->list[i], kind & typemask, cbuf); return; } @@ -483,10 +511,12 @@ gomp_map_val (struct target_mem_desc *tgt, void **hostaddrs, size_t i) return tgt->tgt_start + tgt->list[i].offset; } -attribute_hidden struct target_mem_desc * -gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, - void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds, - bool short_mapkind, enum gomp_map_vars_kind pragma_kind) +static inline 
__attribute__((always_inline)) struct target_mem_desc * +gomp_map_vars_internal (struct gomp_device_descr *devicep, + struct goacc_asyncqueue *aq, size_t mapnum, + void **hostaddrs, void **devaddrs, size_t *sizes, + void *kinds, bool short_mapkind, + enum gomp_map_vars_kind pragma_kind) { size_t i, tgt_align, tgt_size, not_found_cnt = 0; bool has_firstprivate = false; @@ -600,7 +630,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, continue; } for (i = first; i <= last; i++) - gomp_map_fields_existing (tgt, n, first, i, hostaddrs, + gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs, sizes, kinds, NULL); i--; continue; @@ -645,7 +675,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, else n = splay_tree_lookup (mem_map, &cur_node); if (n && n->refcount != REFCOUNT_LINK) - gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i], + gomp_map_vars_existing (devicep, aq, n, &cur_node, &tgt->list[i], kind & typemask, NULL); else { @@ -756,7 +786,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, tgt_size = (tgt_size + align - 1) & ~(align - 1); tgt->list[i].offset = tgt_size; len = sizes[i]; - gomp_copy_host2dev (devicep, + gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + tgt_size), (void *) hostaddrs[i], len, cbufp); tgt_size += len; @@ -790,7 +820,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, continue; } for (i = first; i <= last; i++) - gomp_map_fields_existing (tgt, n, first, i, hostaddrs, + gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs, sizes, kinds, cbufp); i--; continue; @@ -810,7 +840,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i - 1); if (cur_node.tgt_offset) cur_node.tgt_offset -= sizes[i]; - gomp_copy_host2dev (devicep, + gomp_copy_host2dev (devicep, aq, (void *) (n->tgt->tgt_start + n->tgt_offset + cur_node.host_start @@ -831,7 +861,7 @@ gomp_map_vars (struct 
gomp_device_descr *devicep, size_t mapnum, k->host_end = k->host_start + sizeof (void *); splay_tree_key n = splay_tree_lookup (mem_map, k); if (n && n->refcount != REFCOUNT_LINK) - gomp_map_vars_existing (devicep, n, k, &tgt->list[i], + gomp_map_vars_existing (devicep, aq, n, k, &tgt->list[i], kind & typemask, cbufp); else { @@ -884,18 +914,19 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, case GOMP_MAP_FORCE_TOFROM: case GOMP_MAP_ALWAYS_TO: case GOMP_MAP_ALWAYS_TOFROM: - gomp_copy_host2dev (devicep, + gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + k->tgt_offset), (void *) k->host_start, k->host_end - k->host_start, cbufp); break; case GOMP_MAP_POINTER: - gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start, + gomp_map_pointer (tgt, aq, + (uintptr_t) *(void **) k->host_start, k->tgt_offset, sizes[i], cbufp); break; case GOMP_MAP_TO_PSET: - gomp_copy_host2dev (devicep, + gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + k->tgt_offset), (void *) k->host_start, @@ -917,7 +948,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, tgt->list[j].always_copy_from = false; if (k->refcount != REFCOUNT_INFINITY) k->refcount++; - gomp_map_pointer (tgt, + gomp_map_pointer (tgt, aq, (uintptr_t) *(void **) hostaddrs[j], k->tgt_offset + ((uintptr_t) hostaddrs[j] @@ -946,7 +977,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, break; case GOMP_MAP_FORCE_DEVICEPTR: assert (k->host_end - k->host_start == sizeof (void *)); - gomp_copy_host2dev (devicep, + gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + k->tgt_offset), (void *) k->host_start, @@ -965,7 +996,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset); /* We intentionally do not use coalescing here, as it's not data allocated by the current call to this function. 
*/ - gomp_copy_host2dev (devicep, (void *) n->tgt_offset, + gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset, &tgt_addr, sizeof (void *), NULL); } array++; @@ -978,7 +1009,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, for (i = 0; i < mapnum; i++) { cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i); - gomp_copy_host2dev (devicep, + gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + i * sizeof (void *)), (void *) &cur_node.tgt_offset, sizeof (void *), cbufp); @@ -989,7 +1020,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, { long c = 0; for (c = 0; c < cbuf.chunk_cnt; ++c) - gomp_copy_host2dev (devicep, + gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + cbuf.chunks[c].start), (char *) cbuf.buf + (cbuf.chunks[c].start - cbuf.chunks[0].start), @@ -1012,7 +1043,27 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, return tgt; } -static void +attribute_hidden struct target_mem_desc * +gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, + void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds, + bool short_mapkind, enum gomp_map_vars_kind pragma_kind) +{ + return gomp_map_vars_internal (devicep, NULL, mapnum, hostaddrs, devaddrs, + sizes, kinds, short_mapkind, pragma_kind); +} + +attribute_hidden struct target_mem_desc * +gomp_map_vars_async (struct gomp_device_descr *devicep, + struct goacc_asyncqueue *aq, size_t mapnum, + void **hostaddrs, void **devaddrs, size_t *sizes, + void *kinds, bool short_mapkind, + enum gomp_map_vars_kind pragma_kind) +{ + return gomp_map_vars_internal (devicep, aq, mapnum, hostaddrs, devaddrs, + sizes, kinds, short_mapkind, pragma_kind); +} + +attribute_hidden void gomp_unmap_tgt (struct target_mem_desc *tgt) { /* Deallocate on target the tgt->tgt_start .. tgt->tgt_end region. 
*/ @@ -1040,12 +1091,24 @@ gomp_remove_var (struct gomp_device_descr *devicep, splay_tree_key k) return is_tgt_unmapped; } +static void +gomp_unref_tgt (void *ptr) +{ + struct target_mem_desc *tgt = (struct target_mem_desc *) ptr; + + if (tgt->refcount > 1) + tgt->refcount--; + else + gomp_unmap_tgt (tgt); +} + /* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant variables back from device to host: if it is false, it is assumed that this has been done already. */ -attribute_hidden void -gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom) +static inline __attribute__((always_inline)) void +gomp_unmap_vars_internal (struct target_mem_desc *tgt, bool do_copyfrom, + struct goacc_asyncqueue *aq) { struct gomp_device_descr *devicep = tgt->device_descr; @@ -1082,7 +1145,7 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom) if ((do_unmap && do_copyfrom && tgt->list[i].copy_from) || tgt->list[i].always_copy_from) - gomp_copy_dev2host (devicep, + gomp_copy_dev2host (devicep, aq, (void *) (k->host_start + tgt->list[i].offset), (void *) (k->tgt->tgt_start + k->tgt_offset + tgt->list[i].offset), @@ -1091,14 +1154,28 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom) gomp_remove_var (devicep, k); } - if (tgt->refcount > 1) - tgt->refcount--; + if (aq) + devicep->openacc.async.queue_callback_func (aq, gomp_unref_tgt, + (void *) tgt); else - gomp_unmap_tgt (tgt); + gomp_unref_tgt ((void *) tgt); gomp_mutex_unlock (&devicep->lock); } +attribute_hidden void +gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom) +{ + gomp_unmap_vars_internal (tgt, do_copyfrom, NULL); +} + +attribute_hidden void +gomp_unmap_vars_async (struct target_mem_desc *tgt, bool do_copyfrom, + struct goacc_asyncqueue *aq) +{ + gomp_unmap_vars_internal (tgt, do_copyfrom, aq); +} + static void gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs, size_t *sizes, void *kinds, bool short_mapkind) @@ -1148,9 +1225,10 
@@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs, size_t size = cur_node.host_end - cur_node.host_start; if (GOMP_MAP_COPY_TO_P (kind & typemask)) - gomp_copy_host2dev (devicep, devaddr, hostaddr, size, NULL); + gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size, + NULL); if (GOMP_MAP_COPY_FROM_P (kind & typemask)) - gomp_copy_dev2host (devicep, hostaddr, devaddr, size); + gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size); } } gomp_mutex_unlock (&devicep->lock); @@ -1443,9 +1521,24 @@ gomp_init_device (struct gomp_device_descr *devicep) false); } + /* Initialize OpenACC asynchronous queues. */ + goacc_init_asyncqueues (devicep); + devicep->state = GOMP_DEVICE_INITIALIZED; } +/* This function finalizes the target device, specified by DEVICEP. DEVICEP + must be locked on entry, and remains locked on return. */ + +attribute_hidden bool +gomp_fini_device (struct gomp_device_descr *devicep) +{ + bool ret = goacc_fini_asyncqueues (devicep); + ret &= devicep->fini_device_func (devicep->target_id); + devicep->state = GOMP_DEVICE_FINALIZED; + return ret; +} + attribute_hidden void gomp_unload_device (struct gomp_device_descr *devicep) { @@ -1954,7 +2047,7 @@ gomp_exit_data (struct gomp_device_descr *devicep, size_t mapnum, if ((kind == GOMP_MAP_FROM && k->refcount == 0) || kind == GOMP_MAP_ALWAYS_FROM) - gomp_copy_dev2host (devicep, (void *) cur_node.host_start, + gomp_copy_dev2host (devicep, NULL, (void *) cur_node.host_start, (void *) (k->tgt->tgt_start + k->tgt_offset + cur_node.host_start - k->host_start), @@ -2636,20 +2729,20 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device, if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200) { if (!DLSYM_OPT (openacc.exec, openacc_exec) - || !DLSYM_OPT (openacc.register_async_cleanup, - openacc_register_async_cleanup) - || !DLSYM_OPT (openacc.async_test, openacc_async_test) - || !DLSYM_OPT (openacc.async_test_all, openacc_async_test_all) - || !DLSYM_OPT 
(openacc.async_wait, openacc_async_wait) - || !DLSYM_OPT (openacc.async_wait_async, openacc_async_wait_async) - || !DLSYM_OPT (openacc.async_wait_all, openacc_async_wait_all) - || !DLSYM_OPT (openacc.async_wait_all_async, - openacc_async_wait_all_async) - || !DLSYM_OPT (openacc.async_set_async, openacc_async_set_async) || !DLSYM_OPT (openacc.create_thread_data, openacc_create_thread_data) || !DLSYM_OPT (openacc.destroy_thread_data, - openacc_destroy_thread_data)) + openacc_destroy_thread_data) + || !DLSYM_OPT (openacc.async.construct, openacc_async_construct) + || !DLSYM_OPT (openacc.async.destruct, openacc_async_destruct) + || !DLSYM_OPT (openacc.async.test, openacc_async_test) + || !DLSYM_OPT (openacc.async.synchronize, openacc_async_synchronize) + || !DLSYM_OPT (openacc.async.serialize, openacc_async_serialize) + || !DLSYM_OPT (openacc.async.queue_callback, + openacc_async_queue_callback) + || !DLSYM_OPT (openacc.async.exec, openacc_async_exec) + || !DLSYM_OPT (openacc.async.dev2host, openacc_async_dev2host) + || !DLSYM_OPT (openacc.async.host2dev, openacc_async_host2dev)) { /* Require all the OpenACC handlers if we have GOMP_OFFLOAD_CAP_OPENACC_200. */ @@ -2700,10 +2793,7 @@ gomp_target_fini (void) struct gomp_device_descr *devicep = &devices[i]; gomp_mutex_lock (&devicep->lock); if (devicep->state == GOMP_DEVICE_INITIALIZED) - { - ret = devicep->fini_device_func (devicep->target_id); - devicep->state = GOMP_DEVICE_FINALIZED; - } + ret = gomp_fini_device (devicep); gomp_mutex_unlock (&devicep->lock); if (!ret) gomp_fatal ("device finalization failed"); -- 2.30.2