From d93bdab53b8de8677bca3af17fe8072458ea3f6b Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Wed, 8 Apr 2015 15:58:33 +0000 Subject: [PATCH] mkoffload.c (process): Support variable mapping. gcc/ * config/nvptx/mkoffload.c (process): Support variable mapping. libgomp/ * libgomp.h (target_mem_desc): Remove mem_map field. (acc_dispatch_t): Remove open_device_func, close_device_func, get_device_num_func, set_device_num_func, target_data members. Change create_thread_data_func argument to device number instead of generic pointer. * oacc-async.c (assert.h): Include. (acc_async_test, acc_async_test_all, acc_wait, acc_wait_async) (acc_wait_all, acc_wait_all_async): Use current host thread's active device, not base_dev. * oacc-cuda.c (acc_get_current_cuda_device) (acc_get_current_cuda_context, acc_get_cuda_stream) (acc_set_cuda_stream): Likewise. * oacc-host.c (host_dispatch): Don't set open_device_func, close_device_func, get_device_num_func or set_device_num_func. * oacc-init.c (base_dev, init_key): Remove. (cached_base_dev): New. (name_of_acc_device_t): New. (acc_init_1): Initialise default-numbered device, not zeroth. (acc_shutdown_1): Close all devices of a given type. (goacc_destroy_thread): Don't use base_dev. (lazy_open, lazy_init, lazy_init_and_open): Remove. (goacc_attach_host_thread_to_device): New. (acc_init): Reimplement with goacc_attach_host_thread_to_device. (acc_get_num_devices): Don't use base_dev. (acc_set_device_type): Reimplement. (acc_get_device_type): Don't use base_dev. (acc_get_device_num): Tweak logic. (acc_set_device_num): Likewise. (acc_on_device): Use acc_get_device_type. (goacc_runtime_initialize): Initialize cached_base_dev not base_dev. (goacc_lazy_initialize): Reimplement with acc_init and goacc_attach_host_thread_to_device. * oacc-int.h (goacc_thread): Add base_dev field. (base_dev): Remove extern declaration. (goacc_attach_host_thread_to_device): Add prototype. * oacc-mem.c (acc_malloc): Use current thread's device instead of base_dev. 
(acc_free): Likewise. (acc_memcpy_to_device): Likewise. (acc_memcpy_from_device): Likewise. * oacc-parallel.c (select_acc_device): Remove. Replace calls with goacc_lazy_initialize (throughout). (GOACC_parallel): Use tgt_offset to locate target functions. * target.c (gomp_map_vars): Don't set tgt->mem_map. (gomp_unmap_vars): Use devicep->mem_map pointer not tgt->mem_map. (gomp_load_plugin_for_device): Remove open_device, close_device, get_device_num, set_device_num openacc hook initialisation. Don't set openacc.target_data. * plugin/plugin-host.c (GOMP_OFFLOAD_openacc_open_device) (GOMP_OFFLOAD_openacc_close_device) (GOMP_OFFLOAD_openacc_get_device_num) (GOMP_OFFLOAD_openacc_set_device_num): Remove. (GOMP_OFFLOAD_openacc_create_thread_data): Change (unused) argument to int. * plugin/plugin-nvptx.c (ptx_inited): Remove. (instantiated_devices, ptx_dev_lock): New. (struct ptx_image_data): New. (ptx_devices, ptx_images, ptx_image_lock): New. (fini_streams_for_device): Reorder cuStreamDestroy call. (nvptx_get_num_devices): Remove forward declaration. (nvptx_init): Change return type to bool. (nvptx_fini): Remove. (nvptx_attach_host_thread_to_device): New. (nvptx_open_device): Return struct ptx_device* instead of void*. (nvptx_close_device): Change argument type to struct ptx_device*, return type to void. (nvptx_get_num_devices): Use instantiated_devices not ptx_inited. (kernel_target_data, kernel_host_table): Remove static globals. (GOMP_OFFLOAD_register_image, GOMP_OFFLOAD_get_table): Remove. (GOMP_OFFLOAD_init_device): Reimplement. (GOMP_OFFLOAD_fini_device): Likewise. (GOMP_OFFLOAD_load_image, GOMP_OFFLOAD_unload_image): New. (GOMP_OFFLOAD_alloc, GOMP_OFFLOAD_free, GOMP_OFFLOAD_dev2host) (GOMP_OFFLOAD_host2dev): Use ORD argument. (GOMP_OFFLOAD_openacc_open_device) (GOMP_OFFLOAD_openacc_close_device) (GOMP_OFFLOAD_openacc_set_device_num) (GOMP_OFFLOAD_openacc_get_device_num): Remove. (GOMP_OFFLOAD_openacc_create_thread_data): Change argument to int (device number). 
libgomp/testsuite/ * libgomp.oacc-c-c++-common/lib-9.c: Fix devnum check in test. From-SVN: r221922 --- gcc/ChangeLog | 4 + gcc/config/nvptx/mkoffload.c | 8 +- libgomp/ChangeLog | 86 ++++ libgomp/libgomp.h | 17 +- libgomp/oacc-async.c | 44 +- libgomp/oacc-cuda.c | 40 +- libgomp/oacc-host.c | 7 - libgomp/oacc-init.c | 424 +++++++++--------- libgomp/oacc-int.h | 8 +- libgomp/oacc-mem.c | 16 +- libgomp/oacc-parallel.c | 36 +- libgomp/plugin/plugin-host.c | 27 +- libgomp/plugin/plugin-nvptx.c | 318 ++++++++----- libgomp/target.c | 8 +- .../libgomp.oacc-c-c++-common/lib-9.c | 2 +- 15 files changed, 592 insertions(+), 453 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 3cc6b44aca8..e97439f8977 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,7 @@ +2015-04-08 Julian Brown + + * config/nvptx/mkoffload.c (process): Support variable mapping. + 2015-03-27 Trevor Saunders * config/alpha/alpha.c (alpha_use_linkage): Change type of slot to diff --git a/gcc/config/nvptx/mkoffload.c b/gcc/config/nvptx/mkoffload.c index 02c44b6f3e9..dbc68bcfbed 100644 --- a/gcc/config/nvptx/mkoffload.c +++ b/gcc/config/nvptx/mkoffload.c @@ -839,6 +839,7 @@ process (FILE *in, FILE *out) { const char *input = read_file (in); Token *tok = tokenize (input); + unsigned int nvars = 0, nfuncs = 0; do tok = parse_file (tok); @@ -850,16 +851,17 @@ process (FILE *in, FILE *out) write_stmts (out, rev_stmts (fns)); fprintf (out, ";\n\n"); fprintf (out, "static const char *var_mappings[] = {\n"); - for (id_map *id = var_ids; id; id = id->next) + for (id_map *id = var_ids; id; id = id->next, nvars++) fprintf (out, "\t\"%s\"%s\n", id->ptx_name, id->next ? "," : ""); fprintf (out, "};\n\n"); fprintf (out, "static const char *func_mappings[] = {\n"); - for (id_map *id = func_ids; id; id = id->next) + for (id_map *id = func_ids; id; id = id->next, nfuncs++) fprintf (out, "\t\"%s\"%s\n", id->ptx_name, id->next ? 
"," : ""); fprintf (out, "};\n\n"); fprintf (out, "static const void *target_data[] = {\n"); - fprintf (out, " ptx_code, var_mappings, func_mappings\n"); + fprintf (out, " ptx_code, (void*) %u, var_mappings, (void*) %u, " + "func_mappings\n", nvars, nfuncs); fprintf (out, "};\n\n"); fprintf (out, "extern void GOMP_offload_register (const void *, int, void *);\n"); diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index aa1468f1543..4b0a1c91be6 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,89 @@ +2015-04-08 Julian Brown + + * libgomp.h (target_mem_desc: Remove mem_map field. + (acc_dispatch_t): Remove open_device_func, close_device_func, + get_device_num_func, set_device_num_func, target_data members. + Change create_thread_data_func argument to device number instead of + generic pointer. + * oacc-async.c (assert.h): Include. + (acc_async_test, acc_async_test_all, acc_wait, acc_wait_async) + (acc_wait_all, acc_wait_all_async): Use current host thread's + active device, not base_dev. + * oacc-cuda.c (acc_get_current_cuda_device) + (acc_get_current_cuda_context, acc_get_cuda_stream) + (acc_set_cuda_stream): Likewise. + * oacc-host.c (host_dispatch): Don't set open_device_func, + close_device_func, get_device_num_func or set_device_num_func. + * oacc-init.c (base_dev, init_key): Remove. + (cached_base_dev): New. + (name_of_acc_device_t): New. + (acc_init_1): Initialise default-numbered device, not zeroth. + (acc_shutdown_1): Close all devices of a given type. + (goacc_destroy_thread): Don't use base_dev. + (lazy_open, lazy_init, lazy_init_and_open): Remove. + (goacc_attach_host_thread_to_device): New. + (acc_init): Reimplement with goacc_attach_host_thread_to_device. + (acc_get_num_devices): Don't use base_dev. + (acc_set_device_type): Reimplement. + (acc_get_device_type): Don't use base_dev. + (acc_get_device_num): Tweak logic. + (acc_set_device_num): Likewise. + (acc_on_device): Use acc_get_device_type. 
+ (goacc_runtime_initialize): Initialize cached_base_dev not base_dev. + (goacc_lazy_initialize): Reimplement with acc_init and + goacc_attach_host_thread_to_device. + * oacc-int.h (goacc_thread): Add base_dev field. + (base_dev): Remove extern declaration. + (goacc_attach_host_thread_to_device): Add prototype. + * oacc-mem.c (acc_malloc): Use current thread's device instead of + base_dev. + (acc_free): Likewise. + (acc_memcpy_to_device): Likewise. + (acc_memcpy_from_device): Likewise. + * oacc-parallel.c (select_acc_device): Remove. Replace calls with + goacc_lazy_initialize (throughout). + (GOACC_parallel): Use tgt_offset to locate target functions. + * target.c (gomp_map_vars): Don't set tgt->mem_map. + (gomp_unmap_vars): Use devicep->mem_map pointer not tgt->mem_map. + (gomp_load_plugin_for_device): Remove open_device, close_device, + get_device_num, set_device_num openacc hook initialisation. Don't set + openacc.target_data. + * plugin/plugin-host.c (GOMP_OFFLOAD_openacc_open_device) + (GOMP_OFFLOAD_openacc_close_device) + (GOMP_OFFLOAD_openacc_get_device_num) + (GOMP_OFFLOAD_openacc_set_device_num): Remove. + (GOMP_OFFLOAD_openacc_create_thread_data): Change (unused) argument + to int. + * plugin/plugin-nvptx.c (ptx_inited): Remove. + (instantiated_devices, ptx_dev_lock): New. + (struct ptx_image_data): New. + (ptx_devices, ptx_images, ptx_image_lock): New. + (fini_streams_for_device): Reorder cuStreamDestroy call. + (nvptx_get_num_devices): Remove forward declaration. + (nvptx_init): Change return type to bool. + (nvptx_fini): Remove. + (nvptx_attach_host_thread_to_device): New. + (nvptx_open_device): Return struct ptx_device* instead of void*. + (nvptx_close_device): Change argument type to struct ptx_device*, + return type to void. + (nvptx_get_num_devices): Use instantiated_devices not ptx_inited. + (kernel_target_data, kernel_host_table): Remove static globals. + (GOMP_OFFLOAD_register_image, GOMP_OFFLOAD_get_table): Remove. 
+ (GOMP_OFFLOAD_init_device): Reimplement. + (GOMP_OFFLOAD_fini_device): Likewise. + (GOMP_OFFLOAD_load_image, GOMP_OFFLOAD_unload_image): New. + (GOMP_OFFLOAD_alloc, GOMP_OFFLOAD_free, GOMP_OFFLOAD_dev2host) + (GOMP_OFFLOAD_host2dev): Use ORD argument. + (GOMP_OFFLOAD_openacc_open_device) + (GOMP_OFFLOAD_openacc_close_device) + (GOMP_OFFLOAD_openacc_set_device_num) + (GOMP_OFFLOAD_openacc_get_device_num): Remove. + (GOMP_OFFLOAD_openacc_create_thread_data): Change argument to int + (device number). + + testsuite/ + * libgomp.oacc-c-c++-common/lib-9.c: Fix devnum check in test. + 2015-04-06 Ilya Verbin * libgomp-plugin.h (struct mapping_table): Replace with addr_pair. diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index a1d42c58d26..5272f0154b7 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -655,9 +655,6 @@ struct target_mem_desc { /* Corresponding target device descriptor. */ struct gomp_device_descr *device_descr; - /* Memory mapping info for the thread that created this descriptor. */ - struct splay_tree_s *mem_map; - /* List of splay keys to remove (or decrease refcount) at the end of region. */ splay_tree_key list[]; @@ -691,18 +688,6 @@ typedef struct acc_dispatch_t /* This is guarded by the lock in the "outer" struct gomp_device_descr. */ struct target_mem_desc *data_environ; - /* Extra information required for a device instance by a given target. */ - /* This is guarded by the lock in the "outer" struct gomp_device_descr. */ - void *target_data; - - /* Open or close a device instance. */ - void *(*open_device_func) (int n); - int (*close_device_func) (void *h); - - /* Set or get the device number. */ - int (*get_device_num_func) (void); - void (*set_device_num_func) (int); - /* Execute. */ void (*exec_func) (void (*) (void *), size_t, void **, void **, size_t *, unsigned short *, int, int, int, int, void *); @@ -720,7 +705,7 @@ typedef struct acc_dispatch_t void (*async_set_async_func) (int); /* Create/destroy TLS data. 
*/ - void *(*create_thread_data_func) (void *); + void *(*create_thread_data_func) (int); void (*destroy_thread_data_func) (void *); /* NVIDIA target specific routines. */ diff --git a/libgomp/oacc-async.c b/libgomp/oacc-async.c index 08b7c5e1945..1f5827e79f6 100644 --- a/libgomp/oacc-async.c +++ b/libgomp/oacc-async.c @@ -26,7 +26,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see . */ - +#include #include "openacc.h" #include "libgomp.h" #include "oacc-int.h" @@ -37,13 +37,23 @@ acc_async_test (int async) if (async < acc_async_sync) gomp_fatal ("invalid async argument: %d", async); - return base_dev->openacc.async_test_func (async); + struct goacc_thread *thr = goacc_thread (); + + if (!thr || !thr->dev) + gomp_fatal ("no device active"); + + return thr->dev->openacc.async_test_func (async); } int acc_async_test_all (void) { - return base_dev->openacc.async_test_all_func (); + struct goacc_thread *thr = goacc_thread (); + + if (!thr || !thr->dev) + gomp_fatal ("no device active"); + + return thr->dev->openacc.async_test_all_func (); } void @@ -52,19 +62,34 @@ acc_wait (int async) if (async < acc_async_sync) gomp_fatal ("invalid async argument: %d", async); - base_dev->openacc.async_wait_func (async); + struct goacc_thread *thr = goacc_thread (); + + if (!thr || !thr->dev) + gomp_fatal ("no device active"); + + thr->dev->openacc.async_wait_func (async); } void acc_wait_async (int async1, int async2) { - base_dev->openacc.async_wait_async_func (async1, async2); + struct goacc_thread *thr = goacc_thread (); + + if (!thr || !thr->dev) + gomp_fatal ("no device active"); + + thr->dev->openacc.async_wait_async_func (async1, async2); } void acc_wait_all (void) { - base_dev->openacc.async_wait_all_func (); + struct goacc_thread *thr = goacc_thread (); + + if (!thr || !thr->dev) + gomp_fatal ("no device active"); + + thr->dev->openacc.async_wait_all_func (); } void @@ -73,5 +98,10 @@ acc_wait_all_async (int async) if (async < acc_async_sync) 
gomp_fatal ("invalid async argument: %d", async); - base_dev->openacc.async_wait_all_async_func (async); + struct goacc_thread *thr = goacc_thread (); + + if (!thr || !thr->dev) + gomp_fatal ("no device active"); + + thr->dev->openacc.async_wait_all_async_func (async); } diff --git a/libgomp/oacc-cuda.c b/libgomp/oacc-cuda.c index c8ef376e3a2..4aab4221a42 100644 --- a/libgomp/oacc-cuda.c +++ b/libgomp/oacc-cuda.c @@ -34,51 +34,53 @@ void * acc_get_current_cuda_device (void) { - void *p = NULL; + struct goacc_thread *thr = goacc_thread (); - if (base_dev && base_dev->openacc.cuda.get_current_device_func) - p = base_dev->openacc.cuda.get_current_device_func (); + if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func) + return thr->dev->openacc.cuda.get_current_device_func (); - return p; + return NULL; } void * acc_get_current_cuda_context (void) { - void *p = NULL; + struct goacc_thread *thr = goacc_thread (); - if (base_dev && base_dev->openacc.cuda.get_current_context_func) - p = base_dev->openacc.cuda.get_current_context_func (); - - return p; + if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func) + return thr->dev->openacc.cuda.get_current_context_func (); + + return NULL; } void * acc_get_cuda_stream (int async) { - void *p = NULL; + struct goacc_thread *thr = goacc_thread (); if (async < 0) - return p; - - if (base_dev && base_dev->openacc.cuda.get_stream_func) - p = base_dev->openacc.cuda.get_stream_func (async); + return NULL; - return p; + if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func) + return thr->dev->openacc.cuda.get_stream_func (async); + + return NULL; } int acc_set_cuda_stream (int async, void *stream) { - int s = -1; + struct goacc_thread *thr; if (async < 0 || stream == NULL) return 0; goacc_lazy_initialize (); - if (base_dev && base_dev->openacc.cuda.set_stream_func) - s = base_dev->openacc.cuda.set_stream_func (async, stream); + thr = goacc_thread (); + + if (thr && thr->dev && 
thr->dev->openacc.cuda.set_stream_func) + return thr->dev->openacc.cuda.set_stream_func (async, stream); - return s; + return -1; } diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c index e4756b67a77..6dcdbf3658e 100644 --- a/libgomp/oacc-host.c +++ b/libgomp/oacc-host.c @@ -53,16 +53,9 @@ static struct gomp_device_descr host_dispatch = .host2dev_func = GOMP_OFFLOAD_host2dev, .run_func = GOMP_OFFLOAD_run, - .mem_map.root = NULL, .is_initialized = false, .openacc = { - .open_device_func = GOMP_OFFLOAD_openacc_open_device, - .close_device_func = GOMP_OFFLOAD_openacc_close_device, - - .get_device_num_func = GOMP_OFFLOAD_openacc_get_device_num, - .set_device_num_func = GOMP_OFFLOAD_openacc_set_device_num, - .exec_func = GOMP_OFFLOAD_openacc_parallel, .register_async_cleanup_func diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c index 1e0243ede44..dc40fb6ffe1 100644 --- a/libgomp/oacc-init.c +++ b/libgomp/oacc-init.c @@ -37,14 +37,13 @@ static gomp_mutex_t acc_device_lock; -/* The dispatch table for the current accelerator device. This is global, so - you can only have one type of device open at any given time in a program. - This is the "base" device in that several devices that use the same - dispatch table may be active concurrently: this one (the "zeroth") is used - for overall initialisation/shutdown, and other instances -- not necessarily - including this one -- may be opened and closed once the base device has - been initialized. */ -struct gomp_device_descr *base_dev; +/* A cached version of the dispatcher for the global "current" accelerator type, + e.g. used as the default when creating new host threads. This is the + device-type equivalent of goacc_device_num (which specifies which device to + use out of potentially several of the same type). If there are several + devices of a given type, this points at the first one. 
*/ + +static struct gomp_device_descr *cached_base_dev = NULL; #if defined HAVE_TLS || defined USE_EMUTLS __thread struct goacc_thread *goacc_tls_data; @@ -53,9 +52,6 @@ pthread_key_t goacc_tls_key; #endif static pthread_key_t goacc_cleanup_key; -/* Current dispatcher, and how it was initialized */ -static acc_device_t init_key = _ACC_device_hwm; - static struct goacc_thread *goacc_threads; static gomp_mutex_t goacc_thread_lock; @@ -94,6 +90,21 @@ get_openacc_name (const char *name) return name; } +static const char * +name_of_acc_device_t (enum acc_device_t type) +{ + switch (type) + { + case acc_device_none: return "none"; + case acc_device_default: return "default"; + case acc_device_host: return "host"; + case acc_device_host_nonshm: return "host_nonshm"; + case acc_device_not_host: return "not_host"; + case acc_device_nvidia: return "nvidia"; + default: gomp_fatal ("unknown device type %u", (unsigned) type); + } +} + static struct gomp_device_descr * resolve_device (acc_device_t d) { @@ -159,22 +170,87 @@ resolve_device (acc_device_t d) static struct gomp_device_descr * acc_init_1 (acc_device_t d) { - struct gomp_device_descr *acc_dev; + struct gomp_device_descr *base_dev, *acc_dev; + int ndevs; - acc_dev = resolve_device (d); + base_dev = resolve_device (d); + + ndevs = base_dev->get_num_devices_func (); + + if (!base_dev || ndevs <= 0 || goacc_device_num >= ndevs) + gomp_fatal ("device %s not supported", name_of_acc_device_t (d)); - if (!acc_dev || acc_dev->get_num_devices_func () <= 0) - gomp_fatal ("device %u not supported", (unsigned)d); + acc_dev = &base_dev[goacc_device_num]; if (acc_dev->is_initialized) gomp_fatal ("device already active"); - /* We need to remember what we were intialized as, to check shutdown etc. 
*/ - init_key = d; - gomp_init_device (acc_dev); - return acc_dev; + return base_dev; +} + +static void +acc_shutdown_1 (acc_device_t d) +{ + struct gomp_device_descr *base_dev; + struct goacc_thread *walk; + int ndevs, i; + bool devices_active = false; + + /* Get the base device for this device type. */ + base_dev = resolve_device (d); + + if (!base_dev) + gomp_fatal ("device %s not supported", name_of_acc_device_t (d)); + + gomp_mutex_lock (&goacc_thread_lock); + + /* Free target-specific TLS data and close all devices. */ + for (walk = goacc_threads; walk != NULL; walk = walk->next) + { + if (walk->target_tls) + base_dev->openacc.destroy_thread_data_func (walk->target_tls); + + walk->target_tls = NULL; + + /* This would mean the user is shutting down OpenACC in the middle of an + "acc data" pragma. Likely not intentional. */ + if (walk->mapped_data) + gomp_fatal ("shutdown in 'acc data' region"); + + /* Similarly, if this happens then user code has done something weird. */ + if (walk->saved_bound_dev) + gomp_fatal ("shutdown during host fallback"); + + if (walk->dev) + { + gomp_mutex_lock (&walk->dev->lock); + gomp_free_memmap (&walk->dev->mem_map); + gomp_mutex_unlock (&walk->dev->lock); + + walk->dev = NULL; + walk->base_dev = NULL; + } + } + + gomp_mutex_unlock (&goacc_thread_lock); + + ndevs = base_dev->get_num_devices_func (); + + /* Close all the devices of this type that have been opened. 
*/ + for (i = 0; i < ndevs; i++) + { + struct gomp_device_descr *acc_dev = &base_dev[i]; + if (acc_dev->is_initialized) + { + devices_active = true; + gomp_fini_device (acc_dev); + } + } + + if (!devices_active) + gomp_fatal ("no device initialized"); } static struct goacc_thread * @@ -207,9 +283,11 @@ goacc_destroy_thread (void *data) if (thr) { - if (base_dev && thr->target_tls) + struct gomp_device_descr *acc_dev = thr->dev; + + if (acc_dev && thr->target_tls) { - base_dev->openacc.destroy_thread_data_func (thr->target_tls); + acc_dev->openacc.destroy_thread_data_func (thr->target_tls); thr->target_tls = NULL; } @@ -236,53 +314,49 @@ goacc_destroy_thread (void *data) gomp_mutex_unlock (&goacc_thread_lock); } -/* Open the ORD'th device of the currently-active type (base_dev must be - initialised before calling). If ORD is < 0, open the default-numbered - device (set by the ACC_DEVICE_NUM environment variable or a call to - acc_set_device_num), or leave any currently-opened device as is. "Opening" - consists of calling the device's open_device_func hook, and setting up - thread-local data (maybe allocating, then initializing with information - pertaining to the newly-opened or previously-opened device). */ +/* Use the ORD'th device instance for the current host thread (or -1 for the + current global default). The device (and the runtime) must be initialised + before calling this function. 
*/ -static void -lazy_open (int ord) +void +goacc_attach_host_thread_to_device (int ord) { struct goacc_thread *thr = goacc_thread (); - struct gomp_device_descr *acc_dev; - - if (thr && thr->dev) - { - assert (ord < 0 || ord == thr->dev->target_id); - return; - } - - assert (base_dev); - + struct gomp_device_descr *acc_dev = NULL, *base_dev = NULL; + int num_devices; + + if (thr && thr->dev && (thr->dev->target_id == ord || ord < 0)) + return; + if (ord < 0) ord = goacc_device_num; - - /* The OpenACC 2.0 spec leaves the runtime's behaviour when an out-of-range - device is requested as implementation-defined (4.2 ACC_DEVICE_NUM). - We choose to raise an error in such a case. */ - if (ord >= base_dev->get_num_devices_func ()) - gomp_fatal ("device %u does not exist", ord); - + + /* Decide which type of device to use. If the current thread has a device + type already (e.g. set by acc_set_device_type), use that, else use the + global default. */ + if (thr && thr->base_dev) + base_dev = thr->base_dev; + else + { + assert (cached_base_dev); + base_dev = cached_base_dev; + } + + num_devices = base_dev->get_num_devices_func (); + if (num_devices <= 0 || ord >= num_devices) + gomp_fatal ("device %u out of range", ord); + if (!thr) thr = goacc_new_thread (); - - acc_dev = thr->dev = &base_dev[ord]; - - assert (acc_dev->target_id == ord); - + + thr->base_dev = base_dev; + thr->dev = acc_dev = &base_dev[ord]; thr->saved_bound_dev = NULL; thr->mapped_data = NULL; - - if (!acc_dev->openacc.target_data) - acc_dev->openacc.target_data = acc_dev->openacc.open_device_func (ord); - + thr->target_tls - = acc_dev->openacc.create_thread_data_func (acc_dev->openacc.target_data); - + = acc_dev->openacc.create_thread_data_func (ord); + acc_dev->openacc.async_set_async_func (acc_async_sync); } @@ -292,74 +366,20 @@ lazy_open (int ord) void acc_init (acc_device_t d) { - if (!base_dev) + if (!cached_base_dev) gomp_init_targets_once (); gomp_mutex_lock (&acc_device_lock); - base_dev = 
acc_init_1 (d); - - lazy_open (-1); + cached_base_dev = acc_init_1 (d); gomp_mutex_unlock (&acc_device_lock); + + goacc_attach_host_thread_to_device (-1); } ialias (acc_init) -static void -acc_shutdown_1 (acc_device_t d) -{ - struct goacc_thread *walk; - - /* We don't check whether d matches the actual device found, because - OpenACC 2.0 (3.2.12) says the parameters to the init and this - call must match (for the shutdown call anyway, it's silent on - others). */ - - if (!base_dev) - gomp_fatal ("no device initialized"); - if (d != init_key) - gomp_fatal ("device %u(%u) is initialized", - (unsigned) init_key, (unsigned) base_dev->type); - - gomp_mutex_lock (&goacc_thread_lock); - - /* Free target-specific TLS data and close all devices. */ - for (walk = goacc_threads; walk != NULL; walk = walk->next) - { - if (walk->target_tls) - base_dev->openacc.destroy_thread_data_func (walk->target_tls); - - walk->target_tls = NULL; - - /* This would mean the user is shutting down OpenACC in the middle of an - "acc data" pragma. Likely not intentional. */ - if (walk->mapped_data) - gomp_fatal ("shutdown in 'acc data' region"); - - if (walk->dev) - { - void *target_data = walk->dev->openacc.target_data; - if (walk->dev->openacc.close_device_func (target_data) < 0) - gomp_fatal ("failed to close device"); - - walk->dev->openacc.target_data = target_data = NULL; - - gomp_mutex_lock (&walk->dev->lock); - gomp_free_memmap (&walk->dev->mem_map); - gomp_mutex_unlock (&walk->dev->lock); - - walk->dev = NULL; - } - } - - gomp_mutex_unlock (&goacc_thread_lock); - - gomp_fini_device (base_dev); - - base_dev = NULL; -} - void acc_shutdown (acc_device_t d) { @@ -372,59 +392,16 @@ acc_shutdown (acc_device_t d) ialias (acc_shutdown) -/* This function is called after plugins have been initialized. It deals with - the "base" device, and is used to prepare the runtime for dealing with a - number of such devices (as implemented by some particular plugin). 
If the - argument device type D matches a previous call to the function, return the - current base device, else shut the old device down and re-initialize with - the new device type. */ - -static struct gomp_device_descr * -lazy_init (acc_device_t d) -{ - if (base_dev) - { - /* Re-initializing the same device, do nothing. */ - if (d == init_key) - return base_dev; - - acc_shutdown_1 (init_key); - } - - assert (!base_dev); - - return acc_init_1 (d); -} - -/* Ensure that plugins are loaded, initialize and open the (default-numbered) - device. */ - -static void -lazy_init_and_open (acc_device_t d) -{ - if (!base_dev) - gomp_init_targets_once (); - - gomp_mutex_lock (&acc_device_lock); - - base_dev = lazy_init (d); - - lazy_open (-1); - - gomp_mutex_unlock (&acc_device_lock); -} - int acc_get_num_devices (acc_device_t d) { int n = 0; - const struct gomp_device_descr *acc_dev; + struct gomp_device_descr *acc_dev; if (d == acc_device_none) return 0; - if (!base_dev) - gomp_init_targets_once (); + gomp_init_targets_once (); acc_dev = resolve_device (d); if (!acc_dev) @@ -439,10 +416,39 @@ acc_get_num_devices (acc_device_t d) ialias (acc_get_num_devices) +/* Set the device type for the current thread only (using the current global + default device number), initialising that device if necessary. Also set the + default device type for new threads to D. */ + void acc_set_device_type (acc_device_t d) { - lazy_init_and_open (d); + struct gomp_device_descr *base_dev, *acc_dev; + struct goacc_thread *thr = goacc_thread (); + + gomp_mutex_lock (&acc_device_lock); + + if (!cached_base_dev) + gomp_init_targets_once (); + + cached_base_dev = base_dev = resolve_device (d); + acc_dev = &base_dev[goacc_device_num]; + + if (!acc_dev->is_initialized) + gomp_init_device (acc_dev); + + gomp_mutex_unlock (&acc_device_lock); + + /* We're changing device type: invalidate the current thread's dev and + base_dev pointers. 
*/ + if (thr && thr->base_dev != base_dev) + { + thr->base_dev = thr->dev = NULL; + if (thr->mapped_data) + gomp_fatal ("acc_set_device_type in 'acc data' region"); + } + + goacc_attach_host_thread_to_device (-1); } ialias (acc_set_device_type) @@ -451,10 +457,11 @@ acc_device_t acc_get_device_type (void) { acc_device_t res = acc_device_none; - const struct gomp_device_descr *dev; + struct gomp_device_descr *dev; + struct goacc_thread *thr = goacc_thread (); - if (base_dev) - res = acc_device_type (base_dev->type); + if (thr && thr->base_dev) + res = acc_device_type (thr->base_dev->type); else { gomp_init_targets_once (); @@ -475,78 +482,65 @@ int acc_get_device_num (acc_device_t d) { const struct gomp_device_descr *dev; - int num; + struct goacc_thread *thr = goacc_thread (); if (d >= _ACC_device_hwm) gomp_fatal ("device %u out of range", (unsigned)d); - if (!base_dev) + if (!cached_base_dev) gomp_init_targets_once (); dev = resolve_device (d); if (!dev) - gomp_fatal ("no devices of type %u", d); + gomp_fatal ("device %s not supported", name_of_acc_device_t (d)); - /* We might not have called lazy_open for this host thread yet, in which case - the get_device_num_func hook will return -1. */ - num = dev->openacc.get_device_num_func (); - if (num < 0) - num = goacc_device_num; + if (thr && thr->base_dev == dev && thr->dev) + return thr->dev->target_id; - return num; + return goacc_device_num; } ialias (acc_get_device_num) void -acc_set_device_num (int n, acc_device_t d) +acc_set_device_num (int ord, acc_device_t d) { - const struct gomp_device_descr *dev; + struct gomp_device_descr *base_dev, *acc_dev; int num_devices; - if (!base_dev) + if (!cached_base_dev) gomp_init_targets_once (); - if ((int) d == 0) - { - int i; - - /* A device setting of zero sets all device types on the system to use - the Nth instance of that device type. Only attempt it for initialized - devices though. 
*/ - for (i = acc_device_not_host + 1; i < _ACC_device_hwm; i++) - { - dev = resolve_device (d); - if (dev && dev->is_initialized) - dev->openacc.set_device_num_func (n); - } + if (ord < 0) + ord = goacc_device_num; - /* ...and for future calls to acc_init/acc_set_device_type, etc. */ - goacc_device_num = n; - } + if ((int) d == 0) + /* Set whatever device is being used by the current host thread to use + device instance ORD. It's unclear if this is supposed to affect other + host threads too (OpenACC 2.0 (3.2.4) acc_set_device_num). */ + goacc_attach_host_thread_to_device (ord); else { - struct goacc_thread *thr = goacc_thread (); - gomp_mutex_lock (&acc_device_lock); - base_dev = lazy_init (d); + cached_base_dev = base_dev = resolve_device (d); num_devices = base_dev->get_num_devices_func (); - if (n >= num_devices) - gomp_fatal ("device %u out of range", n); + if (ord >= num_devices) + gomp_fatal ("device %u out of range", ord); - /* If we're changing the device number, de-associate this thread with - the device (but don't close the device, since it may be in use by - other threads). */ - if (thr && thr->dev && n != thr->dev->target_id) - thr->dev = NULL; + acc_dev = &base_dev[ord]; - lazy_open (n); + if (!acc_dev->is_initialized) + gomp_init_device (acc_dev); gomp_mutex_unlock (&acc_device_lock); + + goacc_attach_host_thread_to_device (ord); } + + goacc_device_num = ord; } ialias (acc_set_device_num) @@ -554,10 +548,7 @@ ialias (acc_set_device_num) int acc_on_device (acc_device_t dev) { - struct goacc_thread *thr = goacc_thread (); - - if (thr && thr->dev - && acc_device_type (thr->dev->type) == acc_device_host_nonshm) + if (acc_get_device_type () == acc_device_host_nonshm) return dev == acc_device_host_nonshm || dev == acc_device_not_host; /* Just rely on the compiler builtin. 
*/ @@ -577,7 +568,7 @@ goacc_runtime_initialize (void) pthread_key_create (&goacc_cleanup_key, goacc_destroy_thread); - base_dev = NULL; + cached_base_dev = NULL; goacc_threads = NULL; gomp_mutex_init (&goacc_thread_lock); @@ -606,9 +597,8 @@ goacc_restore_bind (void) } /* This is called from any OpenACC support function that may need to implicitly - initialize the libgomp runtime. On exit all such initialization will have - been done, and both the global ACC_dev and the per-host-thread ACC_memmap - pointers will be valid. */ + initialize the libgomp runtime, either globally or from a new host thread. + On exit "goacc_thread" will return a valid & populated thread block. */ attribute_hidden void goacc_lazy_initialize (void) @@ -618,12 +608,8 @@ goacc_lazy_initialize (void) if (thr && thr->dev) return; - if (!base_dev) - lazy_init_and_open (acc_device_default); + if (!cached_base_dev) + acc_init (acc_device_default); else - { - gomp_mutex_lock (&acc_device_lock); - lazy_open (-1); - gomp_mutex_unlock (&acc_device_lock); - } + goacc_attach_host_thread_to_device (-1); } diff --git a/libgomp/oacc-int.h b/libgomp/oacc-int.h index 85619c8d10a..0ace737884a 100644 --- a/libgomp/oacc-int.h +++ b/libgomp/oacc-int.h @@ -56,6 +56,9 @@ acc_device_type (enum offload_target_type type) struct goacc_thread { + /* The base device for the current thread. */ + struct gomp_device_descr *base_dev; + /* The device for the current thread. */ struct gomp_device_descr *dev; @@ -89,10 +92,7 @@ goacc_thread (void) #endif void goacc_register (struct gomp_device_descr *) __GOACC_NOTHROW; - -/* Current dispatcher. 
*/ -extern struct gomp_device_descr *base_dev; - +void goacc_attach_host_thread_to_device (int); void goacc_runtime_initialize (void); void goacc_save_and_set_bind (acc_device_t); void goacc_restore_bind (void); diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c index fdc82e654f9..89ef5fcd887 100644 --- a/libgomp/oacc-mem.c +++ b/libgomp/oacc-mem.c @@ -107,7 +107,9 @@ acc_malloc (size_t s) struct goacc_thread *thr = goacc_thread (); - return base_dev->alloc_func (thr->dev->target_id, s); + assert (thr->dev); + + return thr->dev->alloc_func (thr->dev->target_id, s); } /* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event @@ -122,6 +124,8 @@ acc_free (void *d) if (!d) return; + assert (thr && thr->dev); + /* We don't have to call lazy open here, as the ptr value must have been returned by acc_malloc. It's not permitted to pass NULL in (unless you got that null from acc_malloc). */ @@ -134,7 +138,7 @@ acc_free (void *d) acc_unmap_data ((void *)(k->host_start + offset)); } - base_dev->free_func (thr->dev->target_id, d); + thr->dev->free_func (thr->dev->target_id, d); } void @@ -144,7 +148,9 @@ acc_memcpy_to_device (void *d, void *h, size_t s) been obtained from a routine that did that. */ struct goacc_thread *thr = goacc_thread (); - base_dev->host2dev_func (thr->dev->target_id, d, h, s); + assert (thr && thr->dev); + + thr->dev->host2dev_func (thr->dev->target_id, d, h, s); } void @@ -154,7 +160,9 @@ acc_memcpy_from_device (void *h, void *d, size_t s) been obtained from a routine that did that. */ struct goacc_thread *thr = goacc_thread (); - base_dev->dev2host_func (thr->dev->target_id, h, d, s); + assert (thr && thr->dev); + + thr->dev->dev2host_func (thr->dev->target_id, h, d, s); } /* Return the device pointer that corresponds to host data H. 
Or NULL diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c index 563f9bb5b4b..d8999463d6d 100644 --- a/libgomp/oacc-parallel.c +++ b/libgomp/oacc-parallel.c @@ -49,32 +49,6 @@ find_pset (int pos, size_t mapnum, unsigned short *kinds) return kind == GOMP_MAP_TO_PSET; } - -/* Ensure that the target device for DEVICE_TYPE is initialised (and that - plugins have been loaded if appropriate). The ACC_dev variable for the - current thread will be set appropriately for the given device type on - return. */ - -attribute_hidden void -select_acc_device (int device_type) -{ - goacc_lazy_initialize (); - - if (device_type == GOMP_DEVICE_HOST_FALLBACK) - return; - - if (device_type == acc_device_none) - device_type = acc_device_host; - - if (device_type >= 0) - { - /* NOTE: this will go badly if the surrounding data environment is set up - to use a different device type. We'll just have to trust that users - know what they're doing... */ - acc_set_device_type (device_type); - } -} - static void goacc_wait (int async, int num_waits, va_list ap); void @@ -111,7 +85,7 @@ GOACC_parallel (int device, void (*fn) (void *), __FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds, async); #endif - select_acc_device (device); + goacc_lazy_initialize (); thr = goacc_thread (); acc_dev = thr->dev; @@ -151,7 +125,7 @@ GOACC_parallel (int device, void (*fn) (void *), if (tgt_fn_key == NULL) gomp_fatal ("target function wasn't mapped"); - tgt_fn = (void (*)) tgt_fn_key->tgt->tgt_start; + tgt_fn = (void (*)) tgt_fn_key->tgt_offset; } else tgt_fn = (void (*)) fn; @@ -195,7 +169,7 @@ GOACC_data_start (int device, size_t mapnum, __FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds); #endif - select_acc_device (device); + goacc_lazy_initialize (); struct goacc_thread *thr = goacc_thread (); struct gomp_device_descr *acc_dev = thr->dev; @@ -242,7 +216,7 @@ GOACC_enter_exit_data (int device, size_t mapnum, bool data_enter = false; size_t i; - select_acc_device (device); + 
goacc_lazy_initialize (); thr = goacc_thread (); acc_dev = thr->dev; @@ -429,7 +403,7 @@ GOACC_update (int device, size_t mapnum, bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; size_t i; - select_acc_device (device); + goacc_lazy_initialize (); struct goacc_thread *thr = goacc_thread (); struct gomp_device_descr *acc_dev = thr->dev; diff --git a/libgomp/plugin/plugin-host.c b/libgomp/plugin/plugin-host.c index bc60f72d05e..1faf5bc194e 100644 --- a/libgomp/plugin/plugin-host.c +++ b/libgomp/plugin/plugin-host.c @@ -118,31 +118,6 @@ GOMP_OFFLOAD_unload_image (int n __attribute__ ((unused)), { } -STATIC void * -GOMP_OFFLOAD_openacc_open_device (int n) -{ - return (void *) (intptr_t) n; -} - -STATIC int -GOMP_OFFLOAD_openacc_close_device (void *hnd) -{ - return 0; -} - -STATIC int -GOMP_OFFLOAD_openacc_get_device_num (void) -{ - return 0; -} - -STATIC void -GOMP_OFFLOAD_openacc_set_device_num (int n) -{ - if (n > 0) - GOMP (fatal) ("device number %u out of range for host execution", n); -} - STATIC void * GOMP_OFFLOAD_alloc (int n __attribute__ ((unused)), size_t s) { @@ -254,7 +229,7 @@ GOMP_OFFLOAD_openacc_async_wait_all_async (int async __attribute__ ((unused))) } STATIC void * -GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data +GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__ ((unused))) { return NULL; diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 483cb7559e8..583ec87aeee 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -133,7 +133,8 @@ struct targ_fn_descriptor const char *name; }; -static bool ptx_inited = false; +static unsigned int instantiated_devices = 0; +static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; struct ptx_stream { @@ -331,9 +332,21 @@ struct ptx_event struct ptx_event *next; }; +struct ptx_image_data +{ + void *target_data; + CUmodule module; + struct ptx_image_data *next; +}; + static pthread_mutex_t ptx_event_lock; static struct 
ptx_event *ptx_events; +static struct ptx_device **ptx_devices; + +static struct ptx_image_data *ptx_images = NULL; +static pthread_mutex_t ptx_image_lock = PTHREAD_MUTEX_INITIALIZER; + #define _XSTR(s) _STR(s) #define _STR(s) #s @@ -450,8 +463,8 @@ fini_streams_for_device (struct ptx_device *ptx_dev) struct ptx_stream *s = ptx_dev->active_streams; ptx_dev->active_streams = ptx_dev->active_streams->next; - cuStreamDestroy (s->stream); map_fini (s); + cuStreamDestroy (s->stream); free (s); } @@ -575,21 +588,21 @@ select_stream_for_async (int async, pthread_t thread, bool create, return stream; } -static int nvptx_get_num_devices (void); - -/* Initialize the device. */ -static int +/* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK + should be locked on entry and remains locked on exit. */ +static bool nvptx_init (void) { CUresult r; int rc; + int ndevs; - if (ptx_inited) - return nvptx_get_num_devices (); + if (instantiated_devices != 0) + return true; rc = verify_device_library (); if (rc < 0) - return -1; + return false; r = cuInit (0); if (r != CUDA_SUCCESS) @@ -599,22 +612,64 @@ nvptx_init (void) pthread_mutex_init (&ptx_event_lock, NULL); - ptx_inited = true; + r = cuDeviceGetCount (&ndevs); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r)); - return nvptx_get_num_devices (); + ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *) + * ndevs); + + return true; } +/* Select the N'th PTX device for the current host thread. The device must + have been previously opened before calling this function. 
*/ + static void -nvptx_fini (void) +nvptx_attach_host_thread_to_device (int n) { - ptx_inited = false; + CUdevice dev; + CUresult r; + struct ptx_device *ptx_dev; + CUcontext thd_ctx; + + r = cuCtxGetDevice (&dev); + if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) + GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r)); + + if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n) + return; + else + { + CUcontext old_ctx; + + ptx_dev = ptx_devices[n]; + assert (ptx_dev); + + r = cuCtxGetCurrent (&thd_ctx); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r)); + + /* We don't necessarily have a current context (e.g. if it has been + destroyed. Pop it if we do though. */ + if (thd_ctx != NULL) + { + r = cuCtxPopCurrent (&old_ctx); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r)); + } + + r = cuCtxPushCurrent (ptx_dev->ctx); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r)); + } } -static void * +static struct ptx_device * nvptx_open_device (int n) { struct ptx_device *ptx_dev; - CUdevice dev; + CUdevice dev, ctx_dev; CUresult r; int async_engines, pi; @@ -628,6 +683,21 @@ nvptx_open_device (int n) ptx_dev->dev = dev; ptx_dev->ctx_shared = false; + r = cuCtxGetDevice (&ctx_dev); + if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) + GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r)); + + if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev) + { + /* The current host thread has an active context for a different device. + Detach it. 
*/ + CUcontext old_ctx; + + r = cuCtxPopCurrent (&old_ctx); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r)); + } + r = cuCtxGetCurrent (&ptx_dev->ctx); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r)); @@ -678,17 +748,16 @@ nvptx_open_device (int n) init_streams_for_device (ptx_dev, async_engines); - return (void *) ptx_dev; + return ptx_dev; } -static int -nvptx_close_device (void *targ_data) +static void +nvptx_close_device (struct ptx_device *ptx_dev) { CUresult r; - struct ptx_device *ptx_dev = targ_data; if (!ptx_dev) - return 0; + return; fini_streams_for_device (ptx_dev); @@ -700,8 +769,6 @@ nvptx_close_device (void *targ_data) } free (ptx_dev); - - return 0; } static int @@ -714,7 +781,7 @@ nvptx_get_num_devices (void) order to enumerate available devices, but CUDA API routines can't be used until cuInit has been called. Just call it now (but don't yet do any further initialization). */ - if (!ptx_inited) + if (instantiated_devices == 0) cuInit (0); r = cuDeviceGetCount (&n); @@ -1507,64 +1574,84 @@ GOMP_OFFLOAD_get_num_devices (void) return nvptx_get_num_devices (); } -static void **kernel_target_data; -static void **kernel_host_table; - void -GOMP_OFFLOAD_register_image (void *host_table, void *target_data) +GOMP_OFFLOAD_init_device (int n) { - kernel_target_data = target_data; - kernel_host_table = host_table; -} + pthread_mutex_lock (&ptx_dev_lock); -void -GOMP_OFFLOAD_init_device (int n __attribute__ ((unused))) -{ - (void) nvptx_init (); + if (!nvptx_init () || ptx_devices[n] != NULL) + { + pthread_mutex_unlock (&ptx_dev_lock); + return; + } + + ptx_devices[n] = nvptx_open_device (n); + instantiated_devices++; + + pthread_mutex_unlock (&ptx_dev_lock); } void -GOMP_OFFLOAD_fini_device (int n __attribute__ ((unused))) +GOMP_OFFLOAD_fini_device (int n) { - nvptx_fini (); + pthread_mutex_lock (&ptx_dev_lock); + + if (ptx_devices[n] != NULL) + { + 
nvptx_attach_host_thread_to_device (n); + nvptx_close_device (ptx_devices[n]); + ptx_devices[n] = NULL; + instantiated_devices--; + } + + pthread_mutex_unlock (&ptx_dev_lock); } int -GOMP_OFFLOAD_get_table (int n __attribute__ ((unused)), - struct mapping_table **tablep) +GOMP_OFFLOAD_load_image (int ord, void *target_data, + struct addr_pair **target_table) { CUmodule module; - void **fn_table; - char **fn_names; - int fn_entries, i; + char **fn_names, **var_names; + unsigned int fn_entries, var_entries, i, j; CUresult r; struct targ_fn_descriptor *targ_fns; + void **img_header = (void **) target_data; + struct ptx_image_data *new_image; - if (nvptx_init () <= 0) - return 0; + GOMP_OFFLOAD_init_device (ord); - /* This isn't an error, because an image may legitimately have no offloaded - regions and so will not call GOMP_offload_register. */ - if (kernel_target_data == NULL) - return 0; + nvptx_attach_host_thread_to_device (ord); + + link_ptx (&module, img_header[0]); - link_ptx (&module, kernel_target_data[0]); + pthread_mutex_lock (&ptx_image_lock); + new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data)); + new_image->target_data = target_data; + new_image->module = module; + new_image->next = ptx_images; + ptx_images = new_image; + pthread_mutex_unlock (&ptx_image_lock); - /* kernel_target_data[0] -> ptx code - kernel_target_data[1] -> variable mappings - kernel_target_data[2] -> array of kernel names in ascii + /* The mkoffload utility emits a table of pointers/integers at the start of + each offload image: - kernel_host_table[0] -> start of function addresses (__offload_func_table) - kernel_host_table[1] -> end of function addresses (__offload_funcs_end) + img_header[0] -> ptx code + img_header[1] -> number of variables + img_header[2] -> array of variable names (pointers to strings) + img_header[3] -> number of kernels + img_header[4] -> array of kernel names (pointers to strings) The array of kernel names and the functions addresses form a 
one-to-one correspondence. */ - fn_table = kernel_host_table[0]; - fn_names = (char **) kernel_target_data[2]; - fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *); + var_entries = (uintptr_t) img_header[1]; + var_names = (char **) img_header[2]; + fn_entries = (uintptr_t) img_header[3]; + fn_names = (char **) img_header[4]; - *tablep = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * fn_entries); + *target_table = GOMP_PLUGIN_malloc (sizeof (struct addr_pair) + * (fn_entries + var_entries)); targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor) * fn_entries); @@ -1579,38 +1666,86 @@ GOMP_OFFLOAD_get_table (int n __attribute__ ((unused)), targ_fns[i].fn = function; targ_fns[i].name = (const char *) fn_names[i]; - (*tablep)[i].host_start = (uintptr_t) fn_table[i]; - (*tablep)[i].host_end = (*tablep)[i].host_start + 1; - (*tablep)[i].tgt_start = (uintptr_t) &targ_fns[i]; - (*tablep)[i].tgt_end = (*tablep)[i].tgt_start + 1; + (*target_table)[i].start = (uintptr_t) &targ_fns[i]; + (*target_table)[i].end = (*target_table)[i].start + 1; } - return fn_entries; + for (j = 0; j < var_entries; j++, i++) + { + CUdeviceptr var; + size_t bytes; + + r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r)); + + (*target_table)[i].start = (uintptr_t) var; + (*target_table)[i].end = (*target_table)[i].start + bytes; + } + + return i; +} + +void +GOMP_OFFLOAD_unload_image (int tid __attribute__((unused)), void *target_data) +{ + void **img_header = (void **) target_data; + struct targ_fn_descriptor *targ_fns + = (struct targ_fn_descriptor *) img_header[0]; + struct ptx_image_data *image, *prev = NULL, *newhd = NULL; + + free (targ_fns); + + pthread_mutex_lock (&ptx_image_lock); + for (image = ptx_images; image != NULL;) + { + struct ptx_image_data *next = image->next; + + if (image->target_data == target_data) + { + cuModuleUnload (image->module); 
+ free (image); + if (prev) + prev->next = next; + } + else + { + prev = image; + if (!newhd) + newhd = image; + } + + image = next; + } + ptx_images = newhd; + pthread_mutex_unlock (&ptx_image_lock); } void * -GOMP_OFFLOAD_alloc (int n __attribute__ ((unused)), size_t size) +GOMP_OFFLOAD_alloc (int ord, size_t size) { + nvptx_attach_host_thread_to_device (ord); return nvptx_alloc (size); } void -GOMP_OFFLOAD_free (int n __attribute__ ((unused)), void *ptr) +GOMP_OFFLOAD_free (int ord, void *ptr) { + nvptx_attach_host_thread_to_device (ord); nvptx_free (ptr); } void * -GOMP_OFFLOAD_dev2host (int ord __attribute__ ((unused)), void *dst, - const void *src, size_t n) +GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n) { + nvptx_attach_host_thread_to_device (ord); return nvptx_dev2host (dst, src, n); } void * -GOMP_OFFLOAD_host2dev (int ord __attribute__ ((unused)), void *dst, - const void *src, size_t n) +GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n) { + nvptx_attach_host_thread_to_device (ord); return nvptx_host2dev (dst, src, n); } @@ -1627,45 +1762,6 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum, num_workers, vector_length, async, targ_mem_desc); } -void * -GOMP_OFFLOAD_openacc_open_device (int n) -{ - return nvptx_open_device (n); -} - -int -GOMP_OFFLOAD_openacc_close_device (void *h) -{ - return nvptx_close_device (h); -} - -void -GOMP_OFFLOAD_openacc_set_device_num (int n) -{ - struct nvptx_thread *nvthd = nvptx_thread (); - - assert (n >= 0); - - if (!nvthd->ptx_dev || nvthd->ptx_dev->ord != n) - (void) nvptx_open_device (n); -} - -/* This can be called before the device is "opened" for the current thread, in - which case we can't tell which device number should be returned. We don't - actually want to open the device here, so just return -1 and let the caller - (oacc-init.c:acc_get_device_num) handle it. 
*/ - -int -GOMP_OFFLOAD_openacc_get_device_num (void) -{ - struct nvptx_thread *nvthd = nvptx_thread (); - - if (nvthd && nvthd->ptx_dev) - return nvthd->ptx_dev->ord; - else - return -1; -} - void GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc) { @@ -1729,14 +1825,18 @@ GOMP_OFFLOAD_openacc_async_set_async (int async) } void * -GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data) +GOMP_OFFLOAD_openacc_create_thread_data (int ord) { - struct ptx_device *ptx_dev = (struct ptx_device *) targ_data; + struct ptx_device *ptx_dev; struct nvptx_thread *nvthd = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread)); CUresult r; CUcontext thd_ctx; + ptx_dev = ptx_devices[ord]; + + assert (ptx_dev); + r = cuCtxGetCurrent (&thd_ctx); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r)); diff --git a/libgomp/target.c b/libgomp/target.c index dfe7fb9dbf9..d8da7833aa9 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -178,7 +178,6 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, tgt->list_count = mapnum; tgt->refcount = 1; tgt->device_descr = devicep; - tgt->mem_map = mem_map; if (mapnum == 0) return tgt; @@ -597,7 +596,7 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom) devicep->dev2host_func (devicep->target_id, (void *) k->host_start, (void *) (k->tgt->tgt_start + k->tgt_offset), k->host_end - k->host_start); - splay_tree_remove (tgt->mem_map, k); + splay_tree_remove (&devicep->mem_map, k); if (k->tgt->refcount > 1) k->tgt->refcount--; else @@ -1159,10 +1158,6 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device, { optional_present = optional_total = 0; DLSYM_OPT (openacc.exec, openacc_parallel); - DLSYM_OPT (openacc.open_device, openacc_open_device); - DLSYM_OPT (openacc.close_device, openacc_close_device); - DLSYM_OPT (openacc.get_device_num, openacc_get_device_num); - DLSYM_OPT (openacc.set_device_num, openacc_set_device_num); DLSYM_OPT (openacc.register_async_cleanup, 
openacc_register_async_cleanup); DLSYM_OPT (openacc.async_test, openacc_async_test); @@ -1271,7 +1266,6 @@ gomp_target_init (void) current_device.mem_map.root = NULL; current_device.is_initialized = false; current_device.openacc.data_environ = NULL; - current_device.openacc.target_data = NULL; for (i = 0; i < new_num_devices; i++) { current_device.target_id = i; diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-9.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-9.c index 84045dbe328..a4cf7f2e848 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-9.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-9.c @@ -58,7 +58,7 @@ main (int argc, char **argv) acc_set_device_num (1, (acc_device_t) 0); devnum = acc_get_device_num (devtype); - if (devnum != 0) + if (devnum != 1) abort (); } -- 2.30.2