From: Kwok Cheung Yeung Date: Tue, 14 Jul 2020 17:31:35 +0000 (-0700) Subject: libgomp: Fix hang when profiling OpenACC programs with CUDA 9.0 nvprof X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b52643ab9004ba8ecea06a399885fe1e04183eda;p=gcc.git libgomp: Fix hang when profiling OpenACC programs with CUDA 9.0 nvprof The version of nvprof in CUDA 9.0 causes a hang when used to profile an OpenACC program. This is because it calls acc_get_device_type from a callback called during device initialization, which then attempts to acquire acc_device_lock while it is already taken, resulting in deadlock. This works around the issue by returning acc_device_none from acc_get_device_type without attempting to acquire the lock when initialization has not completed yet. 2020-07-14 Tom de Vries Cesar Philippidis Thomas Schwinge Kwok Cheung Yeung libgomp/ * oacc-init.c (acc_init_state_lock, acc_init_state, acc_init_thread): New variable. (acc_init_1): Set acc_init_thread to pthread_self (). Set acc_init_state to initializing at the start, and to initialized at the end. (self_initializing_p): New function. (acc_get_device_type): Return acc_device_none if called by thread that is currently executing acc_init_1. * libgomp.texi (acc_get_device_type): Update documentation. (Implementation Status and Implementation-Defined Behavior): Likewise. * testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c: New. --- diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi index b946743f9b1..5331230c207 100644 --- a/libgomp/libgomp.texi +++ b/libgomp/libgomp.texi @@ -1967,6 +1967,12 @@ in @var{devicetype}, to use when executing a parallel or kernels region. This function returns what device type will be used when executing a parallel or kernels region. +This function returns @code{acc_device_none} if +@code{acc_get_device_type} is called from +@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end} +callbacks of the OpenACC Profiling Interface (@ref{OpenACC Profiling +Interface}), that is, if the device is currently being initialized. + @item @emph{C/C++}: @multitable @columnfractions .20 .80 @item @emph{Prototype}: @tab @code{acc_device_t acc_get_device_type(void);} @@ -3382,6 +3388,11 @@ every event that has been registered. We're not yet accounting for the fact that @cite{OpenACC events may occur during event processing}. +We just handle one case specially, as required by CUDA 9.0 +@command{nvprof}, that @code{acc_get_device_type} +(@ref{acc_get_device_type})) may be called from +@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end} +callbacks. We're not yet implementing initialization via a @code{acc_register_library} function that is either statically linked diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c index 5d786a5a2e7..1e7f9345c68 100644 --- a/libgomp/oacc-init.c +++ b/libgomp/oacc-init.c @@ -40,6 +40,11 @@ static gomp_mutex_t acc_device_lock; +static gomp_mutex_t acc_init_state_lock; +static enum { uninitialized, initializing, initialized } acc_init_state + = uninitialized; +static pthread_t acc_init_thread; + /* A cached version of the dispatcher for the global "current" accelerator type, e.g. used as the default when creating new host threads. This is the device-type equivalent of goacc_device_num (which specifies which device to @@ -228,6 +233,11 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs) static struct gomp_device_descr * acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit) { + gomp_mutex_lock (&acc_init_state_lock); + acc_init_state = initializing; + acc_init_thread = pthread_self (); + gomp_mutex_unlock (&acc_init_state_lock); + bool check_not_nested_p; if (implicit) { @@ -317,6 +327,14 @@ acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit) &api_info); } + /* We're setting 'initialized' *after* 'goacc_profiling_dispatch', so that a + nested 'acc_get_device_type' called from a profiling callback still sees + 'initializing', so that we don't deadlock when it then again tries to lock + 'goacc_prof_lock'. See also the discussion in 'acc_get_device_type'. */ + gomp_mutex_lock (&acc_init_state_lock); + acc_init_state = initialized; + gomp_mutex_unlock (&acc_init_state_lock); + return base_dev; } @@ -643,6 +661,17 @@ acc_set_device_type (acc_device_t d) ialias (acc_set_device_type) +static bool +self_initializing_p (void) +{ + bool res; + gomp_mutex_lock (&acc_init_state_lock); + res = (acc_init_state == initializing + && pthread_equal (acc_init_thread, pthread_self ())); + gomp_mutex_unlock (&acc_init_state_lock); + return res; +} + acc_device_t acc_get_device_type (void) { @@ -652,6 +681,15 @@ acc_get_device_type (void) if (thr && thr->base_dev) res = acc_device_type (thr->base_dev->type); + else if (self_initializing_p ()) + /* The Cuda libaccinj64.so version 9.0+ calls acc_get_device_type during the + acc_ev_device_init_start event callback, which is dispatched during + acc_init_1. Trying to lock acc_device_lock during such a call (as we do + in the else clause below), will result in deadlock, since the lock has + already been taken by the acc_init_1 caller. We work around this problem + by using the acc_get_device_type property "If the device type has not yet + been selected, the value acc_device_none may be returned". */ + ; else { acc_prof_info prof_info; diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c new file mode 100644 index 00000000000..b4e9f188aa6 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c @@ -0,0 +1,80 @@ +/* { dg-do run } */ +/* { dg-timeout 10 } */ + +/* Test the calling of 'acc_get_device_type' from within + 'cb_device_init_start' and 'cb_device_init_end' callbacks. This occurs + when the CUDA 9.0 'nvprof' tool is used, and previously deadlocked. */ + +#include +#include +#include + +static acc_prof_reg reg; +static acc_prof_reg unreg; +static acc_prof_lookup_func lookup; + +void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_) +{ + reg = reg_; + unreg = unreg_; + lookup = lookup_; +} + +static bool expect_cb_device_init_start; +static bool expect_cb_device_init_end; + +static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + assert (expect_cb_device_init_start); + expect_cb_device_init_start = false; + + acc_device_t acc_device_type; + acc_device_type = acc_get_device_type (); + assert (acc_device_type == acc_device_none); + + expect_cb_device_init_end = true; +} + +static void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + assert (expect_cb_device_init_end); + expect_cb_device_init_end = false; + + acc_device_t acc_device_type; + acc_device_type = acc_get_device_type (); + assert (acc_device_type == acc_device_none); +} + +int main(void) +{ + acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup); + + reg (acc_ev_device_init_start, cb_device_init_start, acc_reg); + reg (acc_ev_device_init_end, cb_device_init_end, acc_reg); + + expect_cb_device_init_start = true; + expect_cb_device_init_end = false; + acc_init (acc_device_host); + assert (!expect_cb_device_init_start); + assert (!expect_cb_device_init_end); + { + acc_device_t acc_device_type; + acc_device_type = acc_get_device_type (); + assert (acc_device_type == acc_device_host); + } + acc_shutdown (acc_device_host); + + expect_cb_device_init_start = true; + expect_cb_device_init_end = false; + acc_init (acc_device_default); + assert (!expect_cb_device_init_start); + assert (!expect_cb_device_init_end); + { + acc_device_t acc_device_type; + acc_device_type = acc_get_device_type (); + assert (acc_device_type != acc_device_none); + } + acc_shutdown (acc_device_default); + + return 0; +}