libgomp: Fix hang when profiling OpenACC programs with CUDA 9.0 nvprof

author Kwok Cheung Yeung <kcy@codesourcery.com>

Tue, 14 Jul 2020 17:31:35 +0000 (10:31 -0700)

committer Kwok Cheung Yeung <kcy@codesourcery.com>

Tue, 14 Jul 2020 17:31:35 +0000 (10:31 -0700)
author Kwok Cheung Yeung <kcy@codesourcery.com>
Tue, 14 Jul 2020 17:31:35 +0000 (10:31 -0700)
committer Kwok Cheung Yeung <kcy@codesourcery.com>
Tue, 14 Jul 2020 17:31:35 +0000 (10:31 -0700)
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi

index b946743f9b1abcdbfd1add5d8ce406c9acee6dd8..5331230c20740e8a612a6b8c9a6fafdd20a1af2a 100644 (file)
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -1967,6 +1967,12 @@ in @var{devicetype}, to use when executing a parallel or kernels region.
  This function returns what device type will be used when executing a
  parallel or kernels region.
  
+This function returns @code{acc_device_none} if
+@code{acc_get_device_type} is called from
+@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
+callbacks of the OpenACC Profiling Interface (@ref{OpenACC Profiling
+Interface}), that is, if the device is currently being initialized.
+
  @item @emph{C/C++}:
  @multitable @columnfractions .20 .80
  @item @emph{Prototype}: @tab @code{acc_device_t acc_get_device_type(void);}
@@ -3382,6 +3388,11 @@ every event that has been registered.
  
  We're not yet accounting for the fact that @cite{OpenACC events may
  occur during event processing}.
+We just handle one case specially, as required by CUDA 9.0
+@command{nvprof}, that @code{acc_get_device_type}
+(@ref{acc_get_device_type}) may be called from
+@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
+callbacks.
  
  We're not yet implementing initialization via a
  @code{acc_register_library} function that is either statically linked
diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c

index 5d786a5a2e7c8ce26cc878c83ba332ca4a3a895e..1e7f9345c682b30f1e342f4435d24f013f417e6e 100644 (file)
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -40,6 +40,11 @@
  
  static gomp_mutex_t acc_device_lock;
  
+static gomp_mutex_t acc_init_state_lock;
+static enum { uninitialized, initializing, initialized } acc_init_state
+  = uninitialized;
+static pthread_t acc_init_thread;
+
  /* A cached version of the dispatcher for the global "current" accelerator type,
     e.g. used as the default when creating new host threads.  This is the
     device-type equivalent of goacc_device_num (which specifies which device to
@@ -228,6 +233,11 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs)
  static struct gomp_device_descr *
  acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
  {
+  gomp_mutex_lock (&acc_init_state_lock);
+  acc_init_state = initializing;
+  acc_init_thread = pthread_self ();
+  gomp_mutex_unlock (&acc_init_state_lock);
+
    bool check_not_nested_p;
    if (implicit)
      {
@@ -317,6 +327,14 @@ acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
                                 &api_info);
      }
  
+  /* We're setting 'initialized' *after* 'goacc_profiling_dispatch', so that a
+     nested 'acc_get_device_type' called from a profiling callback still sees
+     'initializing', so that we don't deadlock when it then again tries to lock
+     'goacc_prof_lock'.  See also the discussion in 'acc_get_device_type'.  */
+  gomp_mutex_lock (&acc_init_state_lock);
+  acc_init_state = initialized;
+  gomp_mutex_unlock (&acc_init_state_lock);
+
    return base_dev;
  }
  
@@ -643,6 +661,17 @@ acc_set_device_type (acc_device_t d)
  
  ialias (acc_set_device_type)
  
+static bool
+self_initializing_p (void)
+{
+  bool res;
+  gomp_mutex_lock (&acc_init_state_lock);
+  res = (acc_init_state == initializing
+        && pthread_equal (acc_init_thread, pthread_self ()));
+  gomp_mutex_unlock (&acc_init_state_lock);
+  return res;
+}
+
  acc_device_t
  acc_get_device_type (void)
  {
@@ -652,6 +681,15 @@ acc_get_device_type (void)
  
    if (thr && thr->base_dev)
      res = acc_device_type (thr->base_dev->type);
+  else if (self_initializing_p ())
+    /* The CUDA libaccinj64.so version 9.0+ calls acc_get_device_type during the
+       acc_ev_device_init_start event callback, which is dispatched during
+       acc_init_1.  Trying to lock acc_device_lock during such a call (as we do
+       in the else clause below), will result in deadlock, since the lock has
+       already been taken by the acc_init_1 caller.  We work around this problem
+       by using the acc_get_device_type property "If the device type has not yet
+       been selected, the value acc_device_none may be returned".  */
+    ;
    else
      {
        acc_prof_info prof_info;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c

new file mode 100644 (file)

index 0000000..b4e9f18
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c
@@ -0,0 +1,80 @@
+/* { dg-do run } */
+/* { dg-timeout 10 } */
+
+/* Test the calling of 'acc_get_device_type' from within
+   'cb_device_init_start' and 'cb_device_init_end' callbacks.  This occurs
+   when the CUDA 9.0 'nvprof' tool is used, and previously deadlocked.  */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <acc_prof.h>
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+static bool expect_cb_device_init_start;
+static bool expect_cb_device_init_end;
+
+static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  assert (expect_cb_device_init_start);
+  expect_cb_device_init_start = false;
+
+  acc_device_t acc_device_type;
+  acc_device_type = acc_get_device_type ();
+  assert (acc_device_type == acc_device_none);
+
+  expect_cb_device_init_end = true;
+}
+
+static void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  assert (expect_cb_device_init_end);
+  expect_cb_device_init_end = false;
+
+  acc_device_t acc_device_type;
+  acc_device_type = acc_get_device_type ();
+  assert (acc_device_type == acc_device_none);
+}
+
+int main(void)
+{
+  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
+
+  reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
+  reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
+
+  expect_cb_device_init_start = true;
+  expect_cb_device_init_end = false;
+  acc_init (acc_device_host);
+  assert (!expect_cb_device_init_start);
+  assert (!expect_cb_device_init_end);
+  {
+    acc_device_t acc_device_type;
+    acc_device_type = acc_get_device_type ();
+    assert (acc_device_type == acc_device_host);
+  }
+  acc_shutdown (acc_device_host);
+
+  expect_cb_device_init_start = true;
+  expect_cb_device_init_end = false;
+  acc_init (acc_device_default);
+  assert (!expect_cb_device_init_start);
+  assert (!expect_cb_device_init_end);
+  {
+    acc_device_t acc_device_type;
+    acc_device_type = acc_get_device_type ();
+    assert (acc_device_type != acc_device_none);
+  }
+  acc_shutdown (acc_device_default);
+
+  return 0;
+}
author	Kwok Cheung Yeung <kcy@codesourcery.com>
	Tue, 14 Jul 2020 17:31:35 +0000 (10:31 -0700)
committer	Kwok Cheung Yeung <kcy@codesourcery.com>
	Tue, 14 Jul 2020 17:31:35 +0000 (10:31 -0700)
libgomp/libgomp.texi		patch \| blob \| history
libgomp/oacc-init.c		patch \| blob \| history
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c	[new file with mode: 0644]	patch \| blob