--- /dev/null
+/* CUDA API description.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>.
+
+This header provides the minimum set of typedefs, enums and function
+declarations needed to compile plugin-nvptx.c when cuda.h and
+libcuda.so.1 are not available. */
+
+#ifndef GCC_CUDA_H
+#define GCC_CUDA_H
+
+#include <stdlib.h>
+
+#define CUDA_VERSION 8000
+
+typedef void *CUcontext; /* Opaque handle type.  */
+typedef int CUdevice; /* Device identifier, a plain integer.  */
+#ifdef __LP64__
+typedef unsigned long long CUdeviceptr; /* Device address held as an integer, not a host pointer.  */
+#else
+typedef unsigned CUdeviceptr; /* Narrower integer form used when __LP64__ is not defined.  */
+#endif
+typedef void *CUevent; /* Opaque handle type.  */
+typedef void *CUfunction; /* Opaque handle type.  */
+typedef void *CUlinkState; /* Opaque handle type.  */
+typedef void *CUmodule; /* Opaque handle type.  */
+typedef void *CUstream; /* Opaque handle type.  */
+
+typedef enum { /* Result codes returned by every cu* function declared in this header.  */
+ CUDA_SUCCESS = 0, /* The only success value in this list.  */
+ CUDA_ERROR_INVALID_VALUE = 1,
+ CUDA_ERROR_OUT_OF_MEMORY = 2,
+ CUDA_ERROR_INVALID_CONTEXT = 201,
+ CUDA_ERROR_NOT_FOUND = 500,
+ CUDA_ERROR_NOT_READY = 600,
+ CUDA_ERROR_LAUNCH_FAILED = 719
+} CUresult;
+
+typedef enum {
+ CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,
+ CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
+ CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
+ CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
+ CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
+ CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
+ CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,
+ CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,
+ CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,
+ CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
+ CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
+ CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
+ CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
+} CUdevice_attribute;
+
+enum {
+ CU_EVENT_DEFAULT = 0,
+ CU_EVENT_DISABLE_TIMING = 2
+};
+
+typedef enum {
+ CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+ CU_FUNC_ATTRIBUTE_NUM_REGS = 4
+} CUfunction_attribute;
+
+typedef enum {
+ CU_JIT_WALL_TIME = 2,
+ CU_JIT_INFO_LOG_BUFFER = 3,
+ CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4,
+ CU_JIT_ERROR_LOG_BUFFER = 5,
+ CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,
+ CU_JIT_LOG_VERBOSE = 12
+} CUjit_option;
+
+typedef enum {
+ CU_JIT_INPUT_PTX = 1
+} CUjitInputType;
+
+enum {
+ CU_CTX_SCHED_AUTO = 0
+};
+
+#define CU_LAUNCH_PARAM_END ((void *) 0)
+#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void *) 1)
+#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void *) 2)
+
+enum {
+ CU_STREAM_DEFAULT = 0,
+ CU_STREAM_NON_BLOCKING = 1
+};
+
+#define cuCtxCreate cuCtxCreate_v2 /* Pattern used throughout this list: the macro precedes the prototype, so the prototype declares the _v2 symbol.  */
+CUresult cuCtxCreate (CUcontext *, unsigned, CUdevice);
+#define cuCtxDestroy cuCtxDestroy_v2
+CUresult cuCtxDestroy (CUcontext);
+CUresult cuCtxGetCurrent (CUcontext *);
+CUresult cuCtxGetDevice (CUdevice *);
+#define cuCtxPopCurrent cuCtxPopCurrent_v2
+CUresult cuCtxPopCurrent (CUcontext *);
+#define cuCtxPushCurrent cuCtxPushCurrent_v2
+CUresult cuCtxPushCurrent (CUcontext);
+CUresult cuCtxSynchronize (void);
+CUresult cuDeviceGet (CUdevice *, int);
+CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
+CUresult cuDeviceGetCount (int *);
+CUresult cuEventCreate (CUevent *, unsigned);
+#define cuEventDestroy cuEventDestroy_v2
+CUresult cuEventDestroy (CUevent);
+CUresult cuEventElapsedTime (float *, CUevent, CUevent);
+CUresult cuEventQuery (CUevent);
+CUresult cuEventRecord (CUevent, CUstream);
+CUresult cuEventSynchronize (CUevent);
+CUresult cuFuncGetAttribute (int *, CUfunction_attribute, CUfunction);
+CUresult cuGetErrorString (CUresult, const char **);
+CUresult cuInit (unsigned);
+CUresult cuLaunchKernel (CUfunction, unsigned, unsigned, unsigned, unsigned,
+ unsigned, unsigned, unsigned, CUstream, void **, void **);
+#define cuLinkAddData cuLinkAddData_v2
+CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t, const char *,
+ unsigned, CUjit_option *, void **);
+CUresult cuLinkComplete (CUlinkState, void **, size_t *);
+#define cuLinkCreate cuLinkCreate_v2
+CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuLinkDestroy (CUlinkState);
+#define cuMemAlloc cuMemAlloc_v2
+CUresult cuMemAlloc (CUdeviceptr *, size_t);
+#define cuMemAllocHost cuMemAllocHost_v2
+CUresult cuMemAllocHost (void **, size_t);
+CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
+#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
+CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream);
+#define cuMemcpyDtoH cuMemcpyDtoH_v2
+CUresult cuMemcpyDtoH (void *, CUdeviceptr, size_t);
+#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
+CUresult cuMemcpyDtoHAsync (void *, CUdeviceptr, size_t, CUstream);
+#define cuMemcpyHtoD cuMemcpyHtoD_v2
+CUresult cuMemcpyHtoD (CUdeviceptr, const void *, size_t);
+#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
+CUresult cuMemcpyHtoDAsync (CUdeviceptr, const void *, size_t, CUstream);
+#define cuMemFree cuMemFree_v2
+CUresult cuMemFree (CUdeviceptr);
+CUresult cuMemFreeHost (void *);
+#define cuMemGetAddressRange cuMemGetAddressRange_v2
+CUresult cuMemGetAddressRange (CUdeviceptr *, size_t *, CUdeviceptr);
+#define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2
+CUresult cuMemHostGetDevicePointer (CUdeviceptr *, void *, unsigned);
+CUresult cuModuleGetFunction (CUfunction *, CUmodule, const char *);
+#define cuModuleGetGlobal cuModuleGetGlobal_v2
+CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
+CUresult cuModuleLoad (CUmodule *, const char *);
+CUresult cuModuleLoadData (CUmodule *, const void *);
+CUresult cuModuleUnload (CUmodule);
+CUresult cuStreamCreate (CUstream *, unsigned);
+#define cuStreamDestroy cuStreamDestroy_v2
+CUresult cuStreamDestroy (CUstream);
+CUresult cuStreamQuery (CUstream);
+CUresult cuStreamSynchronize (CUstream);
+CUresult cuStreamWaitEvent (CUstream, CUevent, unsigned);
+
+#endif /* GCC_CUDA_H */
#include <assert.h>
#include <errno.h>
-static const char *
-cuda_error (CUresult r)
-{
-#if CUDA_VERSION < 7000
- /* Specified in documentation and present in library from at least
- 5.5. Not declared in header file prior to 7.0. */
- extern CUresult cuGetErrorString (CUresult, const char **);
-#endif
- const char *desc;
-
- r = cuGetErrorString (r, &desc);
- if (r != CUDA_SUCCESS)
- desc = "unknown cuda error";
-
- return desc;
+#if PLUGIN_NVPTX_DYNAMIC
+# include <dlfcn.h>
+
+# define CUDA_CALLS \
+CUDA_ONE_CALL (cuCtxCreate) \
+CUDA_ONE_CALL (cuCtxDestroy) \
+CUDA_ONE_CALL (cuCtxGetCurrent) \
+CUDA_ONE_CALL (cuCtxGetDevice) \
+CUDA_ONE_CALL (cuCtxPopCurrent) \
+CUDA_ONE_CALL (cuCtxPushCurrent) \
+CUDA_ONE_CALL (cuCtxSynchronize) \
+CUDA_ONE_CALL (cuDeviceGet) \
+CUDA_ONE_CALL (cuDeviceGetAttribute) \
+CUDA_ONE_CALL (cuDeviceGetCount) \
+CUDA_ONE_CALL (cuEventCreate) \
+CUDA_ONE_CALL (cuEventDestroy) \
+CUDA_ONE_CALL (cuEventElapsedTime) \
+CUDA_ONE_CALL (cuEventQuery) \
+CUDA_ONE_CALL (cuEventRecord) \
+CUDA_ONE_CALL (cuEventSynchronize) \
+CUDA_ONE_CALL (cuFuncGetAttribute) \
+CUDA_ONE_CALL (cuGetErrorString) \
+CUDA_ONE_CALL (cuInit) \
+CUDA_ONE_CALL (cuLaunchKernel) \
+CUDA_ONE_CALL (cuLinkAddData) \
+CUDA_ONE_CALL (cuLinkComplete) \
+CUDA_ONE_CALL (cuLinkCreate) \
+CUDA_ONE_CALL (cuLinkDestroy) \
+CUDA_ONE_CALL (cuMemAlloc) \
+CUDA_ONE_CALL (cuMemAllocHost) \
+CUDA_ONE_CALL (cuMemcpy) \
+CUDA_ONE_CALL (cuMemcpyDtoDAsync) \
+CUDA_ONE_CALL (cuMemcpyDtoH) \
+CUDA_ONE_CALL (cuMemcpyDtoHAsync) \
+CUDA_ONE_CALL (cuMemcpyHtoD) \
+CUDA_ONE_CALL (cuMemcpyHtoDAsync) \
+CUDA_ONE_CALL (cuMemFree) \
+CUDA_ONE_CALL (cuMemFreeHost) \
+CUDA_ONE_CALL (cuMemGetAddressRange) \
+CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
+CUDA_ONE_CALL (cuModuleGetFunction) \
+CUDA_ONE_CALL (cuModuleGetGlobal) \
+CUDA_ONE_CALL (cuModuleLoad) \
+CUDA_ONE_CALL (cuModuleLoadData) \
+CUDA_ONE_CALL (cuModuleUnload) \
+CUDA_ONE_CALL (cuStreamCreate) \
+CUDA_ONE_CALL (cuStreamDestroy) \
+CUDA_ONE_CALL (cuStreamQuery) \
+CUDA_ONE_CALL (cuStreamSynchronize) \
+CUDA_ONE_CALL (cuStreamWaitEvent)
+# define CUDA_ONE_CALL(call) \
+ __typeof (call) *call;
+struct cuda_lib_s { /* Table of function pointers, one member per CUDA_CALLS entry.  */
+ CUDA_CALLS /* Each entry expands to `__typeof (call) *call;', so every member has the type of the declared function.  */
+} cuda_lib;
+
+/* Tri-state load status: -1 if init_cuda_lib has not been called yet,
+   false if it has been and failed, true if it has been and succeeded.
+   The type must be explicitly signed.  Plain char is unsigned on some
+   targets (e.g. arm, powerpc); there a stored -1 reads back as 255,
+   never compares equal to -1, and init_cuda_lib would report success
+   on its first call without ever loading the library.  */
+static signed char cuda_lib_inited = -1;
+
+/* Dynamically load the CUDA library libcuda.so.1 and initialize the
+   function pointers in cuda_lib.  Return true if successful, false if
+   the library or any required symbol is missing.  The outcome of the
+   first call is cached, so a failed load is not retried.  */
+static bool
+init_cuda_lib (void)
+{
+  if (cuda_lib_inited != -1)
+    return cuda_lib_inited;
+  const char *cuda_runtime_lib = "libcuda.so.1";
+  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
+  /* Assume failure until every symbol has been resolved.  */
+  cuda_lib_inited = false;
+  if (h == NULL)
+    return false;
+  /* CUDA_ONE_CALL goes through CUDA_ONE_CALL_1 so that CALL is fully
+     macro-expanded before it is stringified for dlsym; the CUDA header
+     may #define it to a versioned name such as cuCtxCreate_v2.  */
+# undef CUDA_ONE_CALL
+# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
+# define CUDA_ONE_CALL_1(call) \
+  cuda_lib.call = dlsym (h, #call); \
+  if (cuda_lib.call == NULL) \
+    return false;
+  CUDA_CALLS
+  cuda_lib_inited = true;
+  return true;
}
+# undef CUDA_ONE_CALL
+# undef CUDA_ONE_CALL_1
+# define CUDA_CALL_PREFIX cuda_lib.
+#else
+# define CUDA_CALL_PREFIX
+# define init_cuda_lib() true
+#endif
/* Convenience macros for the frequently used CUDA library call and
- error handling sequence. This does not capture all the cases we
- use in this file, but is common enough. */
+ error handling sequence as well as CUDA library calls that
+ do the error checking themselves or don't do it at all. */
#define CUDA_CALL_ERET(ERET, FN, ...) \
do { \
- unsigned __r = FN (__VA_ARGS__); \
+ unsigned __r \
+ = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
if (__r != CUDA_SUCCESS) \
{ \
GOMP_PLUGIN_error (#FN " error: %s", \
} while (0)
#define CUDA_CALL(FN, ...) \
- CUDA_CALL_ERET (false, (FN), __VA_ARGS__)
+ CUDA_CALL_ERET (false, FN, __VA_ARGS__)
#define CUDA_CALL_ASSERT(FN, ...) \
do { \
- unsigned __r = FN (__VA_ARGS__); \
+ unsigned __r \
+ = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
if (__r != CUDA_SUCCESS) \
{ \
GOMP_PLUGIN_fatal (#FN " error: %s", \
} \
} while (0)
+#define CUDA_CALL_NOCHECK(FN, ...) \
+ CUDA_CALL_PREFIX FN (__VA_ARGS__)
+
+/* Return a human-readable description of the CUDA result code R, or a
+   generic fallback string if the CUDA library cannot describe it.  */
+static const char *
+cuda_error (CUresult r)
+{
+#if CUDA_VERSION < 7000
+  /* cuGetErrorString is documented and present in the library from at
+     least 5.5, but header files prior to 7.0 do not declare it.  */
+  extern CUresult cuGetErrorString (CUresult, const char **);
+#endif
+  const char *text;
+
+  if (CUDA_CALL_NOCHECK (cuGetErrorString, r, &text) != CUDA_SUCCESS)
+    return "unknown cuda error";
+
+  return text;
+}
+
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
ret &= map_fini (s);
- CUresult r = cuStreamDestroy (s->stream);
+ CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
if (r != CUDA_SUCCESS)
{
GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
s->stream = existing;
else
{
- r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
+ r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
+ CU_STREAM_DEFAULT);
if (r != CUDA_SUCCESS)
{
pthread_mutex_unlock (&ptx_dev->stream_lock);
if (instantiated_devices != 0)
return true;
- CUDA_CALL (cuInit, 0);
ptx_events = NULL;
pthread_mutex_init (&ptx_event_lock, NULL);
+ if (!init_cuda_lib ())
+ return false;
+
+ CUDA_CALL (cuInit, 0);
+
CUDA_CALL (cuDeviceGetCount, &ndevs);
ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
* ndevs);
struct ptx_device *ptx_dev;
CUcontext thd_ctx;
- r = cuCtxGetDevice (&dev);
+ r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
{
GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
ptx_dev->dev = dev;
ptx_dev->ctx_shared = false;
- r = cuCtxGetDevice (&ctx_dev);
+ r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
{
GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
&pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
ptx_dev->clock_khz = pi;
- CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
&pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
ptx_dev->num_sms = pi;
/* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
in CUDA 6.0 and newer. */
- r = cuDeviceGetAttribute (&pi, 82, dev);
+ r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
/* Fallback: use limit of registers per block, which is usually equal. */
if (r == CUDA_ERROR_INVALID_VALUE)
pi = ptx_dev->regs_per_block;
return NULL;
}
- r = cuDeviceGetAttribute (&async_engines,
- CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
+ r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
+ CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
if (r != CUDA_SUCCESS)
async_engines = 1;
further initialization). */
if (instantiated_devices == 0)
{
- CUresult r = cuInit (0);
+ if (!init_cuda_lib ())
+ return 0;
+ CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
/* This is not an error: e.g. we may have CUDA libraries installed but
no devices available. */
if (r != CUDA_SUCCESS)
/* cuLinkAddData's 'data' argument erroneously omits the const
qualifier. */
GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
- r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char*)ptx_objs->code,
- ptx_objs->size, 0, 0, 0, 0);
+ r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
+ (char *) ptx_objs->code, ptx_objs->size,
+ 0, 0, 0, 0);
if (r != CUDA_SUCCESS)
{
GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
}
GOMP_PLUGIN_debug (0, "Linking\n");
- r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
+ r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
if (e->ord != nvthd->ptx_dev->ord)
continue;
- r = cuEventQuery (*e->evt);
+ r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
if (r == CUDA_SUCCESS)
{
bool append_async = false;
break;
}
- cuEventDestroy (*te);
+ CUDA_CALL_NOCHECK (cuEventDestroy, *te);
free ((void *)te);
/* Unlink 'e' from ptx_events list. */
cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
- if (cuDeviceGetAttribute (&block_size, cu_tpb, dev) == CUDA_SUCCESS
- && cuDeviceGetAttribute (&warp_size, cu_ws, dev) == CUDA_SUCCESS
- && cuDeviceGetAttribute (&dev_size, cu_mpc, dev) == CUDA_SUCCESS
- && cuDeviceGetAttribute (&cpu_size, cu_tpm, dev) == CUDA_SUCCESS)
+ if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
+ dev) == CUDA_SUCCESS
+ && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
+ dev) == CUDA_SUCCESS
+ && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
+ dev) == CUDA_SUCCESS
+ && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
+ dev) == CUDA_SUCCESS)
{
GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
" dev_size=%d, cpu_size=%d\n",
#ifndef DISABLE_ASYNC
if (async < acc_async_noval)
{
- r = cuStreamSynchronize (dev_str->stream);
+ r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
maybe_abort_msg);
e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
+ r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
maybe_abort_msg);
event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
}
#else
- r = cuCtxSynchronize ();
+ r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
maybe_abort_msg);
if (!s)
GOMP_PLUGIN_fatal ("unknown async %d", async);
- r = cuStreamQuery (s->stream);
+ r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
if (r == CUDA_SUCCESS)
{
/* The oacc-parallel.c:goacc_wait function calls this hook to determine
for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
{
if ((s->multithreaded || pthread_equal (s->host_thread, self))
- && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
+ && CUDA_CALL_NOCHECK (cuStreamQuery,
+ s->stream) == CUDA_ERROR_NOT_READY)
{
pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
return 0;
{
if (s->multithreaded || pthread_equal (s->host_thread, self))
{
- r = cuStreamQuery (s->stream);
+ r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
if (r == CUDA_SUCCESS)
continue;
else if (r != CUDA_ERROR_NOT_READY)
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
CUdeviceptr dptr;
- CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick");
+ CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
+ module, "__nvptx_clocktick");
if (r == CUDA_ERROR_NOT_FOUND)
return;
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
double __nvptx_clocktick = 1e-3 / dev->clock_khz;
- r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick));
+ r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
+ sizeof (__nvptx_clocktick));
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}
if (image->target_data == target_data)
{
*prev_p = image->next;
- if (cuModuleUnload (image->module) != CUDA_SUCCESS)
+ if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
ret = false;
free (image->fns);
free (image);
nvptx_stacks_alloc (size_t size, int num)
{
CUdeviceptr stacks;
- CUresult r = cuMemAlloc (&stacks, size * num);
+ CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
return (void *) stacks;
static void
nvptx_stacks_free (void *p, int num)
{
- CUresult r = cuMemFree ((CUdeviceptr) p);
+ CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
CU_LAUNCH_PARAM_END
};
- r = cuLaunchKernel (function,
- teams, 1, 1,
- 32, threads, 1,
- 0, ptx_dev->null_stream->stream, NULL, config);
+ r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
+ 32, threads, 1, 0, ptx_dev->null_stream->stream,
+ NULL, config);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
- r = cuCtxSynchronize ();
+ r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
maybe_abort_msg);