[gcc.git] / libgomp / plugin / plugin-nvptx.c
1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2021 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another. */
33
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
40 #include "oacc-int.h"
41
42 #include <pthread.h>
43 #include <cuda.h>
44 #include <stdbool.h>
45 #include <limits.h>
46 #include <string.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <assert.h>
50 #include <errno.h>
51
52 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
53 block to cache between kernel invocations. For soft-stacks blocks bigger
54 than this, we will free the block before attempting another GPU memory
55 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
56 we will free the cached soft-stacks block anyway, then retry the
57 allocation. If that fails too, we lose. */
58
59 #define SOFTSTACK_CACHE_LIMIT 134217728
60
61 #if CUDA_VERSION < 6000
62 extern CUresult cuGetErrorString (CUresult, const char **);
63 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
64 #endif
65
66 #if CUDA_VERSION >= 6050
67 #undef cuLinkCreate
68 #undef cuLinkAddData
69 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
70 const char *, unsigned, CUjit_option *, void **);
71 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
72 #else
73 typedef size_t (*CUoccupancyB2DSize)(int);
74 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
75 const char *, unsigned, CUjit_option *, void **);
76 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
77 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
78 CUoccupancyB2DSize, size_t, int);
79 #endif
80
81 #define DO_PRAGMA(x) _Pragma (#x)
82
83 #if PLUGIN_NVPTX_DYNAMIC
84 # include <dlfcn.h>
85
86 struct cuda_lib_s {
87
88 # define CUDA_ONE_CALL(call) \
89 __typeof (call) *call;
90 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
91 CUDA_ONE_CALL (call)
92 #include "cuda-lib.def"
93 # undef CUDA_ONE_CALL
94 # undef CUDA_ONE_CALL_MAYBE_NULL
95
96 } cuda_lib;
97
98 /* -1 if init_cuda_lib has not been called yet, false
99 if it has been and failed, true if it has been and succeeded. */
100 static signed char cuda_lib_inited = -1;
101
102 /* Dynamically load the CUDA driver library (libcuda) and initialize
103 function pointers; return false if unsuccessful, true if successful. */
104 static bool
105 init_cuda_lib (void)
106 {
107 if (cuda_lib_inited != -1)
108 return cuda_lib_inited;
109 const char *cuda_runtime_lib = "libcuda.so.1";
110 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
111 cuda_lib_inited = false;
112 if (h == NULL)
113 return false;
114
115 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
116 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
117 # define CUDA_ONE_CALL_1(call, allow_null) \
118 cuda_lib.call = dlsym (h, #call); \
119 if (!allow_null && cuda_lib.call == NULL) \
120 return false;
121 #include "cuda-lib.def"
122 # undef CUDA_ONE_CALL
123 # undef CUDA_ONE_CALL_1
124 # undef CUDA_ONE_CALL_MAYBE_NULL
125
126 cuda_lib_inited = true;
127 return true;
128 }
129 # define CUDA_CALL_PREFIX cuda_lib.
130 #else
131
132 # define CUDA_ONE_CALL(call)
133 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
134 #include "cuda-lib.def"
135 #undef CUDA_ONE_CALL_MAYBE_NULL
136 #undef CUDA_ONE_CALL
137
138 # define CUDA_CALL_PREFIX
139 # define init_cuda_lib() true
140 #endif
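/* Illustrative sketch, not part of the build: for an entry point such as
   cuMemAlloc, "cuda-lib.def" is expanded in two different ways.  With
   PLUGIN_NVPTX_DYNAMIC it contributes a member '__typeof (cuMemAlloc)
   *cuMemAlloc;' to 'struct cuda_lib_s', which init_cuda_lib fills in via
   dlsym, and every call is routed through CUDA_CALL_PREFIX as
   'cuda_lib.cuMemAlloc (...)'.  Without it, the plugin links against libcuda
   directly: entries marked CUDA_ONE_CALL_MAYBE_NULL merely get a
   '#pragma weak' declaration, CUDA_CALL_PREFIX is empty, and the driver
   functions are called by their plain names.  */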
141
142 #include "secure_getenv.h"
143
144 #undef MIN
145 #undef MAX
146 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
147 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
148
149 /* Convenience macros for the frequently used CUDA library call and
150 error handling sequence, as well as for CUDA library calls that
151 do the error checking themselves or don't do it at all. */
152
153 #define CUDA_CALL_ERET(ERET, FN, ...) \
154 do { \
155 unsigned __r \
156 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
157 if (__r != CUDA_SUCCESS) \
158 { \
159 GOMP_PLUGIN_error (#FN " error: %s", \
160 cuda_error (__r)); \
161 return ERET; \
162 } \
163 } while (0)
164
165 #define CUDA_CALL(FN, ...) \
166 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
167
168 #define CUDA_CALL_ASSERT(FN, ...) \
169 do { \
170 unsigned __r \
171 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
172 if (__r != CUDA_SUCCESS) \
173 { \
174 GOMP_PLUGIN_fatal (#FN " error: %s", \
175 cuda_error (__r)); \
176 } \
177 } while (0)
178
179 #define CUDA_CALL_NOCHECK(FN, ...) \
180 CUDA_CALL_PREFIX FN (__VA_ARGS__)
181
182 #define CUDA_CALL_EXISTS(FN) \
183 CUDA_CALL_PREFIX FN
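/* Usage sketch for the macros above (hypothetical call site): a plugin entry
   point returning bool can write 'CUDA_CALL (cuMemAlloc, &d, s);', which on
   failure reports "cuMemAlloc error: ..." through GOMP_PLUGIN_error and
   returns false.  CUDA_CALL_ERET does the same with a caller-chosen error
   return value, CUDA_CALL_ASSERT calls GOMP_PLUGIN_fatal instead of
   returning, CUDA_CALL_NOCHECK hands the raw CUresult back to the caller,
   and CUDA_CALL_EXISTS tests whether an entry point declared
   CUDA_ONE_CALL_MAYBE_NULL is actually available at run time.  */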
184
185 static const char *
186 cuda_error (CUresult r)
187 {
188 const char *fallback = "unknown cuda error";
189 const char *desc;
190
191 if (!CUDA_CALL_EXISTS (cuGetErrorString))
192 return fallback;
193
194 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
195 if (r == CUDA_SUCCESS)
196 return desc;
197
198 return fallback;
199 }
200
201 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
202 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
203 static char cuda_driver_version_s[30];
204
205 static unsigned int instantiated_devices = 0;
206 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
207
208 /* NVPTX/CUDA specific definition of asynchronous queues. */
209 struct goacc_asyncqueue
210 {
211 CUstream cuda_stream;
212 };
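/* Each OpenACC async queue wraps exactly one CUDA stream; the queue
   operations further below map directly onto the stream API (cuStreamQuery
   for async_test, cuStreamSynchronize for async_synchronize, and a
   cuEventRecord/cuStreamWaitEvent pair for async_serialize).  */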
213
214 struct nvptx_callback
215 {
216 void (*fn) (void *);
217 void *ptr;
218 struct goacc_asyncqueue *aq;
219 struct nvptx_callback *next;
220 };
221
222 /* Thread-specific data for PTX. */
223
224 struct nvptx_thread
225 {
226 /* We currently have this embedded inside the plugin because libgomp manages
227 devices through integer target_ids. This might be better if using an
228 opaque target-specific pointer directly from gomp_device_descr. */
229 struct ptx_device *ptx_dev;
230 };
231
232 /* Target data function launch information. */
233
234 struct targ_fn_launch
235 {
236 const char *fn;
237 unsigned short dim[GOMP_DIM_MAX];
238 };
239
240 /* Target PTX object information. */
241
242 struct targ_ptx_obj
243 {
244 const char *code;
245 size_t size;
246 };
247
248 /* Target data image information. */
249
250 typedef struct nvptx_tdata
251 {
252 const struct targ_ptx_obj *ptx_objs;
253 unsigned ptx_num;
254
255 const char *const *var_names;
256 unsigned var_num;
257
258 const struct targ_fn_launch *fn_descs;
259 unsigned fn_num;
260 } nvptx_tdata_t;
261
262 /* Descriptor of a loaded function. */
263
264 struct targ_fn_descriptor
265 {
266 CUfunction fn;
267 const struct targ_fn_launch *launch;
268 int regs_per_thread;
269 int max_threads_per_block;
270 };
271
272 /* A loaded PTX image. */
273 struct ptx_image_data
274 {
275 const void *target_data;
276 CUmodule module;
277
278 struct targ_fn_descriptor *fns; /* Array of functions. */
279
280 struct ptx_image_data *next;
281 };
282
283 struct ptx_free_block
284 {
285 void *ptr;
286 struct ptx_free_block *next;
287 };
288
289 struct ptx_device
290 {
291 CUcontext ctx;
292 bool ctx_shared;
293 CUdevice dev;
294
295 int ord;
296 bool overlap;
297 bool map;
298 bool concur;
299 bool mkern;
300 int mode;
301 int clock_khz;
302 int num_sms;
303 int regs_per_block;
304 int regs_per_sm;
305 int warp_size;
306 int max_threads_per_block;
307 int max_threads_per_multiprocessor;
308 int default_dims[GOMP_DIM_MAX];
309
310 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
311 char name[256];
312
313 struct ptx_image_data *images; /* Images loaded on device. */
314 pthread_mutex_t image_lock; /* Lock for above list. */
315
316 struct ptx_free_block *free_blocks;
317 pthread_mutex_t free_blocks_lock;
318
319 /* OpenMP stacks, cached between kernel invocations. */
320 struct
321 {
322 CUdeviceptr ptr;
323 size_t size;
324 pthread_mutex_t lock;
325 } omp_stacks;
326
327 struct ptx_device *next;
328 };
329
330 static struct ptx_device **ptx_devices;
331
332 static inline struct nvptx_thread *
333 nvptx_thread (void)
334 {
335 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
336 }
337
338 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
339 should be locked on entry and remains locked on exit. */
340
341 static bool
342 nvptx_init (void)
343 {
344 int ndevs;
345
346 if (instantiated_devices != 0)
347 return true;
348
349 if (!init_cuda_lib ())
350 return false;
351
352 CUDA_CALL (cuInit, 0);
353
354 int cuda_driver_version;
355 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
356 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
357 "CUDA Driver %u.%u",
358 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
359
360 CUDA_CALL (cuDeviceGetCount, &ndevs);
361 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
362 * ndevs);
363
364 return true;
365 }
366
367 /* Select the N'th PTX device for the current host thread. The device must
368 have been opened before calling this function. */
369
370 static bool
371 nvptx_attach_host_thread_to_device (int n)
372 {
373 CUdevice dev;
374 CUresult r;
375 struct ptx_device *ptx_dev;
376 CUcontext thd_ctx;
377
378 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
379 if (r == CUDA_ERROR_NOT_PERMITTED)
380 {
381 /* Assume we're in a CUDA callback, just return true. */
382 return true;
383 }
384 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
385 {
386 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
387 return false;
388 }
389
390 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
391 return true;
392 else
393 {
394 CUcontext old_ctx;
395
396 ptx_dev = ptx_devices[n];
397 if (!ptx_dev)
398 {
399 GOMP_PLUGIN_error ("device %d not found", n);
400 return false;
401 }
402
403 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
404
405 /* We don't necessarily have a current context (e.g. if it has been
406 destroyed). Pop it if we do though. */
407 if (thd_ctx != NULL)
408 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
409
410 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
411 }
412 return true;
413 }
414
415 static struct ptx_device *
416 nvptx_open_device (int n)
417 {
418 struct ptx_device *ptx_dev;
419 CUdevice dev, ctx_dev;
420 CUresult r;
421 int async_engines, pi;
422
423 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
424
425 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
426
427 ptx_dev->ord = n;
428 ptx_dev->dev = dev;
429 ptx_dev->ctx_shared = false;
430
431 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
432 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
433 {
434 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
435 return NULL;
436 }
437
438 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
439 {
440 /* The current host thread has an active context for a different device.
441 Detach it. */
442 CUcontext old_ctx;
443 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
444 }
445
446 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
447
448 if (!ptx_dev->ctx)
449 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
450 else
451 ptx_dev->ctx_shared = true;
452
453 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
454 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
455 ptx_dev->overlap = pi;
456
457 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
458 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
459 ptx_dev->map = pi;
460
461 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
462 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
463 ptx_dev->concur = pi;
464
465 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
466 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
467 ptx_dev->mode = pi;
468
469 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
470 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
471 ptx_dev->mkern = pi;
472
473 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
474 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
475 ptx_dev->clock_khz = pi;
476
477 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
478 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
479 ptx_dev->num_sms = pi;
480
481 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
482 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
483 ptx_dev->regs_per_block = pi;
484
485 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
486 in CUDA 6.0 and newer. */
487 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
488 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
489 dev);
490 /* Fallback: use limit of registers per block, which is usually equal. */
491 if (r == CUDA_ERROR_INVALID_VALUE)
492 pi = ptx_dev->regs_per_block;
493 else if (r != CUDA_SUCCESS)
494 {
495 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
496 return NULL;
497 }
498 ptx_dev->regs_per_sm = pi;
499
500 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
501 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
502 if (pi != 32)
503 {
504 GOMP_PLUGIN_error ("Only warp size 32 is supported");
505 return NULL;
506 }
507 ptx_dev->warp_size = pi;
508
509 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
510 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
511 ptx_dev->max_threads_per_block = pi;
512
513 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
514 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
515 ptx_dev->max_threads_per_multiprocessor = pi;
516
517 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
518 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
519 if (r != CUDA_SUCCESS)
520 async_engines = 1;
521
522 for (int i = 0; i != GOMP_DIM_MAX; i++)
523 ptx_dev->default_dims[i] = 0;
524
525 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
526 dev);
527
528 ptx_dev->images = NULL;
529 pthread_mutex_init (&ptx_dev->image_lock, NULL);
530
531 ptx_dev->free_blocks = NULL;
532 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
533
534 ptx_dev->omp_stacks.ptr = 0;
535 ptx_dev->omp_stacks.size = 0;
536 pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
537
538 return ptx_dev;
539 }
540
541 static bool
542 nvptx_close_device (struct ptx_device *ptx_dev)
543 {
544 if (!ptx_dev)
545 return true;
546
547 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
548 {
549 struct ptx_free_block *b_next = b->next;
550 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
551 free (b);
552 b = b_next;
553 }
554
555 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
556 pthread_mutex_destroy (&ptx_dev->image_lock);
557
558 pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
559
560 if (ptx_dev->omp_stacks.ptr)
561 CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
562
563 if (!ptx_dev->ctx_shared)
564 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
565
566 free (ptx_dev);
567 return true;
568 }
569
570 static int
571 nvptx_get_num_devices (void)
572 {
573 int n;
574
575 /* This function will be called before the plugin has been initialized in
576 order to enumerate available devices, but CUDA API routines can't be used
577 until cuInit has been called. Just call it now (but don't yet do any
578 further initialization). */
579 if (instantiated_devices == 0)
580 {
581 if (!init_cuda_lib ())
582 return 0;
583 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
584 /* This is not an error: e.g. we may have CUDA libraries installed but
585 no devices available. */
586 if (r != CUDA_SUCCESS)
587 {
588 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
589 cuda_error (r));
590 return 0;
591 }
592 }
593
594 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
595 return n;
596 }
597
598 static void
599 notify_var (const char *var_name, const char *env_var)
600 {
601 if (env_var == NULL)
602 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
603 else
604 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
605 }
606
607 static void
608 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
609 {
610 const char *var_name = "GOMP_NVPTX_JIT";
611 const char *env_var = secure_getenv (var_name);
612 notify_var (var_name, env_var);
613
614 if (env_var == NULL)
615 return;
616
617 const char *c = env_var;
618 while (*c != '\0')
619 {
620 while (*c == ' ')
621 c++;
622
623 if (c[0] == '-' && c[1] == 'O'
624 && '0' <= c[2] && c[2] <= '4'
625 && (c[3] == '\0' || c[3] == ' '))
626 {
627 *gomp_nvptx_o = c[2] - '0';
628 c += 3;
629 continue;
630 }
631
632 GOMP_PLUGIN_error ("Error parsing %s", var_name);
633 break;
634 }
635 }
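/* Example (assuming a shell environment): running the offloading program
   with 'GOMP_NVPTX_JIT=-O0' in the environment makes link_ptx below pass
   CU_JIT_OPTIMIZATION_LEVEL 0 to the PTX JIT; levels -O0 through -O4 are
   accepted, and anything else is diagnosed as a parse error.  */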
636
637 static bool
638 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
639 unsigned num_objs)
640 {
641 CUjit_option opts[7];
642 void *optvals[7];
643 float elapsed = 0.0;
644 char elog[1024];
645 char ilog[16384];
646 CUlinkState linkstate;
647 CUresult r;
648 void *linkout;
649 size_t linkoutsize __attribute__ ((unused));
650
651 opts[0] = CU_JIT_WALL_TIME;
652 optvals[0] = &elapsed;
653
654 opts[1] = CU_JIT_INFO_LOG_BUFFER;
655 optvals[1] = &ilog[0];
656
657 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
658 optvals[2] = (void *) sizeof ilog;
659
660 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
661 optvals[3] = &elog[0];
662
663 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
664 optvals[4] = (void *) sizeof elog;
665
666 opts[5] = CU_JIT_LOG_VERBOSE;
667 optvals[5] = (void *) 1;
668
669 static intptr_t gomp_nvptx_o = -1;
670
671 static bool init_done = false;
672 if (!init_done)
673 {
674 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
675 init_done = true;
676 }
677
678 int nopts = 6;
679 if (gomp_nvptx_o != -1)
680 {
681 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
682 optvals[nopts] = (void *) gomp_nvptx_o;
683 nopts++;
684 }
685
686 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
687 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
688 else
689 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
690
691 for (; num_objs--; ptx_objs++)
692 {
693 /* cuLinkAddData's 'data' argument erroneously omits the const
694 qualifier. */
695 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
696 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
697 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
698 (char *) ptx_objs->code, ptx_objs->size,
699 0, 0, 0, 0);
700 else
701 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
702 (char *) ptx_objs->code, ptx_objs->size,
703 0, 0, 0, 0);
704 if (r != CUDA_SUCCESS)
705 {
706 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
707 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
708 cuda_error (r));
709 return false;
710 }
711 }
712
713 GOMP_PLUGIN_debug (0, "Linking\n");
714 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
715
716 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
717 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
718
719 if (r != CUDA_SUCCESS)
720 {
721 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
722 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
723 return false;
724 }
725
726 CUDA_CALL (cuModuleLoadData, module, linkout);
727 CUDA_CALL (cuLinkDestroy, linkstate);
728 return true;
729 }
730
731 static void
732 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
733 unsigned *dims, void *targ_mem_desc,
734 CUdeviceptr dp, CUstream stream)
735 {
736 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
737 CUfunction function;
738 int i;
739 void *kargs[1];
740 struct nvptx_thread *nvthd = nvptx_thread ();
741 int warp_size = nvthd->ptx_dev->warp_size;
742
743 function = targ_fn->fn;
744
745 /* Initialize the launch dimensions. Typically this is constant,
746 provided by the device compiler, but we must permit runtime
747 values. */
748 int seen_zero = 0;
749 for (i = 0; i != GOMP_DIM_MAX; i++)
750 {
751 if (targ_fn->launch->dim[i])
752 dims[i] = targ_fn->launch->dim[i];
753 if (!dims[i])
754 seen_zero = 1;
755 }
756
757 if (seen_zero)
758 {
759 pthread_mutex_lock (&ptx_dev_lock);
760
761 static int gomp_openacc_dims[GOMP_DIM_MAX];
762 if (!gomp_openacc_dims[0])
763 {
764 /* See if the user provided GOMP_OPENACC_DIM environment
765 variable to specify runtime defaults. */
766 for (int i = 0; i < GOMP_DIM_MAX; ++i)
767 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
768 }
769
770 if (!nvthd->ptx_dev->default_dims[0])
771 {
772 int default_dims[GOMP_DIM_MAX];
773 for (int i = 0; i < GOMP_DIM_MAX; ++i)
774 default_dims[i] = gomp_openacc_dims[i];
775
776 int gang, worker, vector;
777 {
778 int block_size = nvthd->ptx_dev->max_threads_per_block;
779 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
780 int dev_size = nvthd->ptx_dev->num_sms;
781 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
782 " dev_size=%d, cpu_size=%d\n",
783 warp_size, block_size, dev_size, cpu_size);
784
785 gang = (cpu_size / block_size) * dev_size;
786 worker = block_size / warp_size;
787 vector = warp_size;
788 }
789
790 /* There is no upper bound on the gang size. The best size
791 matches the hardware configuration. Logical gangs are
792 scheduled onto physical hardware. To maximize usage, we
793 should guess a large number. */
794 if (default_dims[GOMP_DIM_GANG] < 1)
795 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
796 /* The worker size must not exceed the hardware. */
797 if (default_dims[GOMP_DIM_WORKER] < 1
798 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
799 default_dims[GOMP_DIM_WORKER] = worker;
800 /* The vector size must exactly match the hardware. */
801 if (default_dims[GOMP_DIM_VECTOR] < 1
802 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
803 default_dims[GOMP_DIM_VECTOR] = vector;
804
805 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
806 default_dims[GOMP_DIM_GANG],
807 default_dims[GOMP_DIM_WORKER],
808 default_dims[GOMP_DIM_VECTOR]);
809
810 for (i = 0; i != GOMP_DIM_MAX; i++)
811 nvthd->ptx_dev->default_dims[i] = default_dims[i];
812 }
813 pthread_mutex_unlock (&ptx_dev_lock);
814
815 {
816 bool default_dim_p[GOMP_DIM_MAX];
817 for (i = 0; i != GOMP_DIM_MAX; i++)
818 default_dim_p[i] = !dims[i];
819
820 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
821 {
822 for (i = 0; i != GOMP_DIM_MAX; i++)
823 if (default_dim_p[i])
824 dims[i] = nvthd->ptx_dev->default_dims[i];
825
826 if (default_dim_p[GOMP_DIM_VECTOR])
827 dims[GOMP_DIM_VECTOR]
828 = MIN (dims[GOMP_DIM_VECTOR],
829 (targ_fn->max_threads_per_block / warp_size
830 * warp_size));
831
832 if (default_dim_p[GOMP_DIM_WORKER])
833 dims[GOMP_DIM_WORKER]
834 = MIN (dims[GOMP_DIM_WORKER],
835 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
836 }
837 else
838 {
839 /* Handle the case that the compiler allows the runtime to choose
840 the vector-length conservatively, by ignoring
841 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
842 it. */
843 int vectors = 0;
844 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
845 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
846 exceed targ_fn->max_threads_per_block. */
847 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
848 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
849 int grids, blocks;
850
851 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
852 &blocks, function, NULL, 0,
853 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
854 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
855 "grid = %d, block = %d\n", grids, blocks);
856
857 /* Keep num_gangs proportional to the block size. In the case
858 where the block size is limited by shared memory or the
859 register file capacity, this keeps the runtime from
860 excessively over-assigning gangs to the multiprocessor
861 units, whose state would otherwise be swapped out even
862 more than necessary. The constant factor 2 is there to
863 prevent threads from idling when there is insufficient
864 work for them. */
865 if (gangs == 0)
866 gangs = 2 * grids * (blocks / warp_size);
867
868 if (vectors == 0)
869 vectors = warp_size;
870
871 if (workers == 0)
872 {
873 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
874 ? vectors
875 : dims[GOMP_DIM_VECTOR]);
876 workers = blocks / actual_vectors;
877 workers = MAX (workers, 1);
878 /* If we need a per-worker barrier ... . */
879 if (actual_vectors > 32)
880 /* Don't use more barriers than available. */
881 workers = MIN (workers, 15);
882 }
883
884 for (i = 0; i != GOMP_DIM_MAX; i++)
885 if (default_dim_p[i])
886 switch (i)
887 {
888 case GOMP_DIM_GANG: dims[i] = gangs; break;
889 case GOMP_DIM_WORKER: dims[i] = workers; break;
890 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
891 default: GOMP_PLUGIN_fatal ("invalid dim");
892 }
893 }
894 }
895 }
896
897 /* Check if the accelerator has sufficient hardware resources to
898 launch the offloaded kernel. */
899 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
900 > targ_fn->max_threads_per_block)
901 {
902 const char *msg
903 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
904 " with num_workers = %d and vector_length = %d"
905 "; "
906 "recompile the program with 'num_workers = x and vector_length = y'"
907 " on that offloaded region or '-fopenacc-dim=:x:y' where"
908 " x * y <= %d"
909 ".\n");
910 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
911 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
912 }
913
914 /* Check if the accelerator has sufficient barrier resources to
915 launch the offloaded kernel. */
916 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
917 {
918 const char *msg
919 = ("The Nvidia accelerator has insufficient barrier resources to launch"
920 " '%s' with num_workers = %d and vector_length = %d"
921 "; "
922 "recompile the program with 'num_workers = x' on that offloaded"
923 " region or '-fopenacc-dim=:x:' where x <= 15"
924 "; "
925 "or, recompile the program with 'vector_length = 32' on that"
926 " offloaded region or '-fopenacc-dim=::32'"
927 ".\n");
928 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
929 dims[GOMP_DIM_VECTOR]);
930 }
931
932 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
933 " gangs=%u, workers=%u, vectors=%u\n",
934 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
935 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
936
937 // OpenACC CUDA
938 //
939 // num_gangs nctaid.x
940 // num_workers ntid.y
941 // vector length ntid.x
942
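/* Concretely, for the cuLaunchKernel call below (illustration only):
   gridDim.x = num_gangs, blockDim.x = vector_length and
   blockDim.y = num_workers, so e.g. gangs=32, workers=4, vectors=32 launches
   32 CTAs of 32x4 = 128 threads each.  */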
943 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
944 acc_prof_info *prof_info = thr->prof_info;
945 acc_event_info enqueue_launch_event_info;
946 acc_api_info *api_info = thr->api_info;
947 bool profiling_p = __builtin_expect (prof_info != NULL, false);
948 if (profiling_p)
949 {
950 prof_info->event_type = acc_ev_enqueue_launch_start;
951
952 enqueue_launch_event_info.launch_event.event_type
953 = prof_info->event_type;
954 enqueue_launch_event_info.launch_event.valid_bytes
955 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
956 enqueue_launch_event_info.launch_event.parent_construct
957 = acc_construct_parallel;
958 enqueue_launch_event_info.launch_event.implicit = 1;
959 enqueue_launch_event_info.launch_event.tool_info = NULL;
960 enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
961 enqueue_launch_event_info.launch_event.num_gangs
962 = dims[GOMP_DIM_GANG];
963 enqueue_launch_event_info.launch_event.num_workers
964 = dims[GOMP_DIM_WORKER];
965 enqueue_launch_event_info.launch_event.vector_length
966 = dims[GOMP_DIM_VECTOR];
967
968 api_info->device_api = acc_device_api_cuda;
969
970 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
971 api_info);
972 }
973
974 kargs[0] = &dp;
975 CUDA_CALL_ASSERT (cuLaunchKernel, function,
976 dims[GOMP_DIM_GANG], 1, 1,
977 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
978 0, stream, kargs, 0);
979
980 if (profiling_p)
981 {
982 prof_info->event_type = acc_ev_enqueue_launch_end;
983 enqueue_launch_event_info.launch_event.event_type
984 = prof_info->event_type;
985 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
986 api_info);
987 }
988
989 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
990 targ_fn->launch->fn);
991 }
992
993 void * openacc_get_current_cuda_context (void);
994
995 static void
996 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
997 {
998 acc_prof_info *prof_info = thr->prof_info;
999 acc_event_info data_event_info;
1000 acc_api_info *api_info = thr->api_info;
1001
1002 prof_info->event_type = acc_ev_alloc;
1003
1004 data_event_info.data_event.event_type = prof_info->event_type;
1005 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1006 data_event_info.data_event.parent_construct = acc_construct_parallel;
1007 data_event_info.data_event.implicit = 1;
1008 data_event_info.data_event.tool_info = NULL;
1009 data_event_info.data_event.var_name = NULL;
1010 data_event_info.data_event.bytes = s;
1011 data_event_info.data_event.host_ptr = NULL;
1012 data_event_info.data_event.device_ptr = dp;
1013
1014 api_info->device_api = acc_device_api_cuda;
1015
1016 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1017 }
1018
1019 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1020 size threshold, or if FORCE is true. */
1021
1022 static void
1023 nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1024 {
1025 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1026 if (ptx_dev->omp_stacks.ptr
1027 && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1028 {
1029 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1030 if (r != CUDA_SUCCESS)
1031 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1032 ptx_dev->omp_stacks.ptr = 0;
1033 ptx_dev->omp_stacks.size = 0;
1034 }
1035 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1036 }
1037
1038 static void *
1039 nvptx_alloc (size_t s, bool suppress_errors)
1040 {
1041 CUdeviceptr d;
1042
1043 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
1044 if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1045 return NULL;
1046 else if (r != CUDA_SUCCESS)
1047 {
1048 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1049 return NULL;
1050 }
1051
1052 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
1053 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1054 bool profiling_p
1055 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1056 if (profiling_p)
1057 goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1058
1059 return (void *) d;
1060 }
1061
1062 static void
1063 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1064 {
1065 acc_prof_info *prof_info = thr->prof_info;
1066 acc_event_info data_event_info;
1067 acc_api_info *api_info = thr->api_info;
1068
1069 prof_info->event_type = acc_ev_free;
1070
1071 data_event_info.data_event.event_type = prof_info->event_type;
1072 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1073 data_event_info.data_event.parent_construct = acc_construct_parallel;
1074 data_event_info.data_event.implicit = 1;
1075 data_event_info.data_event.tool_info = NULL;
1076 data_event_info.data_event.var_name = NULL;
1077 data_event_info.data_event.bytes = -1;
1078 data_event_info.data_event.host_ptr = NULL;
1079 data_event_info.data_event.device_ptr = p;
1080
1081 api_info->device_api = acc_device_api_cuda;
1082
1083 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1084 }
1085
1086 static bool
1087 nvptx_free (void *p, struct ptx_device *ptx_dev)
1088 {
1089 CUdeviceptr pb;
1090 size_t ps;
1091
1092 CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1093 (CUdeviceptr) p);
1094 if (r == CUDA_ERROR_NOT_PERMITTED)
1095 {
1096 /* We assume that this error indicates we are in a CUDA callback context,
1097 where no CUDA calls are allowed (see the cuStreamAddCallback
1098 documentation for details). Arrange to free this piece of device
1099 memory later. */
1100 struct ptx_free_block *n
1101 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1102 n->ptr = p;
1103 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1104 n->next = ptx_dev->free_blocks;
1105 ptx_dev->free_blocks = n;
1106 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1107 return true;
1108 }
1109 else if (r != CUDA_SUCCESS)
1110 {
1111 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1112 return false;
1113 }
1114 if ((CUdeviceptr) p != pb)
1115 {
1116 GOMP_PLUGIN_error ("invalid device address");
1117 return false;
1118 }
1119
1120 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1121 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1122 bool profiling_p
1123 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1124 if (profiling_p)
1125 goacc_profiling_acc_ev_free (thr, p);
1126
1127 return true;
1128 }
1129
1130 static void *
1131 nvptx_get_current_cuda_device (void)
1132 {
1133 struct nvptx_thread *nvthd = nvptx_thread ();
1134
1135 if (!nvthd || !nvthd->ptx_dev)
1136 return NULL;
1137
1138 return &nvthd->ptx_dev->dev;
1139 }
1140
1141 static void *
1142 nvptx_get_current_cuda_context (void)
1143 {
1144 struct nvptx_thread *nvthd = nvptx_thread ();
1145
1146 if (!nvthd || !nvthd->ptx_dev)
1147 return NULL;
1148
1149 return nvthd->ptx_dev->ctx;
1150 }
1151
1152 /* Plugin entry points. */
1153
1154 const char *
1155 GOMP_OFFLOAD_get_name (void)
1156 {
1157 return "nvptx";
1158 }
1159
1160 unsigned int
1161 GOMP_OFFLOAD_get_caps (void)
1162 {
1163 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1164 }
1165
1166 int
1167 GOMP_OFFLOAD_get_type (void)
1168 {
1169 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1170 }
1171
1172 int
1173 GOMP_OFFLOAD_get_num_devices (void)
1174 {
1175 return nvptx_get_num_devices ();
1176 }
1177
1178 bool
1179 GOMP_OFFLOAD_init_device (int n)
1180 {
1181 struct ptx_device *dev;
1182
1183 pthread_mutex_lock (&ptx_dev_lock);
1184
1185 if (!nvptx_init () || ptx_devices[n] != NULL)
1186 {
1187 pthread_mutex_unlock (&ptx_dev_lock);
1188 return false;
1189 }
1190
1191 dev = nvptx_open_device (n);
1192 if (dev)
1193 {
1194 ptx_devices[n] = dev;
1195 instantiated_devices++;
1196 }
1197
1198 pthread_mutex_unlock (&ptx_dev_lock);
1199
1200 return dev != NULL;
1201 }
1202
1203 bool
1204 GOMP_OFFLOAD_fini_device (int n)
1205 {
1206 pthread_mutex_lock (&ptx_dev_lock);
1207
1208 if (ptx_devices[n] != NULL)
1209 {
1210 if (!nvptx_attach_host_thread_to_device (n)
1211 || !nvptx_close_device (ptx_devices[n]))
1212 {
1213 pthread_mutex_unlock (&ptx_dev_lock);
1214 return false;
1215 }
1216 ptx_devices[n] = NULL;
1217 instantiated_devices--;
1218 }
1219
1220 if (instantiated_devices == 0)
1221 {
1222 free (ptx_devices);
1223 ptx_devices = NULL;
1224 }
1225
1226 pthread_mutex_unlock (&ptx_dev_lock);
1227 return true;
1228 }
1229
1230 /* Return the libgomp version number we're compatible with. There is
1231 no requirement for cross-version compatibility. */
1232
1233 unsigned
1234 GOMP_OFFLOAD_version (void)
1235 {
1236 return GOMP_VERSION;
1237 }
1238
1239 /* Initialize __nvptx_clocktick, if present in MODULE. */
1240
1241 static void
1242 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1243 {
1244 CUdeviceptr dptr;
1245 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1246 module, "__nvptx_clocktick");
1247 if (r == CUDA_ERROR_NOT_FOUND)
1248 return;
1249 if (r != CUDA_SUCCESS)
1250 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1251 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1252 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1253 sizeof (__nvptx_clocktick));
1254 if (r != CUDA_SUCCESS)
1255 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1256 }
1257
1258 /* Load the (partial) program described by TARGET_DATA to device
1259 number ORD. Allocate and return TARGET_TABLE. */
1260
1261 int
1262 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1263 struct addr_pair **target_table)
1264 {
1265 CUmodule module;
1266 const char *const *var_names;
1267 const struct targ_fn_launch *fn_descs;
1268 unsigned int fn_entries, var_entries, i, j;
1269 struct targ_fn_descriptor *targ_fns;
1270 struct addr_pair *targ_tbl;
1271 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1272 struct ptx_image_data *new_image;
1273 struct ptx_device *dev;
1274
1275 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1276 {
1277 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1278 " (expected %u, received %u)",
1279 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1280 return -1;
1281 }
1282
1283 if (!nvptx_attach_host_thread_to_device (ord)
1284 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1285 return -1;
1286
1287 dev = ptx_devices[ord];
1288
1289 /* The mkoffload utility emits a struct of pointers/integers at the
1290 start of each offload image. The array of kernel names and the
1291 function addresses form a one-to-one correspondence. */
1292
1293 var_entries = img_header->var_num;
1294 var_names = img_header->var_names;
1295 fn_entries = img_header->fn_num;
1296 fn_descs = img_header->fn_descs;
1297
1298 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1299 * (fn_entries + var_entries));
1300 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1301 * fn_entries);
1302
1303 *target_table = targ_tbl;
1304
1305 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1306 new_image->target_data = target_data;
1307 new_image->module = module;
1308 new_image->fns = targ_fns;
1309
1310 pthread_mutex_lock (&dev->image_lock);
1311 new_image->next = dev->images;
1312 dev->images = new_image;
1313 pthread_mutex_unlock (&dev->image_lock);
1314
1315 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1316 {
1317 CUfunction function;
1318 int nregs, mthrs;
1319
1320 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1321 fn_descs[i].fn);
1322 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1323 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1324 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1325 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1326
1327 targ_fns->fn = function;
1328 targ_fns->launch = &fn_descs[i];
1329 targ_fns->regs_per_thread = nregs;
1330 targ_fns->max_threads_per_block = mthrs;
1331
1332 targ_tbl->start = (uintptr_t) targ_fns;
1333 targ_tbl->end = targ_tbl->start + 1;
1334 }
1335
1336 for (j = 0; j < var_entries; j++, targ_tbl++)
1337 {
1338 CUdeviceptr var;
1339 size_t bytes;
1340
1341 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1342 &var, &bytes, module, var_names[j]);
1343
1344 targ_tbl->start = (uintptr_t) var;
1345 targ_tbl->end = targ_tbl->start + bytes;
1346 }
1347
1348 nvptx_set_clocktick (module, dev);
1349
1350 return fn_entries + var_entries;
1351 }
1352
1353 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1354 function descriptors allocated by GOMP_OFFLOAD_load_image. */
1355
1356 bool
1357 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1358 {
1359 struct ptx_image_data *image, **prev_p;
1360 struct ptx_device *dev = ptx_devices[ord];
1361
1362 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1363 {
1364 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1365 " (expected %u, received %u)",
1366 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1367 return false;
1368 }
1369
1370 bool ret = true;
1371 pthread_mutex_lock (&dev->image_lock);
1372 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1373 if (image->target_data == target_data)
1374 {
1375 *prev_p = image->next;
1376 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1377 ret = false;
1378 free (image->fns);
1379 free (image);
1380 break;
1381 }
1382 pthread_mutex_unlock (&dev->image_lock);
1383 return ret;
1384 }
1385
1386 void *
1387 GOMP_OFFLOAD_alloc (int ord, size_t size)
1388 {
1389 if (!nvptx_attach_host_thread_to_device (ord))
1390 return NULL;
1391
1392 struct ptx_device *ptx_dev = ptx_devices[ord];
1393 struct ptx_free_block *blocks, *tmp;
1394
1395 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1396 blocks = ptx_dev->free_blocks;
1397 ptx_dev->free_blocks = NULL;
1398 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1399
1400 nvptx_stacks_free (ptx_dev, false);
1401
1402 while (blocks)
1403 {
1404 tmp = blocks->next;
1405 nvptx_free (blocks->ptr, ptx_dev);
1406 free (blocks);
1407 blocks = tmp;
1408 }
1409
1410 void *d = nvptx_alloc (size, true);
1411 if (d)
1412 return d;
1413 else
1414 {
1415 /* Memory allocation failed. Try freeing the stacks block, and
1416 retrying. */
1417 nvptx_stacks_free (ptx_dev, true);
1418 return nvptx_alloc (size, false);
1419 }
1420 }
1421
1422 bool
1423 GOMP_OFFLOAD_free (int ord, void *ptr)
1424 {
1425 return (nvptx_attach_host_thread_to_device (ord)
1426 && nvptx_free (ptr, ptx_devices[ord]));
1427 }
1428
1429 void
1430 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1431 void **hostaddrs, void **devaddrs,
1432 unsigned *dims, void *targ_mem_desc)
1433 {
1434 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1435
1436 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1437 acc_prof_info *prof_info = thr->prof_info;
1438 acc_event_info data_event_info;
1439 acc_api_info *api_info = thr->api_info;
1440 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1441
1442 void **hp = NULL;
1443 CUdeviceptr dp = 0;
1444
1445 if (mapnum > 0)
1446 {
1447 size_t s = mapnum * sizeof (void *);
1448 hp = alloca (s);
1449 for (int i = 0; i < mapnum; i++)
1450 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1451 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1452 if (profiling_p)
1453 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1454 }
1455
1456 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1457 fact have the same value on a unified-memory system). */
1458 if (mapnum > 0)
1459 {
1460 if (profiling_p)
1461 {
1462 prof_info->event_type = acc_ev_enqueue_upload_start;
1463
1464 data_event_info.data_event.event_type = prof_info->event_type;
1465 data_event_info.data_event.valid_bytes
1466 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1467 data_event_info.data_event.parent_construct
1468 = acc_construct_parallel;
1469 data_event_info.data_event.implicit = 1; /* Always implicit. */
1470 data_event_info.data_event.tool_info = NULL;
1471 data_event_info.data_event.var_name = NULL;
1472 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1473 data_event_info.data_event.host_ptr = hp;
1474 data_event_info.data_event.device_ptr = (const void *) dp;
1475
1476 api_info->device_api = acc_device_api_cuda;
1477
1478 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1479 api_info);
1480 }
1481 CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1482 mapnum * sizeof (void *));
1483 if (profiling_p)
1484 {
1485 prof_info->event_type = acc_ev_enqueue_upload_end;
1486 data_event_info.data_event.event_type = prof_info->event_type;
1487 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1488 api_info);
1489 }
1490 }
1491
1492 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1493 dp, NULL);
1494
1495 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1496 const char *maybe_abort_msg = "(perhaps abort was called)";
1497 if (r == CUDA_ERROR_LAUNCH_FAILED)
1498 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1499 maybe_abort_msg);
1500 else if (r != CUDA_SUCCESS)
1501 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1502
1503 CUDA_CALL_ASSERT (cuMemFree, dp);
1504 if (profiling_p)
1505 goacc_profiling_acc_ev_free (thr, (void *) dp);
1506 }
1507
1508 static void
1509 cuda_free_argmem (void *ptr)
1510 {
1511 void **block = (void **) ptr;
1512 nvptx_free (block[0], (struct ptx_device *) block[1]);
1513 free (block);
1514 }
1515
1516 void
1517 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
1518 void **hostaddrs, void **devaddrs,
1519 unsigned *dims, void *targ_mem_desc,
1520 struct goacc_asyncqueue *aq)
1521 {
1522 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1523
1524 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1525 acc_prof_info *prof_info = thr->prof_info;
1526 acc_event_info data_event_info;
1527 acc_api_info *api_info = thr->api_info;
1528 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1529
1530 void **hp = NULL;
1531 CUdeviceptr dp = 0;
1532 void **block = NULL;
1533
1534 if (mapnum > 0)
1535 {
1536 size_t s = mapnum * sizeof (void *);
1537 block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
1538 hp = block + 2;
1539 for (int i = 0; i < mapnum; i++)
1540 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1541 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1542 if (profiling_p)
1543 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1544 }
1545
1546 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1547 fact have the same value on a unified-memory system). */
1548 if (mapnum > 0)
1549 {
1550 if (profiling_p)
1551 {
1552 prof_info->event_type = acc_ev_enqueue_upload_start;
1553
1554 data_event_info.data_event.event_type = prof_info->event_type;
1555 data_event_info.data_event.valid_bytes
1556 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1557 data_event_info.data_event.parent_construct
1558 = acc_construct_parallel;
1559 data_event_info.data_event.implicit = 1; /* Always implicit. */
1560 data_event_info.data_event.tool_info = NULL;
1561 data_event_info.data_event.var_name = NULL;
1562 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1563 data_event_info.data_event.host_ptr = hp;
1564 data_event_info.data_event.device_ptr = (const void *) dp;
1565
1566 api_info->device_api = acc_device_api_cuda;
1567
1568 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1569 api_info);
1570 }
1571
1572 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1573 mapnum * sizeof (void *), aq->cuda_stream);
1574 block[0] = (void *) dp;
1575
1576 struct nvptx_thread *nvthd =
1577 (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1578 block[1] = (void *) nvthd->ptx_dev;
1579
1580 if (profiling_p)
1581 {
1582 prof_info->event_type = acc_ev_enqueue_upload_end;
1583 data_event_info.data_event.event_type = prof_info->event_type;
1584 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1585 api_info);
1586 }
1587 }
1588
1589 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1590 dp, aq->cuda_stream);
1591
1592 if (mapnum > 0)
1593 GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
1594 }
1595
1596 void *
1597 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1598 {
1599 struct ptx_device *ptx_dev;
1600 struct nvptx_thread *nvthd
1601 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1602 CUcontext thd_ctx;
1603
1604 ptx_dev = ptx_devices[ord];
1605
1606 assert (ptx_dev);
1607
1608 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1609
1610 assert (ptx_dev->ctx);
1611
1612 if (!thd_ctx)
1613 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1614
1615 nvthd->ptx_dev = ptx_dev;
1616
1617 return (void *) nvthd;
1618 }
1619
1620 void
1621 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1622 {
1623 free (data);
1624 }
1625
1626 void *
1627 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1628 {
1629 return nvptx_get_current_cuda_device ();
1630 }
1631
1632 void *
1633 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1634 {
1635 return nvptx_get_current_cuda_context ();
1636 }
1637
1638 /* This returns a CUstream. */
1639 void *
1640 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1641 {
1642 return (void *) aq->cuda_stream;
1643 }
1644
1645 /* This takes a CUstream. */
1646 int
1647 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1648 {
1649 if (aq->cuda_stream)
1650 {
1651 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1652 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1653 }
1654
1655 aq->cuda_stream = (CUstream) stream;
1656 return 1;
1657 }
1658
1659 struct goacc_asyncqueue *
1660 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1661 {
1662 CUstream stream = NULL;
1663 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1664
1665 struct goacc_asyncqueue *aq
1666 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1667 aq->cuda_stream = stream;
1668 return aq;
1669 }
1670
1671 bool
1672 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1673 {
1674 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1675 free (aq);
1676 return true;
1677 }
1678
1679 int
1680 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1681 {
1682 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1683 if (r == CUDA_SUCCESS)
1684 return 1;
1685 if (r == CUDA_ERROR_NOT_READY)
1686 return 0;
1687
1688 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1689 return -1;
1690 }
1691
1692 bool
1693 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1694 {
1695 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1696 return true;
1697 }
1698
1699 bool
1700 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1701 struct goacc_asyncqueue *aq2)
1702 {
1703 CUevent e;
1704 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1705 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1706 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1707 return true;
1708 }
1709
1710 static void
1711 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1712 {
1713 if (res != CUDA_SUCCESS)
1714 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1715 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1716 cb->fn (cb->ptr);
1717 free (ptr);
1718 }
1719
1720 void
1721 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1722 void (*callback_fn)(void *),
1723 void *userptr)
1724 {
1725 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1726 b->fn = callback_fn;
1727 b->ptr = userptr;
1728 b->aq = aq;
1729 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1730 cuda_callback_wrapper, (void *) b, 0);
1731 }
1732
1733 static bool
1734 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1735 {
1736 CUdeviceptr pb;
1737 size_t ps;
1738 if (!s)
1739 return true;
1740 if (!d)
1741 {
1742 GOMP_PLUGIN_error ("invalid device address");
1743 return false;
1744 }
1745 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1746 if (!pb)
1747 {
1748 GOMP_PLUGIN_error ("invalid device address");
1749 return false;
1750 }
1751 if (!h)
1752 {
1753 GOMP_PLUGIN_error ("invalid host address");
1754 return false;
1755 }
1756 if (d == h)
1757 {
1758 GOMP_PLUGIN_error ("invalid host or device address");
1759 return false;
1760 }
1761 if ((void *)(d + s) > (void *)(pb + ps))
1762 {
1763 GOMP_PLUGIN_error ("invalid size");
1764 return false;
1765 }
1766 return true;
1767 }
1768
1769 bool
1770 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1771 {
1772 if (!nvptx_attach_host_thread_to_device (ord)
1773 || !cuda_memcpy_sanity_check (src, dst, n))
1774 return false;
1775 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1776 return true;
1777 }
1778
1779 bool
1780 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1781 {
1782 if (!nvptx_attach_host_thread_to_device (ord)
1783 || !cuda_memcpy_sanity_check (dst, src, n))
1784 return false;
1785 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1786 return true;
1787 }
1788
1789 bool
1790 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1791 {
1792 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1793 return true;
1794 }
1795
1796 bool
1797 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1798 size_t n, struct goacc_asyncqueue *aq)
1799 {
1800 if (!nvptx_attach_host_thread_to_device (ord)
1801 || !cuda_memcpy_sanity_check (src, dst, n))
1802 return false;
1803 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1804 return true;
1805 }
1806
1807 bool
1808 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
1809 size_t n, struct goacc_asyncqueue *aq)
1810 {
1811 if (!nvptx_attach_host_thread_to_device (ord)
1812 || !cuda_memcpy_sanity_check (dst, src, n))
1813 return false;
1814 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1815 return true;
1816 }
1817
1818 union goacc_property_value
1819 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
1820 {
1821 union goacc_property_value propval = { .val = 0 };
1822
1823 pthread_mutex_lock (&ptx_dev_lock);
1824
1825 if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
1826 {
1827 pthread_mutex_unlock (&ptx_dev_lock);
1828 return propval;
1829 }
1830
1831 struct ptx_device *ptx_dev = ptx_devices[n];
1832 switch (prop)
1833 {
1834 case GOACC_PROPERTY_MEMORY:
1835 {
1836 size_t total_mem;
1837
1838 CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
1839 propval.val = total_mem;
1840 }
1841 break;
1842 case GOACC_PROPERTY_FREE_MEMORY:
1843 {
1844 size_t total_mem;
1845 size_t free_mem;
1846 CUdevice ctxdev;
1847
1848 CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
1849 if (ptx_dev->dev == ctxdev)
1850 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1851 else if (ptx_dev->ctx)
1852 {
1853 CUcontext old_ctx;
1854
1855 CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
1856 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1857 CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
1858 }
1859 else
1860 {
1861 CUcontext new_ctx;
1862
1863 CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
1864 ptx_dev->dev);
1865 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1866 CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
1867 }
1868 propval.val = free_mem;
1869 }
1870 break;
1871 case GOACC_PROPERTY_NAME:
1872 propval.ptr = ptx_dev->name;
1873 break;
1874 case GOACC_PROPERTY_VENDOR:
1875 propval.ptr = "Nvidia";
1876 break;
1877 case GOACC_PROPERTY_DRIVER:
1878 propval.ptr = cuda_driver_version_s;
1879 break;
1880 default:
1881 break;
1882 }
1883
1884 pthread_mutex_unlock (&ptx_dev_lock);
1885 return propval;
1886 }
1887
1888 /* Adjust launch dimensions: pick good values for number of blocks and warps
1889 and ensure that number of warps does not exceed CUDA limits as well as GCC's
1890 own limits. */
1891
1892 static void
1893 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
1894 struct ptx_device *ptx_dev,
1895 int *teams_p, int *threads_p)
1896 {
1897 int max_warps_block = fn->max_threads_per_block / 32;
1898 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
1899 and libgcc, which matches the documented limit of all GPUs as of 2015. */
1900 if (max_warps_block > 32)
1901 max_warps_block = 32;
1902 if (*threads_p <= 0)
1903 *threads_p = 8;
1904 if (*threads_p > max_warps_block)
1905 *threads_p = max_warps_block;
1906
1907 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
1908 /* This is an estimate of how many blocks the device can host simultaneously.
1909 The actual limit, which may be lower, can be queried with the "occupancy
1910 control" driver interface (since CUDA 6.0). */
1911 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
1912 if (*teams_p <= 0 || *teams_p > max_blocks)
1913 *teams_p = max_blocks;
1914 }
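/* Worked example with made-up figures: for a kernel with regs_per_thread = 32
   and *threads_p clamped to 8 warps, regs_per_block = 32 * 32 * 8 = 8192;
   on a device with regs_per_sm = 65536 and num_sms = 20 this yields
   max_blocks = (65536 / 8192) * 20 = 160 teams.  */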
1915
1916 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
1917 target regions. */
1918
1919 static size_t
1920 nvptx_stacks_size ()
1921 {
1922 return 128 * 1024;
1923 }
1924
1925 /* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
1926 the storage should be held on entry, and remains held on exit. */
1927
1928 static void *
1929 nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
1930 {
1931 if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
1932 return (void *) ptx_dev->omp_stacks.ptr;
1933
1934 /* Free the old, too-small stacks. */
1935 if (ptx_dev->omp_stacks.ptr)
1936 {
1937 CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1938 if (r != CUDA_SUCCESS)
1939 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
1940 r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1941 if (r != CUDA_SUCCESS)
1942 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1943 }
1944
1945 /* Make new and bigger stacks, and remember where we put them and how big
1946 they are. */
1947 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
1948 size * num);
1949 if (r != CUDA_SUCCESS)
1950 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1951
1952 ptx_dev->omp_stacks.size = size * num;
1953
1954 return (void *) ptx_dev->omp_stacks.ptr;
1955 }
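/* For instance (illustrative figures): a launch with teams = 160 and
   threads = 8 requests 160 * 8 = 1280 per-warp stacks of 128 KiB each,
   i.e. 160 MiB.  As that exceeds SOFTSTACK_CACHE_LIMIT, the block would be
   released again by the next nvptx_stacks_free call from GOMP_OFFLOAD_alloc
   rather than kept cached for a later launch.  */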
1956
1957 void
1958 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
1959 {
1960 struct targ_fn_descriptor *tgt_fn_desc
1961 = (struct targ_fn_descriptor *) tgt_fn;
1962 CUfunction function = tgt_fn_desc->fn;
1963 const struct targ_fn_launch *launch = tgt_fn_desc->launch;
1964 const char *fn_name = launch->fn;
1965 CUresult r;
1966 struct ptx_device *ptx_dev = ptx_devices[ord];
1967 const char *maybe_abort_msg = "(perhaps abort was called)";
1968 int teams = 0, threads = 0;
1969
1970 if (!args)
1971 GOMP_PLUGIN_fatal ("No target arguments provided");
1972 while (*args)
1973 {
1974 intptr_t id = (intptr_t) *args++, val;
1975 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1976 val = (intptr_t) *args++;
1977 else
1978 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1979 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
1980 continue;
1981 val = val > INT_MAX ? INT_MAX : val;
1982 id &= GOMP_TARGET_ARG_ID_MASK;
1983 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
1984 teams = val;
1985 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
1986 threads = val;
1987 }
1988 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
1989
1990 size_t stack_size = nvptx_stacks_size ();
1991
1992 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1993 void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
1994 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
1995 size_t fn_args_size = sizeof fn_args;
1996 void *config[] = {
1997 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
1998 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
1999 CU_LAUNCH_PARAM_END
2000 };
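/* The kernel arguments are passed through cuLaunchKernel's 'extra' mechanism
   (CU_LAUNCH_PARAM_BUFFER_POINTER/_SIZE) rather than via kernelParams, which
   is why the kernelParams argument in the call below is NULL.  */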
2001 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
2002 " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2003 __FUNCTION__, fn_name, teams, threads);
2004 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2005 32, threads, 1, 0, NULL, NULL, config);
2006 if (r != CUDA_SUCCESS)
2007 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2008
2009 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2010 if (r == CUDA_ERROR_LAUNCH_FAILED)
2011 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2012 maybe_abort_msg);
2013 else if (r != CUDA_SUCCESS)
2014 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2015
2016 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
2017 }
2018
2019 /* TODO: Implement GOMP_OFFLOAD_async_run. */