1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2017 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* Nvidia PTX-specific parts of OpenACC support. The CUDA driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another. */
33
34 #include "openacc.h"
35 #include "config.h"
36 #include "libgomp-plugin.h"
37 #include "oacc-plugin.h"
38 #include "gomp-constants.h"
39
40 #include <pthread.h>
41 #include <cuda.h>
42 #include <stdbool.h>
43 #include <stdint.h>
44 #include <limits.h>
45 #include <string.h>
46 #include <stdio.h>
47 #include <unistd.h>
48 #include <assert.h>
49 #include <errno.h>
50
51 #if PLUGIN_NVPTX_DYNAMIC
52 # include <dlfcn.h>
53
54 # define CUDA_CALLS \
55 CUDA_ONE_CALL (cuCtxCreate) \
56 CUDA_ONE_CALL (cuCtxDestroy) \
57 CUDA_ONE_CALL (cuCtxGetCurrent) \
58 CUDA_ONE_CALL (cuCtxGetDevice) \
59 CUDA_ONE_CALL (cuCtxPopCurrent) \
60 CUDA_ONE_CALL (cuCtxPushCurrent) \
61 CUDA_ONE_CALL (cuCtxSynchronize) \
62 CUDA_ONE_CALL (cuDeviceGet) \
63 CUDA_ONE_CALL (cuDeviceGetAttribute) \
64 CUDA_ONE_CALL (cuDeviceGetCount) \
65 CUDA_ONE_CALL (cuEventCreate) \
66 CUDA_ONE_CALL (cuEventDestroy) \
67 CUDA_ONE_CALL (cuEventElapsedTime) \
68 CUDA_ONE_CALL (cuEventQuery) \
69 CUDA_ONE_CALL (cuEventRecord) \
70 CUDA_ONE_CALL (cuEventSynchronize) \
71 CUDA_ONE_CALL (cuFuncGetAttribute) \
72 CUDA_ONE_CALL (cuGetErrorString) \
73 CUDA_ONE_CALL (cuInit) \
74 CUDA_ONE_CALL (cuLaunchKernel) \
75 CUDA_ONE_CALL (cuLinkAddData) \
76 CUDA_ONE_CALL (cuLinkComplete) \
77 CUDA_ONE_CALL (cuLinkCreate) \
78 CUDA_ONE_CALL (cuLinkDestroy) \
79 CUDA_ONE_CALL (cuMemAlloc) \
80 CUDA_ONE_CALL (cuMemAllocHost) \
81 CUDA_ONE_CALL (cuMemcpy) \
82 CUDA_ONE_CALL (cuMemcpyDtoDAsync) \
83 CUDA_ONE_CALL (cuMemcpyDtoH) \
84 CUDA_ONE_CALL (cuMemcpyDtoHAsync) \
85 CUDA_ONE_CALL (cuMemcpyHtoD) \
86 CUDA_ONE_CALL (cuMemcpyHtoDAsync) \
87 CUDA_ONE_CALL (cuMemFree) \
88 CUDA_ONE_CALL (cuMemFreeHost) \
89 CUDA_ONE_CALL (cuMemGetAddressRange) \
90 CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
91 CUDA_ONE_CALL (cuModuleGetFunction) \
92 CUDA_ONE_CALL (cuModuleGetGlobal) \
93 CUDA_ONE_CALL (cuModuleLoad) \
94 CUDA_ONE_CALL (cuModuleLoadData) \
95 CUDA_ONE_CALL (cuModuleUnload) \
96 CUDA_ONE_CALL (cuStreamCreate) \
97 CUDA_ONE_CALL (cuStreamDestroy) \
98 CUDA_ONE_CALL (cuStreamQuery) \
99 CUDA_ONE_CALL (cuStreamSynchronize) \
100 CUDA_ONE_CALL (cuStreamWaitEvent)
101 # define CUDA_ONE_CALL(call) \
102 __typeof (call) *call;
103 struct cuda_lib_s {
104 CUDA_CALLS
105 } cuda_lib;
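/* CUDA_CALLS is an X-macro list: it is expanded above, with CUDA_ONE_CALL
declaring one function-pointer field per driver entry point, and expanded
again inside init_cuda_lib below, where CUDA_ONE_CALL is redefined to dlsym
each symbol into the corresponding cuda_lib field. */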
106
107 /* -1 if init_cuda_lib has not been called yet, false
108 if it has been and failed, true if it has been and succeeded. */
109 static char cuda_lib_inited = -1;
110
111 /* Dynamically load the CUDA driver library and initialize its function
112 pointers; return true if successful, false otherwise. */
113 static bool
114 init_cuda_lib (void)
115 {
116 if (cuda_lib_inited != -1)
117 return cuda_lib_inited;
118 const char *cuda_runtime_lib = "libcuda.so.1";
119 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
120 cuda_lib_inited = false;
121 if (h == NULL)
122 return false;
123 # undef CUDA_ONE_CALL
124 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
125 # define CUDA_ONE_CALL_1(call) \
126 cuda_lib.call = dlsym (h, #call); \
127 if (cuda_lib.call == NULL) \
128 return false;
129 CUDA_CALLS
130 cuda_lib_inited = true;
131 return true;
132 }
133 # undef CUDA_ONE_CALL
134 # undef CUDA_ONE_CALL_1
135 # define CUDA_CALL_PREFIX cuda_lib.
136 #else
137 # define CUDA_CALL_PREFIX
138 # define init_cuda_lib() true
139 #endif
140
141 /* Convenience macros for the frequently used sequence of a CUDA library
142 call followed by error handling, as well as for CUDA library calls whose
143 callers do the error checking themselves or skip it entirely. */
144
145 #define CUDA_CALL_ERET(ERET, FN, ...) \
146 do { \
147 unsigned __r \
148 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
149 if (__r != CUDA_SUCCESS) \
150 { \
151 GOMP_PLUGIN_error (#FN " error: %s", \
152 cuda_error (__r)); \
153 return ERET; \
154 } \
155 } while (0)
156
157 #define CUDA_CALL(FN, ...) \
158 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
159
160 #define CUDA_CALL_ASSERT(FN, ...) \
161 do { \
162 unsigned __r \
163 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
164 if (__r != CUDA_SUCCESS) \
165 { \
166 GOMP_PLUGIN_fatal (#FN " error: %s", \
167 cuda_error (__r)); \
168 } \
169 } while (0)
170
171 #define CUDA_CALL_NOCHECK(FN, ...) \
172 CUDA_CALL_PREFIX FN (__VA_ARGS__)
173
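/* Example usage (illustrative only, not from the original source): inside a
bool-returning function a driver call is typically written as

     CUDA_CALL (cuMemAlloc, &d, s);

which performs the call through CUDA_CALL_PREFIX (the plain symbol, or the
dlopen'ed cuda_lib.cuMemAlloc pointer when PLUGIN_NVPTX_DYNAMIC is set),
reports any failure via GOMP_PLUGIN_error and makes the enclosing function
return false. */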
174 static const char *
175 cuda_error (CUresult r)
176 {
177 #if CUDA_VERSION < 7000
178 /* Specified in documentation and present in library from at least
179 5.5. Not declared in header file prior to 7.0. */
180 extern CUresult cuGetErrorString (CUresult, const char **);
181 #endif
182 const char *desc;
183
184 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
185 if (r != CUDA_SUCCESS)
186 desc = "unknown cuda error";
187
188 return desc;
189 }
190
191 static unsigned int instantiated_devices = 0;
192 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
193
194 struct ptx_stream
195 {
196 CUstream stream;
197 pthread_t host_thread;
198 bool multithreaded;
199
200 CUdeviceptr d;
201 void *h;
202 void *h_begin;
203 void *h_end;
204 void *h_next;
205 void *h_prev;
206 void *h_tail;
207
208 struct ptx_stream *next;
209 };
210
211 /* Thread-specific data for PTX. */
212
213 struct nvptx_thread
214 {
215 struct ptx_stream *current_stream;
216 struct ptx_device *ptx_dev;
217 };
218
219 struct map
220 {
221 int async;
222 size_t size;
223 char mappings[0];
224 };
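/* Each stream owns one page of page-locked host memory (map_init), mapped
into the device address space as well. map_push carves a chunk out of it in
ring-buffer fashion: a struct map header followed by SIZE bytes used for the
kernel argument pointers, returning both the host and the device address of
the payload. map_pop retires the oldest chunk once the corresponding kernel
has completed (either synchronously in nvptx_exec or from event_gc). */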
225
226 static bool
227 map_init (struct ptx_stream *s)
228 {
229 int size = getpagesize ();
230
231 assert (s);
232 assert (!s->d);
233 assert (!s->h);
234
235 CUDA_CALL (cuMemAllocHost, &s->h, size);
236 CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
237
238 assert (s->h);
239
240 s->h_begin = s->h;
241 s->h_end = s->h_begin + size;
242 s->h_next = s->h_prev = s->h_tail = s->h_begin;
243
244 assert (s->h_next);
245 assert (s->h_end);
246 return true;
247 }
248
249 static bool
250 map_fini (struct ptx_stream *s)
251 {
252 CUDA_CALL (cuMemFreeHost, s->h);
253 return true;
254 }
255
256 static void
257 map_pop (struct ptx_stream *s)
258 {
259 struct map *m;
260
261 assert (s != NULL);
262 assert (s->h_next);
263 assert (s->h_prev);
264 assert (s->h_tail);
265
266 m = s->h_tail;
267
268 s->h_tail += m->size;
269
270 if (s->h_tail >= s->h_end)
271 s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
272
273 if (s->h_next == s->h_tail)
274 s->h_prev = s->h_next;
275
276 assert (s->h_next >= s->h_begin);
277 assert (s->h_tail >= s->h_begin);
278 assert (s->h_prev >= s->h_begin);
279
280 assert (s->h_next <= s->h_end);
281 assert (s->h_tail <= s->h_end);
282 assert (s->h_prev <= s->h_end);
283 }
284
285 static void
286 map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
287 {
288 int left;
289 int offset;
290 struct map *m;
291
292 assert (s != NULL);
293
294 left = s->h_end - s->h_next;
295 size += sizeof (struct map);
296
297 assert (s->h_prev);
298 assert (s->h_next);
299
300 if (size >= left)
301 {
302 m = s->h_prev;
303 m->size += left;
304 s->h_next = s->h_begin;
305
306 if (s->h_next + size > s->h_end)
307 GOMP_PLUGIN_fatal ("unable to push map");
308 }
309
310 assert (s->h_next);
311
312 m = s->h_next;
313 m->async = async;
314 m->size = size;
315
316 offset = (void *)&m->mappings[0] - s->h;
317
318 *d = (void *)(s->d + offset);
319 *h = (void *)(s->h + offset);
320
321 s->h_prev = s->h_next;
322 s->h_next += size;
323
324 assert (s->h_prev);
325 assert (s->h_next);
326
327 assert (s->h_next >= s->h_begin);
328 assert (s->h_tail >= s->h_begin);
329 assert (s->h_prev >= s->h_begin);
330 assert (s->h_next <= s->h_end);
331 assert (s->h_tail <= s->h_end);
332 assert (s->h_prev <= s->h_end);
333
334 return;
335 }
336
337 /* Target data function launch information. */
338
339 struct targ_fn_launch
340 {
341 const char *fn;
342 unsigned short dim[GOMP_DIM_MAX];
343 };
344
345 /* Target PTX object information. */
346
347 struct targ_ptx_obj
348 {
349 const char *code;
350 size_t size;
351 };
352
353 /* Target data image information. */
354
355 typedef struct nvptx_tdata
356 {
357 const struct targ_ptx_obj *ptx_objs;
358 unsigned ptx_num;
359
360 const char *const *var_names;
361 unsigned var_num;
362
363 const struct targ_fn_launch *fn_descs;
364 unsigned fn_num;
365 } nvptx_tdata_t;
366
367 /* Descriptor of a loaded function. */
368
369 struct targ_fn_descriptor
370 {
371 CUfunction fn;
372 const struct targ_fn_launch *launch;
373 int regs_per_thread;
374 int max_threads_per_block;
375 };
376
377 /* A loaded PTX image. */
378 struct ptx_image_data
379 {
380 const void *target_data;
381 CUmodule module;
382
383 struct targ_fn_descriptor *fns; /* Array of functions. */
384
385 struct ptx_image_data *next;
386 };
387
388 struct ptx_device
389 {
390 CUcontext ctx;
391 bool ctx_shared;
392 CUdevice dev;
393 struct ptx_stream *null_stream;
394 /* All non-null streams associated with this device (actually context),
395 either created implicitly or passed in from the user (via
396 acc_set_cuda_stream). */
397 struct ptx_stream *active_streams;
398 struct {
399 struct ptx_stream **arr;
400 int size;
401 } async_streams;
402 /* A lock for use when manipulating the above stream list and array. */
403 pthread_mutex_t stream_lock;
404 int ord;
405 bool overlap;
406 bool map;
407 bool concur;
408 bool mkern;
409 int mode;
410 int clock_khz;
411 int num_sms;
412 int regs_per_block;
413 int regs_per_sm;
414
415 struct ptx_image_data *images; /* Images loaded on device. */
416 pthread_mutex_t image_lock; /* Lock for above list. */
417
418 struct ptx_device *next;
419 };
420
421 enum ptx_event_type
422 {
423 PTX_EVT_MEM,
424 PTX_EVT_KNL,
425 PTX_EVT_SYNC,
426 PTX_EVT_ASYNC_CLEANUP
427 };
428
429 struct ptx_event
430 {
431 CUevent *evt;
432 int type;
433 void *addr;
434 int ord;
435 int val;
436
437 struct ptx_event *next;
438 };
439
440 static pthread_mutex_t ptx_event_lock;
441 static struct ptx_event *ptx_events;
442
443 static struct ptx_device **ptx_devices;
444
445 static inline struct nvptx_thread *
446 nvptx_thread (void)
447 {
448 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
449 }
450
451 static bool
452 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
453 {
454 int i;
455 struct ptx_stream *null_stream
456 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
457
458 null_stream->stream = NULL;
459 null_stream->host_thread = pthread_self ();
460 null_stream->multithreaded = true;
461 null_stream->d = (CUdeviceptr) NULL;
462 null_stream->h = NULL;
463 if (!map_init (null_stream))
464 return false;
465
466 ptx_dev->null_stream = null_stream;
467 ptx_dev->active_streams = NULL;
468 pthread_mutex_init (&ptx_dev->stream_lock, NULL);
469
470 if (concurrency < 1)
471 concurrency = 1;
472
473 /* This is just a guess -- make space for as many async streams as the
474 current device is capable of concurrently executing. This can grow
475 later as necessary. No streams are created yet. */
476 ptx_dev->async_streams.arr
477 = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
478 ptx_dev->async_streams.size = concurrency;
479
480 for (i = 0; i < concurrency; i++)
481 ptx_dev->async_streams.arr[i] = NULL;
482
483 return true;
484 }
485
486 static bool
487 fini_streams_for_device (struct ptx_device *ptx_dev)
488 {
489 free (ptx_dev->async_streams.arr);
490
491 bool ret = true;
492 while (ptx_dev->active_streams != NULL)
493 {
494 struct ptx_stream *s = ptx_dev->active_streams;
495 ptx_dev->active_streams = ptx_dev->active_streams->next;
496
497 ret &= map_fini (s);
498
499 CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
500 if (r != CUDA_SUCCESS)
501 {
502 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
503 ret = false;
504 }
505 free (s);
506 }
507
508 ret &= map_fini (ptx_dev->null_stream);
509 free (ptx_dev->null_stream);
510 return ret;
511 }
512
513 /* Select a stream for the (OpenACC-semantics) ASYNC argument for the
514 current thread THREAD (and also the current device/context). If CREATE is
515 true, create the stream if it does not exist (or use EXISTING if it is
516 non-NULL), and associate the stream with THREAD. Return the stream to
517 use. */
518
519 static struct ptx_stream *
520 select_stream_for_async (int async, pthread_t thread, bool create,
521 CUstream existing)
522 {
523 struct nvptx_thread *nvthd = nvptx_thread ();
524 /* Local copy of TLS variable. */
525 struct ptx_device *ptx_dev = nvthd->ptx_dev;
526 struct ptx_stream *stream = NULL;
527 int orig_async = async;
528
529 /* The special value acc_async_noval (-1) maps (for now) to an
530 implicitly-created stream, which is then handled the same as any other
531 numbered async stream. Other options are available, e.g. using the null
532 stream for anonymous async operations, or choosing an idle stream from an
533 active set. But, stick with this for now. */
534 if (async > acc_async_sync)
535 async++;
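/* With acc_async_sync == -2 and acc_async_noval == -1 (see openacc.h), the
increment above maps acc_async_noval to slot 0 and user async values
0, 1, ... to slots 1, 2, ... of the async_streams array. */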
536
537 if (create)
538 pthread_mutex_lock (&ptx_dev->stream_lock);
539
540 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
541 null stream, and in fact better performance may be obtainable if it doesn't
542 (because the null stream enforces overly-strict synchronisation with
543 respect to other streams for legacy reasons, and that's probably not
544 needed with OpenACC). Maybe investigate later. */
545 if (async == acc_async_sync)
546 stream = ptx_dev->null_stream;
547 else if (async >= 0 && async < ptx_dev->async_streams.size
548 && ptx_dev->async_streams.arr[async] && !(create && existing))
549 stream = ptx_dev->async_streams.arr[async];
550 else if (async >= 0 && create)
551 {
552 if (async >= ptx_dev->async_streams.size)
553 {
554 int i, newsize = ptx_dev->async_streams.size * 2;
555
556 if (async >= newsize)
557 newsize = async + 1;
558
559 ptx_dev->async_streams.arr
560 = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
561 newsize * sizeof (struct ptx_stream *));
562
563 for (i = ptx_dev->async_streams.size; i < newsize; i++)
564 ptx_dev->async_streams.arr[i] = NULL;
565
566 ptx_dev->async_streams.size = newsize;
567 }
568
569 /* Create a new stream on-demand if there isn't one already, or if we're
570 setting a particular async value to an existing (externally-provided)
571 stream. */
572 if (!ptx_dev->async_streams.arr[async] || existing)
573 {
574 CUresult r;
575 struct ptx_stream *s
576 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
577
578 if (existing)
579 s->stream = existing;
580 else
581 {
582 r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
583 CU_STREAM_DEFAULT);
584 if (r != CUDA_SUCCESS)
585 {
586 pthread_mutex_unlock (&ptx_dev->stream_lock);
587 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
588 cuda_error (r));
589 }
590 }
591
592 /* If CREATE is true, we're going to be queueing some work on this
593 stream. Associate it with the current host thread. */
594 s->host_thread = thread;
595 s->multithreaded = false;
596
597 s->d = (CUdeviceptr) NULL;
598 s->h = NULL;
599 if (!map_init (s))
600 {
601 pthread_mutex_unlock (&ptx_dev->stream_lock);
602 GOMP_PLUGIN_fatal ("map_init fail");
603 }
604
605 s->next = ptx_dev->active_streams;
606 ptx_dev->active_streams = s;
607 ptx_dev->async_streams.arr[async] = s;
608 }
609
610 stream = ptx_dev->async_streams.arr[async];
611 }
612 else if (async < 0)
613 {
614 if (create)
615 pthread_mutex_unlock (&ptx_dev->stream_lock);
616 GOMP_PLUGIN_fatal ("bad async %d", async);
617 }
618
619 if (create)
620 {
621 assert (stream != NULL);
622
623 /* If we're trying to use the same stream from different threads
624 simultaneously, set stream->multithreaded to true. This affects the
625 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
626 only wait for asynchronous launches from the same host thread they are
627 invoked on. If multiple threads use the same async value, we make note
628 of that here and fall back to testing/waiting for all threads in those
629 functions. */
630 if (thread != stream->host_thread)
631 stream->multithreaded = true;
632
633 pthread_mutex_unlock (&ptx_dev->stream_lock);
634 }
635 else if (stream && !stream->multithreaded
636 && !pthread_equal (stream->host_thread, thread))
637 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
638
639 return stream;
640 }
641
642 /* Initialize the CUDA library and the device table. Return TRUE on success,
643 else FALSE. PTX_DEV_LOCK should be locked on entry and remains locked on exit. */
644
645 static bool
646 nvptx_init (void)
647 {
648 int ndevs;
649
650 if (instantiated_devices != 0)
651 return true;
652
653 ptx_events = NULL;
654 pthread_mutex_init (&ptx_event_lock, NULL);
655
656 if (!init_cuda_lib ())
657 return false;
658
659 CUDA_CALL (cuInit, 0);
660
661 CUDA_CALL (cuDeviceGetCount, &ndevs);
662 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
663 * ndevs);
664 return true;
665 }
666
667 /* Select the N'th PTX device for the current host thread. The device must
668 have been opened before calling this function. */
669
670 static bool
671 nvptx_attach_host_thread_to_device (int n)
672 {
673 CUdevice dev;
674 CUresult r;
675 struct ptx_device *ptx_dev;
676 CUcontext thd_ctx;
677
678 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
679 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
680 {
681 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
682 return false;
683 }
684
685 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
686 return true;
687 else
688 {
689 CUcontext old_ctx;
690
691 ptx_dev = ptx_devices[n];
692 if (!ptx_dev)
693 {
694 GOMP_PLUGIN_error ("device %d not found", n);
695 return false;
696 }
697
698 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
699
700 /* We don't necessarily have a current context (e.g. if it has been
701 destroyed). Pop it if we do, though. */
702 if (thd_ctx != NULL)
703 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
704
705 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
706 }
707 return true;
708 }
709
710 static struct ptx_device *
711 nvptx_open_device (int n)
712 {
713 struct ptx_device *ptx_dev;
714 CUdevice dev, ctx_dev;
715 CUresult r;
716 int async_engines, pi;
717
718 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
719
720 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
721
722 ptx_dev->ord = n;
723 ptx_dev->dev = dev;
724 ptx_dev->ctx_shared = false;
725
726 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
727 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
728 {
729 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
730 return NULL;
731 }
732
733 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
734 {
735 /* The current host thread has an active context for a different device.
736 Detach it. */
737 CUcontext old_ctx;
738 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
739 }
740
741 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
742
743 if (!ptx_dev->ctx)
744 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
745 else
746 ptx_dev->ctx_shared = true;
747
748 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
749 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
750 ptx_dev->overlap = pi;
751
752 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
753 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
754 ptx_dev->map = pi;
755
756 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
757 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
758 ptx_dev->concur = pi;
759
760 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
761 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
762 ptx_dev->mode = pi;
763
764 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
765 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
766 ptx_dev->mkern = pi;
767
768 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
769 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
770 ptx_dev->clock_khz = pi;
771
772 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
773 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
774 ptx_dev->num_sms = pi;
775
776 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
777 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
778 ptx_dev->regs_per_block = pi;
779
780 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
781 in CUDA 6.0 and newer. */
782 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
783 /* Fallback: use limit of registers per block, which is usually equal. */
784 if (r == CUDA_ERROR_INVALID_VALUE)
785 pi = ptx_dev->regs_per_block;
786 else if (r != CUDA_SUCCESS)
787 {
788 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
789 return NULL;
790 }
791 ptx_dev->regs_per_sm = pi;
792
793 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
794 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
795 if (pi != 32)
796 {
797 GOMP_PLUGIN_error ("Only warp size 32 is supported");
798 return NULL;
799 }
800
801 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
802 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
803 if (r != CUDA_SUCCESS)
804 async_engines = 1;
805
806 ptx_dev->images = NULL;
807 pthread_mutex_init (&ptx_dev->image_lock, NULL);
808
809 if (!init_streams_for_device (ptx_dev, async_engines))
810 return NULL;
811
812 return ptx_dev;
813 }
814
815 static bool
816 nvptx_close_device (struct ptx_device *ptx_dev)
817 {
818 if (!ptx_dev)
819 return true;
820
821 if (!fini_streams_for_device (ptx_dev))
822 return false;
823
824 pthread_mutex_destroy (&ptx_dev->image_lock);
825
826 if (!ptx_dev->ctx_shared)
827 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
828
829 free (ptx_dev);
830 return true;
831 }
832
833 static int
834 nvptx_get_num_devices (void)
835 {
836 int n;
837
838 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
839 configurations. */
840 if (sizeof (void *) != 8)
841 return 0;
842
843 /* This function will be called before the plugin has been initialized in
844 order to enumerate available devices, but CUDA API routines can't be used
845 until cuInit has been called. Just call it now (but don't yet do any
846 further initialization). */
847 if (instantiated_devices == 0)
848 {
849 if (!init_cuda_lib ())
850 return 0;
851 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
852 /* This is not an error: e.g. we may have CUDA libraries installed but
853 no devices available. */
854 if (r != CUDA_SUCCESS)
855 return 0;
856 }
857
858 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
859 return n;
860 }
861
862
863 static bool
864 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
865 unsigned num_objs)
866 {
867 CUjit_option opts[6];
868 void *optvals[6];
869 float elapsed = 0.0;
870 char elog[1024];
871 char ilog[16384];
872 CUlinkState linkstate;
873 CUresult r;
874 void *linkout;
875 size_t linkoutsize __attribute__ ((unused));
876
877 opts[0] = CU_JIT_WALL_TIME;
878 optvals[0] = &elapsed;
879
880 opts[1] = CU_JIT_INFO_LOG_BUFFER;
881 optvals[1] = &ilog[0];
882
883 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
884 optvals[2] = (void *) sizeof ilog;
885
886 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
887 optvals[3] = &elog[0];
888
889 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
890 optvals[4] = (void *) sizeof elog;
891
892 opts[5] = CU_JIT_LOG_VERBOSE;
893 optvals[5] = (void *) 1;
894
895 CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);
896
897 for (; num_objs--; ptx_objs++)
898 {
899 /* cuLinkAddData's 'data' argument erroneously omits the const
900 qualifier. */
901 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
902 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
903 (char *) ptx_objs->code, ptx_objs->size,
904 0, 0, 0, 0);
905 if (r != CUDA_SUCCESS)
906 {
907 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
908 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
909 cuda_error (r));
910 return false;
911 }
912 }
913
914 GOMP_PLUGIN_debug (0, "Linking\n");
915 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
916
917 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
918 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
919
920 if (r != CUDA_SUCCESS)
921 {
922 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
923 return false;
924 }
925
926 CUDA_CALL (cuModuleLoadData, module, linkout);
927 CUDA_CALL (cuLinkDestroy, linkstate);
928 return true;
929 }
930
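/* Garbage-collect completed events: scan the global ptx_events list and, for
every event belonging to the current device that CUDA reports as complete,
destroy the event and release its bookkeeping. Kernel events additionally pop
their argument mapping; async-cleanup events unmap their variables after the
event lock has been released, and are deferred entirely when MEMMAP_LOCKABLE
is false. */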
931 static void
932 event_gc (bool memmap_lockable)
933 {
934 struct ptx_event *ptx_event = ptx_events;
935 struct ptx_event *async_cleanups = NULL;
936 struct nvptx_thread *nvthd = nvptx_thread ();
937
938 pthread_mutex_lock (&ptx_event_lock);
939
940 while (ptx_event != NULL)
941 {
942 CUresult r;
943 struct ptx_event *e = ptx_event;
944
945 ptx_event = ptx_event->next;
946
947 if (e->ord != nvthd->ptx_dev->ord)
948 continue;
949
950 r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
951 if (r == CUDA_SUCCESS)
952 {
953 bool append_async = false;
954 CUevent *te;
955
956 te = e->evt;
957
958 switch (e->type)
959 {
960 case PTX_EVT_MEM:
961 case PTX_EVT_SYNC:
962 break;
963
964 case PTX_EVT_KNL:
965 map_pop (e->addr);
966 break;
967
968 case PTX_EVT_ASYNC_CLEANUP:
969 {
970 /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
971 memory-map splay tree lock for the current device, so we
972 can't call it when one of our callers has already claimed
973 the lock. In that case, just delay the GC for this event
974 until later. */
975 if (!memmap_lockable)
976 continue;
977
978 append_async = true;
979 }
980 break;
981 }
982
983 CUDA_CALL_NOCHECK (cuEventDestroy, *te);
984 free ((void *)te);
985
986 /* Unlink 'e' from ptx_events list. */
987 if (ptx_events == e)
988 ptx_events = ptx_events->next;
989 else
990 {
991 struct ptx_event *e_ = ptx_events;
992 while (e_->next != e)
993 e_ = e_->next;
994 e_->next = e_->next->next;
995 }
996
997 if (append_async)
998 {
999 e->next = async_cleanups;
1000 async_cleanups = e;
1001 }
1002 else
1003 free (e);
1004 }
1005 }
1006
1007 pthread_mutex_unlock (&ptx_event_lock);
1008
1009 /* We have to do these here, after ptx_event_lock is released. */
1010 while (async_cleanups)
1011 {
1012 struct ptx_event *e = async_cleanups;
1013 async_cleanups = async_cleanups->next;
1014
1015 GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
1016 free (e);
1017 }
1018 }
1019
1020 static void
1021 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
1022 {
1023 struct ptx_event *ptx_event;
1024 struct nvptx_thread *nvthd = nvptx_thread ();
1025
1026 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
1027 || type == PTX_EVT_ASYNC_CLEANUP);
1028
1029 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
1030 ptx_event->type = type;
1031 ptx_event->evt = e;
1032 ptx_event->addr = h;
1033 ptx_event->ord = nvthd->ptx_dev->ord;
1034 ptx_event->val = val;
1035
1036 pthread_mutex_lock (&ptx_event_lock);
1037
1038 ptx_event->next = ptx_events;
1039 ptx_events = ptx_event;
1040
1041 pthread_mutex_unlock (&ptx_event_lock);
1042 }
1043
1044 void
1045 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
1046 int async, unsigned *dims, void *targ_mem_desc)
1047 {
1048 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
1049 CUfunction function;
1050 CUresult r;
1051 int i;
1052 struct ptx_stream *dev_str;
1053 void *kargs[1];
1054 void *hp, *dp;
1055 struct nvptx_thread *nvthd = nvptx_thread ();
1056 const char *maybe_abort_msg = "(perhaps abort was called)";
1057
1058 function = targ_fn->fn;
1059
1060 dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
1061 assert (dev_str == nvthd->current_stream);
1062
1063 /* Initialize the launch dimensions. Typically this is constant,
1064 provided by the device compiler, but we must permit runtime
1065 values. */
1066 int seen_zero = 0;
1067 for (i = 0; i != GOMP_DIM_MAX; i++)
1068 {
1069 if (targ_fn->launch->dim[i])
1070 dims[i] = targ_fn->launch->dim[i];
1071 if (!dims[i])
1072 seen_zero = 1;
1073 }
1074
1075 if (seen_zero)
1076 {
1077 /* See if the user provided GOMP_OPENACC_DIM environment
1078 variable to specify runtime defaults. */
1079 static int default_dims[GOMP_DIM_MAX];
1080
1081 pthread_mutex_lock (&ptx_dev_lock);
1082 if (!default_dims[0])
1083 {
1084 /* We only read the environment variable once. You can't
1085 change it in the middle of execution. The syntax is
1086 the same as for the -fopenacc-dim compilation option. */
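/* Illustrative examples (the numbers are arbitrary): GOMP_OPENACC_DIM=5120:32:32
sets the gang, worker and vector defaults; a position may be left empty, as in
GOMP_OPENACC_DIM=::32, to keep the computed default for that dimension. */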
1087 const char *env_var = getenv ("GOMP_OPENACC_DIM");
1088 if (env_var)
1089 {
1090 const char *pos = env_var;
1091
1092 for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
1093 {
1094 if (i && *pos++ != ':')
1095 break;
1096 if (*pos != ':')
1097 {
1098 const char *eptr;
1099
1100 errno = 0;
1101 long val = strtol (pos, (char **)&eptr, 10);
1102 if (errno || val < 0 || (unsigned)val != val)
1103 break;
1104 default_dims[i] = (int)val;
1105 pos = eptr;
1106 }
1107 }
1108 }
1109
1110 int warp_size, block_size, dev_size, cpu_size;
1111 CUdevice dev = nvptx_thread()->ptx_dev->dev;
1112 /* 32 is the default for known hardware. */
1113 int gang = 0, worker = 32, vector = 32;
1114 CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
1115
1116 cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
1117 cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
1118 cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
1119 cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
1120
1121 if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
1122 dev) == CUDA_SUCCESS
1123 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
1124 dev) == CUDA_SUCCESS
1125 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
1126 dev) == CUDA_SUCCESS
1127 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
1128 dev) == CUDA_SUCCESS)
1129 {
1130 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1131 " dev_size=%d, cpu_size=%d\n",
1132 warp_size, block_size, dev_size, cpu_size);
1133 gang = (cpu_size / block_size) * dev_size;
1134 worker = block_size / warp_size;
1135 vector = warp_size;
1136 }
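/* Worked example with hypothetical attribute values: cpu_size (max threads
per multiprocessor) = 2048, block_size (max threads per block) = 1024,
dev_size (multiprocessor count) = 16 and warp_size = 32 give
gang = (2048 / 1024) * 16 = 32, worker = 1024 / 32 = 32 and vector = 32. */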
1137
1138 /* There is no upper bound on the gang size. The best size
1139 matches the hardware configuration. Logical gangs are
1140 scheduled onto physical hardware. To maximize usage, we
1141 should guess a large number. */
1142 if (default_dims[GOMP_DIM_GANG] < 1)
1143 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
1144 /* The worker size must not exceed the hardware. */
1145 if (default_dims[GOMP_DIM_WORKER] < 1
1146 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
1147 default_dims[GOMP_DIM_WORKER] = worker;
1148 /* The vector size must exactly match the hardware. */
1149 if (default_dims[GOMP_DIM_VECTOR] < 1
1150 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
1151 default_dims[GOMP_DIM_VECTOR] = vector;
1152
1153 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1154 default_dims[GOMP_DIM_GANG],
1155 default_dims[GOMP_DIM_WORKER],
1156 default_dims[GOMP_DIM_VECTOR]);
1157 }
1158 pthread_mutex_unlock (&ptx_dev_lock);
1159
1160 for (i = 0; i != GOMP_DIM_MAX; i++)
1161 if (!dims[i])
1162 dims[i] = default_dims[i];
1163 }
1164
1165 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1166 the host and the device. HP is a host pointer to the new chunk, and DP is
1167 the corresponding device pointer. */
1168 map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
1169
1170 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1171
1172 /* Copy the array of arguments to the mapped page. */
1173 for (i = 0; i < mapnum; i++)
1174 ((void **) hp)[i] = devaddrs[i];
1175
1176 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1177 fact have the same value on a unified-memory system). */
1178 CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
1179 mapnum * sizeof (void *));
1180 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1181 " gangs=%u, workers=%u, vectors=%u\n",
1182 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1183 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1184
1185 // OpenACC CUDA
1186 //
1187 // num_gangs nctaid.x
1188 // num_workers ntid.y
1189 // vector length ntid.x
1190
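/* The kernel takes a single parameter: the device address DP of the array of
MAPNUM argument pointers copied above. */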
1191 kargs[0] = &dp;
1192 CUDA_CALL_ASSERT (cuLaunchKernel, function,
1193 dims[GOMP_DIM_GANG], 1, 1,
1194 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1195 0, dev_str->stream, kargs, 0);
1196
1197 #ifndef DISABLE_ASYNC
1198 if (async < acc_async_noval)
1199 {
1200 r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
1201 if (r == CUDA_ERROR_LAUNCH_FAILED)
1202 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1203 maybe_abort_msg);
1204 else if (r != CUDA_SUCCESS)
1205 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1206 }
1207 else
1208 {
1209 CUevent *e;
1210
1211 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1212
1213 r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1214 if (r == CUDA_ERROR_LAUNCH_FAILED)
1215 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1216 maybe_abort_msg);
1217 else if (r != CUDA_SUCCESS)
1218 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1219
1220 event_gc (true);
1221
1222 CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
1223
1224 event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
1225 }
1226 #else
1227 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1228 if (r == CUDA_ERROR_LAUNCH_FAILED)
1229 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1230 maybe_abort_msg);
1231 else if (r != CUDA_SUCCESS)
1232 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1233 #endif
1234
1235 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1236 targ_fn->launch->fn);
1237
1238 #ifndef DISABLE_ASYNC
1239 if (async < acc_async_noval)
1240 #endif
1241 map_pop (dev_str);
1242 }
1243
1244 void * openacc_get_current_cuda_context (void);
1245
1246 static void *
1247 nvptx_alloc (size_t s)
1248 {
1249 CUdeviceptr d;
1250
1251 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
1252 return (void *) d;
1253 }
1254
1255 static bool
1256 nvptx_free (void *p)
1257 {
1258 CUdeviceptr pb;
1259 size_t ps;
1260
1261 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
1262 if ((CUdeviceptr) p != pb)
1263 {
1264 GOMP_PLUGIN_error ("invalid device address");
1265 return false;
1266 }
1267
1268 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1269 return true;
1270 }
1271
1272
1273 static bool
1274 nvptx_host2dev (void *d, const void *h, size_t s)
1275 {
1276 CUdeviceptr pb;
1277 size_t ps;
1278 struct nvptx_thread *nvthd = nvptx_thread ();
1279
1280 if (!s)
1281 return true;
1282 if (!d)
1283 {
1284 GOMP_PLUGIN_error ("invalid device address");
1285 return false;
1286 }
1287
1288 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1289
1290 if (!pb)
1291 {
1292 GOMP_PLUGIN_error ("invalid device address");
1293 return false;
1294 }
1295 if (!h)
1296 {
1297 GOMP_PLUGIN_error ("invalid host address");
1298 return false;
1299 }
1300 if (d == h)
1301 {
1302 GOMP_PLUGIN_error ("invalid host or device address");
1303 return false;
1304 }
1305 if ((void *)(d + s) > (void *)(pb + ps))
1306 {
1307 GOMP_PLUGIN_error ("invalid size");
1308 return false;
1309 }
1310
1311 #ifndef DISABLE_ASYNC
1312 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1313 {
1314 CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1315 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1316 event_gc (false);
1317 CUDA_CALL (cuMemcpyHtoDAsync,
1318 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
1319 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1320 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1321 }
1322 else
1323 #endif
1324 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
1325
1326 return true;
1327 }
1328
1329 static bool
1330 nvptx_dev2host (void *h, const void *d, size_t s)
1331 {
1332 CUdeviceptr pb;
1333 size_t ps;
1334 struct nvptx_thread *nvthd = nvptx_thread ();
1335
1336 if (!s)
1337 return true;
1338 if (!d)
1339 {
1340 GOMP_PLUGIN_error ("invalid device address");
1341 return false;
1342 }
1343
1344 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1345
1346 if (!pb)
1347 {
1348 GOMP_PLUGIN_error ("invalid device address");
1349 return false;
1350 }
1351 if (!h)
1352 {
1353 GOMP_PLUGIN_error ("invalid host address");
1354 return false;
1355 }
1356 if (d == h)
1357 {
1358 GOMP_PLUGIN_error ("invalid host or device address");
1359 return false;
1360 }
1361 if ((void *)(d + s) > (void *)(pb + ps))
1362 {
1363 GOMP_PLUGIN_error ("invalid size");
1364 return false;
1365 }
1366
1367 #ifndef DISABLE_ASYNC
1368 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1369 {
1370 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1371 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1372 event_gc (false);
1373 CUDA_CALL (cuMemcpyDtoHAsync,
1374 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
1375 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1376 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1377 }
1378 else
1379 #endif
1380 CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
1381
1382 return true;
1383 }
1384
1385 static void
1386 nvptx_set_async (int async)
1387 {
1388 struct nvptx_thread *nvthd = nvptx_thread ();
1389 nvthd->current_stream
1390 = select_stream_for_async (async, pthread_self (), true, NULL);
1391 }
1392
1393 static int
1394 nvptx_async_test (int async)
1395 {
1396 CUresult r;
1397 struct ptx_stream *s;
1398
1399 s = select_stream_for_async (async, pthread_self (), false, NULL);
1400
1401 if (!s)
1402 GOMP_PLUGIN_fatal ("unknown async %d", async);
1403
1404 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1405 if (r == CUDA_SUCCESS)
1406 {
1407 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1408 whether all work has completed on this stream, and if so omits the call
1409 to the wait hook. If that happens, event_gc might not get called
1410 (which prevents variables from getting unmapped and their associated
1411 device storage freed), so call it here. */
1412 event_gc (true);
1413 return 1;
1414 }
1415 else if (r == CUDA_ERROR_NOT_READY)
1416 return 0;
1417
1418 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1419
1420 return 0;
1421 }
1422
1423 static int
1424 nvptx_async_test_all (void)
1425 {
1426 struct ptx_stream *s;
1427 pthread_t self = pthread_self ();
1428 struct nvptx_thread *nvthd = nvptx_thread ();
1429
1430 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1431
1432 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1433 {
1434 if ((s->multithreaded || pthread_equal (s->host_thread, self))
1435 && CUDA_CALL_NOCHECK (cuStreamQuery,
1436 s->stream) == CUDA_ERROR_NOT_READY)
1437 {
1438 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1439 return 0;
1440 }
1441 }
1442
1443 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1444
1445 event_gc (true);
1446
1447 return 1;
1448 }
1449
1450 static void
1451 nvptx_wait (int async)
1452 {
1453 struct ptx_stream *s;
1454
1455 s = select_stream_for_async (async, pthread_self (), false, NULL);
1456 if (!s)
1457 GOMP_PLUGIN_fatal ("unknown async %d", async);
1458
1459 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1460
1461 event_gc (true);
1462 }
1463
1464 static void
1465 nvptx_wait_async (int async1, int async2)
1466 {
1467 CUevent *e;
1468 struct ptx_stream *s1, *s2;
1469 pthread_t self = pthread_self ();
1470
1471 /* The stream that is waiting (rather than being waited for) doesn't
1472 necessarily have to exist already. */
1473 s2 = select_stream_for_async (async2, self, true, NULL);
1474
1475 s1 = select_stream_for_async (async1, self, false, NULL);
1476 if (!s1)
1477 GOMP_PLUGIN_fatal ("invalid async 1\n");
1478
1479 if (s1 == s2)
1480 GOMP_PLUGIN_fatal ("identical parameters");
1481
1482 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1483
1484 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1485
1486 event_gc (true);
1487
1488 CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
1489
1490 event_add (PTX_EVT_SYNC, e, NULL, 0);
1491
1492 CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
1493 }
1494
1495 static void
1496 nvptx_wait_all (void)
1497 {
1498 CUresult r;
1499 struct ptx_stream *s;
1500 pthread_t self = pthread_self ();
1501 struct nvptx_thread *nvthd = nvptx_thread ();
1502
1503 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1504
1505 /* Wait for active streams initiated by this thread (or by multiple threads)
1506 to complete. */
1507 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1508 {
1509 if (s->multithreaded || pthread_equal (s->host_thread, self))
1510 {
1511 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1512 if (r == CUDA_SUCCESS)
1513 continue;
1514 else if (r != CUDA_ERROR_NOT_READY)
1515 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1516
1517 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1518 }
1519 }
1520
1521 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1522
1523 event_gc (true);
1524 }
1525
1526 static void
1527 nvptx_wait_all_async (int async)
1528 {
1529 struct ptx_stream *waiting_stream, *other_stream;
1530 CUevent *e;
1531 struct nvptx_thread *nvthd = nvptx_thread ();
1532 pthread_t self = pthread_self ();
1533
1534 /* The stream doing the waiting. This could be the first mention of the
1535 stream, so create it if necessary. */
1536 waiting_stream
1537 = select_stream_for_async (async, pthread_self (), true, NULL);
1538
1539 /* Launches on the null stream already block on other streams in the
1540 context. */
1541 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1542 return;
1543
1544 event_gc (true);
1545
1546 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1547
1548 for (other_stream = nvthd->ptx_dev->active_streams;
1549 other_stream != NULL;
1550 other_stream = other_stream->next)
1551 {
1552 if (!other_stream->multithreaded
1553 && !pthread_equal (other_stream->host_thread, self))
1554 continue;
1555
1556 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1557
1558 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1559
1560 /* Record an event on the waited-for stream. */
1561 CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
1562
1563 event_add (PTX_EVT_SYNC, e, NULL, 0);
1564
1565 CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
1566 }
1567
1568 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1569 }
1570
1571 static void *
1572 nvptx_get_current_cuda_device (void)
1573 {
1574 struct nvptx_thread *nvthd = nvptx_thread ();
1575
1576 if (!nvthd || !nvthd->ptx_dev)
1577 return NULL;
1578
1579 return &nvthd->ptx_dev->dev;
1580 }
1581
1582 static void *
1583 nvptx_get_current_cuda_context (void)
1584 {
1585 struct nvptx_thread *nvthd = nvptx_thread ();
1586
1587 if (!nvthd || !nvthd->ptx_dev)
1588 return NULL;
1589
1590 return nvthd->ptx_dev->ctx;
1591 }
1592
1593 static void *
1594 nvptx_get_cuda_stream (int async)
1595 {
1596 struct ptx_stream *s;
1597 struct nvptx_thread *nvthd = nvptx_thread ();
1598
1599 if (!nvthd || !nvthd->ptx_dev)
1600 return NULL;
1601
1602 s = select_stream_for_async (async, pthread_self (), false, NULL);
1603
1604 return s ? s->stream : NULL;
1605 }
1606
1607 static int
1608 nvptx_set_cuda_stream (int async, void *stream)
1609 {
1610 struct ptx_stream *oldstream;
1611 pthread_t self = pthread_self ();
1612 struct nvptx_thread *nvthd = nvptx_thread ();
1613
1614 if (async < 0)
1615 GOMP_PLUGIN_fatal ("bad async %d", async);
1616
1617 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1618
1619 /* We have a list of active streams and an array mapping async values to
1620 entries of that list. We need to take "ownership" of the passed-in stream,
1621 and add it to our list, removing the previous entry also (if there was one)
1622 in order to prevent resource leaks. Note the potential for surprise
1623 here: maybe we should keep track of passed-in streams and leave it up to
1624 the user to tidy those up, but that doesn't work for stream handles
1625 returned from acc_get_cuda_stream above... */
1626
1627 oldstream = select_stream_for_async (async, self, false, NULL);
1628
1629 if (oldstream)
1630 {
1631 if (nvthd->ptx_dev->active_streams == oldstream)
1632 nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1633 else
1634 {
1635 struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1636 while (s->next != oldstream)
1637 s = s->next;
1638 s->next = s->next->next;
1639 }
1640
1641 CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
1642
1643 if (!map_fini (oldstream))
1644 GOMP_PLUGIN_fatal ("error when freeing host memory");
1645
1646 free (oldstream);
1647 }
1648
1649 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1650
1651 (void) select_stream_for_async (async, self, true, (CUstream) stream);
1652
1653 return 1;
1654 }
1655
1656 /* Plugin entry points. */
1657
1658 const char *
1659 GOMP_OFFLOAD_get_name (void)
1660 {
1661 return "nvptx";
1662 }
1663
1664 unsigned int
1665 GOMP_OFFLOAD_get_caps (void)
1666 {
1667 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1668 }
1669
1670 int
1671 GOMP_OFFLOAD_get_type (void)
1672 {
1673 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1674 }
1675
1676 int
1677 GOMP_OFFLOAD_get_num_devices (void)
1678 {
1679 return nvptx_get_num_devices ();
1680 }
1681
1682 bool
1683 GOMP_OFFLOAD_init_device (int n)
1684 {
1685 struct ptx_device *dev;
1686
1687 pthread_mutex_lock (&ptx_dev_lock);
1688
1689 if (!nvptx_init () || ptx_devices[n] != NULL)
1690 {
1691 pthread_mutex_unlock (&ptx_dev_lock);
1692 return false;
1693 }
1694
1695 dev = nvptx_open_device (n);
1696 if (dev)
1697 {
1698 ptx_devices[n] = dev;
1699 instantiated_devices++;
1700 }
1701
1702 pthread_mutex_unlock (&ptx_dev_lock);
1703
1704 return dev != NULL;
1705 }
1706
1707 bool
1708 GOMP_OFFLOAD_fini_device (int n)
1709 {
1710 pthread_mutex_lock (&ptx_dev_lock);
1711
1712 if (ptx_devices[n] != NULL)
1713 {
1714 if (!nvptx_attach_host_thread_to_device (n)
1715 || !nvptx_close_device (ptx_devices[n]))
1716 {
1717 pthread_mutex_unlock (&ptx_dev_lock);
1718 return false;
1719 }
1720 ptx_devices[n] = NULL;
1721 instantiated_devices--;
1722 }
1723
1724 pthread_mutex_unlock (&ptx_dev_lock);
1725 return true;
1726 }
1727
1728 /* Return the libgomp version number we're compatible with. There is
1729 no requirement for cross-version compatibility. */
1730
1731 unsigned
1732 GOMP_OFFLOAD_version (void)
1733 {
1734 return GOMP_VERSION;
1735 }
1736
1737 /* Initialize __nvptx_clocktick, if present in MODULE. */
1738
1739 static void
1740 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1741 {
1742 CUdeviceptr dptr;
1743 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1744 module, "__nvptx_clocktick");
1745 if (r == CUDA_ERROR_NOT_FOUND)
1746 return;
1747 if (r != CUDA_SUCCESS)
1748 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1749 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1750 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1751 sizeof (__nvptx_clocktick));
1752 if (r != CUDA_SUCCESS)
1753 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1754 }
1755
1756 /* Load the (partial) program described by TARGET_DATA to device
1757 number ORD. Allocate and return TARGET_TABLE. */
1758
1759 int
1760 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1761 struct addr_pair **target_table)
1762 {
1763 CUmodule module;
1764 const char *const *var_names;
1765 const struct targ_fn_launch *fn_descs;
1766 unsigned int fn_entries, var_entries, i, j;
1767 struct targ_fn_descriptor *targ_fns;
1768 struct addr_pair *targ_tbl;
1769 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1770 struct ptx_image_data *new_image;
1771 struct ptx_device *dev;
1772
1773 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1774 {
1775 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1776 " (expected %u, received %u)",
1777 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1778 return -1;
1779 }
1780
1781 if (!nvptx_attach_host_thread_to_device (ord)
1782 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1783 return -1;
1784
1785 dev = ptx_devices[ord];
1786
1787 /* The mkoffload utility emits a struct of pointers/integers at the
1788 start of each offload image. The array of kernel names and the
1789 function addresses form a one-to-one correspondence. */
1790
1791 var_entries = img_header->var_num;
1792 var_names = img_header->var_names;
1793 fn_entries = img_header->fn_num;
1794 fn_descs = img_header->fn_descs;
1795
1796 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1797 * (fn_entries + var_entries));
1798 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1799 * fn_entries);
1800
1801 *target_table = targ_tbl;
1802
1803 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1804 new_image->target_data = target_data;
1805 new_image->module = module;
1806 new_image->fns = targ_fns;
1807
1808 pthread_mutex_lock (&dev->image_lock);
1809 new_image->next = dev->images;
1810 dev->images = new_image;
1811 pthread_mutex_unlock (&dev->image_lock);
1812
1813 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1814 {
1815 CUfunction function;
1816 int nregs, mthrs;
1817
1818 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1819 fn_descs[i].fn);
1820 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1821 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1822 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1823 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1824
1825 targ_fns->fn = function;
1826 targ_fns->launch = &fn_descs[i];
1827 targ_fns->regs_per_thread = nregs;
1828 targ_fns->max_threads_per_block = mthrs;
1829
1830 targ_tbl->start = (uintptr_t) targ_fns;
1831 targ_tbl->end = targ_tbl->start + 1;
1832 }
1833
1834 for (j = 0; j < var_entries; j++, targ_tbl++)
1835 {
1836 CUdeviceptr var;
1837 size_t bytes;
1838
1839 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1840 &var, &bytes, module, var_names[j]);
1841
1842 targ_tbl->start = (uintptr_t) var;
1843 targ_tbl->end = targ_tbl->start + bytes;
1844 }
1845
1846 nvptx_set_clocktick (module, dev);
1847
1848 return fn_entries + var_entries;
1849 }
1850
1851 /* Unload the program described by TARGET_DATA, freeing the function
1852 descriptors allocated by GOMP_OFFLOAD_load_image. */
1853
1854 bool
1855 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1856 {
1857 struct ptx_image_data *image, **prev_p;
1858 struct ptx_device *dev = ptx_devices[ord];
1859
1860 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1861 {
1862 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1863 " (expected %u, received %u)",
1864 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1865 return false;
1866 }
1867
1868 bool ret = true;
1869 pthread_mutex_lock (&dev->image_lock);
1870 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1871 if (image->target_data == target_data)
1872 {
1873 *prev_p = image->next;
1874 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1875 ret = false;
1876 free (image->fns);
1877 free (image);
1878 break;
1879 }
1880 pthread_mutex_unlock (&dev->image_lock);
1881 return ret;
1882 }
1883
1884 void *
1885 GOMP_OFFLOAD_alloc (int ord, size_t size)
1886 {
1887 if (!nvptx_attach_host_thread_to_device (ord))
1888 return NULL;
1889 return nvptx_alloc (size);
1890 }
1891
1892 bool
1893 GOMP_OFFLOAD_free (int ord, void *ptr)
1894 {
1895 return (nvptx_attach_host_thread_to_device (ord)
1896 && nvptx_free (ptr));
1897 }
1898
1899 bool
1900 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1901 {
1902 return (nvptx_attach_host_thread_to_device (ord)
1903 && nvptx_dev2host (dst, src, n));
1904 }
1905
1906 bool
1907 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1908 {
1909 return (nvptx_attach_host_thread_to_device (ord)
1910 && nvptx_host2dev (dst, src, n));
1911 }
1912
1913 bool
1914 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1915 {
1916 struct ptx_device *ptx_dev = ptx_devices[ord];
1917 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
1918 ptx_dev->null_stream->stream);
1919 return true;
1920 }
1921
1922 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
1923
1924 void
1925 GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
1926 void **hostaddrs, void **devaddrs,
1927 int async, unsigned *dims, void *targ_mem_desc)
1928 {
1929 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
1930 }
1931
1932 void
1933 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
1934 {
1935 struct nvptx_thread *nvthd = nvptx_thread ();
1936 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1937
1938 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1939 CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
1940 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
1941 }
1942
1943 int
1944 GOMP_OFFLOAD_openacc_async_test (int async)
1945 {
1946 return nvptx_async_test (async);
1947 }
1948
1949 int
1950 GOMP_OFFLOAD_openacc_async_test_all (void)
1951 {
1952 return nvptx_async_test_all ();
1953 }
1954
1955 void
1956 GOMP_OFFLOAD_openacc_async_wait (int async)
1957 {
1958 nvptx_wait (async);
1959 }
1960
1961 void
1962 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
1963 {
1964 nvptx_wait_async (async1, async2);
1965 }
1966
1967 void
1968 GOMP_OFFLOAD_openacc_async_wait_all (void)
1969 {
1970 nvptx_wait_all ();
1971 }
1972
1973 void
1974 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
1975 {
1976 nvptx_wait_all_async (async);
1977 }
1978
1979 void
1980 GOMP_OFFLOAD_openacc_async_set_async (int async)
1981 {
1982 nvptx_set_async (async);
1983 }
1984
1985 void *
1986 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1987 {
1988 struct ptx_device *ptx_dev;
1989 struct nvptx_thread *nvthd
1990 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1991 CUcontext thd_ctx;
1992
1993 ptx_dev = ptx_devices[ord];
1994
1995 assert (ptx_dev);
1996
1997 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1998
1999 assert (ptx_dev->ctx);
2000
2001 if (!thd_ctx)
2002 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
2003
2004 nvthd->current_stream = ptx_dev->null_stream;
2005 nvthd->ptx_dev = ptx_dev;
2006
2007 return (void *) nvthd;
2008 }
2009
2010 void
2011 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
2012 {
2013 free (data);
2014 }
2015
2016 void *
2017 GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
2018 {
2019 return nvptx_get_current_cuda_device ();
2020 }
2021
2022 void *
2023 GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
2024 {
2025 return nvptx_get_current_cuda_context ();
2026 }
2027
2028 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
2029
2030 void *
2031 GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
2032 {
2033 return nvptx_get_cuda_stream (async);
2034 }
2035
2036 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
2037
2038 int
2039 GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
2040 {
2041 return nvptx_set_cuda_stream (async, stream);
2042 }
2043
2044 /* Adjust launch dimensions: pick good values for number of blocks and warps
2045 and ensure that number of warps does not exceed CUDA limits as well as GCC's
2046 own limits. */
2047
2048 static void
2049 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2050 struct ptx_device *ptx_dev,
2051 int *teams_p, int *threads_p)
2052 {
2053 int max_warps_block = fn->max_threads_per_block / 32;
2054 /* A maximum of 32 warps per block is an implementation limit in the NVPTX
2055 backend and libgcc, which matches the documented limit of all GPUs as of 2015. */
2056 if (max_warps_block > 32)
2057 max_warps_block = 32;
2058 if (*threads_p <= 0)
2059 *threads_p = 8;
2060 if (*threads_p > max_warps_block)
2061 *threads_p = max_warps_block;
2062
2063 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
2064 /* This is an estimate of how many blocks the device can host simultaneously.
2065 The actual limit, which may be lower, can be queried via the "occupancy
2066 control" driver interface (available since CUDA 6.0). */
2067 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
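/* Illustrative example with hypothetical values: regs_per_thread = 32 and
*threads_p = 8 warps give regs_per_block = 32 * 32 * 8 = 8192; with
regs_per_sm = 65536 and num_sms = 16 the estimate is
max_blocks = (65536 / 8192) * 16 = 128. */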
2068 if (*teams_p <= 0 || *teams_p > max_blocks)
2069 *teams_p = max_blocks;
2070 }
2071
2072 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2073 target regions. */
2074
2075 static size_t
2076 nvptx_stacks_size ()
2077 {
2078 return 128 * 1024;
2079 }
2080
2081 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
2082
2083 static void *
2084 nvptx_stacks_alloc (size_t size, int num)
2085 {
2086 CUdeviceptr stacks;
2087 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
2088 if (r != CUDA_SUCCESS)
2089 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2090 return (void *) stacks;
2091 }
2092
2093 /* Release storage previously allocated by nvptx_stacks_alloc. */
2094
2095 static void
2096 nvptx_stacks_free (void *p, int num)
2097 {
2098 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
2099 if (r != CUDA_SUCCESS)
2100 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2101 }
2102
2103 void
2104 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2105 {
2106 CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
2107 CUresult r;
2108 struct ptx_device *ptx_dev = ptx_devices[ord];
2109 const char *maybe_abort_msg = "(perhaps abort was called)";
2110 int teams = 0, threads = 0;
2111
2112 if (!args)
2113 GOMP_PLUGIN_fatal ("No target arguments provided");
2114 while (*args)
2115 {
2116 intptr_t id = (intptr_t) *args++, val;
2117 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2118 val = (intptr_t) *args++;
2119 else
2120 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2121 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2122 continue;
2123 val = val > INT_MAX ? INT_MAX : val;
2124 id &= GOMP_TARGET_ARG_ID_MASK;
2125 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2126 teams = val;
2127 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2128 threads = val;
2129 }
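/* The loop above decodes the GOMP_TARGET_ARG_* key/value pairs passed by
libgomp, keeping only the device-agnostic (GOMP_TARGET_ARG_DEVICE_ALL)
num-teams and thread-limit requests, clamped to INT_MAX. */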
2130 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2131
2132 size_t stack_size = nvptx_stacks_size ();
2133 void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
2134 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2135 size_t fn_args_size = sizeof fn_args;
2136 void *config[] = {
2137 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2138 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2139 CU_LAUNCH_PARAM_END
2140 };
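/* Unlike the OpenACC launch path above, arguments are passed through the
'extra' parameter buffer (CU_LAUNCH_PARAM_BUFFER_POINTER / _SIZE) rather than
through kernelParams: the buffer carries the target vars pointer, the
soft-stack storage and the per-warp stack size. */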
2141 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2142 32, threads, 1, 0, ptx_dev->null_stream->stream,
2143 NULL, config);
2144 if (r != CUDA_SUCCESS)
2145 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2146
2147 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2148 if (r == CUDA_ERROR_LAUNCH_FAILED)
2149 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2150 maybe_abort_msg);
2151 else if (r != CUDA_SUCCESS)
2152 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2153 nvptx_stacks_free (stacks, teams * threads);
2154 }
2155
2156 void
2157 GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
2158 void *async_data)
2159 {
2160 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
2161 }