libgomp/plugin/plugin-nvptx.c
1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2015 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another.  */
33
34 #include "openacc.h"
35 #include "config.h"
36 #include "libgomp-plugin.h"
37 #include "oacc-ptx.h"
38 #include "oacc-plugin.h"
39
40 #include <pthread.h>
41 #include <cuda.h>
42 #include <stdbool.h>
43 #include <stdint.h>
44 #include <string.h>
45 #include <stdio.h>
46 #include <dlfcn.h>
47 #include <unistd.h>
48 #include <assert.h>
49
50 #define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
51
52 static struct
53 {
54 CUresult r;
55 char *m;
56 } cuda_errlist[] =
57 {
58 { CUDA_ERROR_INVALID_VALUE, "invalid value" },
59 { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
60 { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
61 { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
62 { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
63 { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
64 { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
65 { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
66 { CUDA_ERROR_NO_DEVICE, "no device" },
67 { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
68 { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
69 { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
70 { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
71 { CUDA_ERROR_MAP_FAILED, "map error" },
72 { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
73 { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
74 { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
75 { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
76 { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
77 { CUDA_ERROR_NOT_MAPPED, "not mapped" },
78 { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
79 { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
80 { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
81 { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
82 { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
83 { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
84 { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
85 { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
86 { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
87 "shared object symbol not found" },
88 { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
89 { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
90 { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
91 { CUDA_ERROR_NOT_FOUND, "not found" },
92 { CUDA_ERROR_NOT_READY, "not ready" },
93 { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
94 { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
95 { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
96 { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
97 "launch incompatible texturing" },
98 { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
99 { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
100 { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
101 { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
102 { CUDA_ERROR_ASSERT, "assert" },
103 { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
104 { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
105 "host memory already registered" },
106 { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
107 { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
108 { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
109 { CUDA_ERROR_UNKNOWN, "unknown" }
110 };
111
112 static char errmsg[128];
113
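/* Return a human-readable message for CUDA result code R.  Unknown codes
   are formatted into a static buffer, which is adequate here because the
   result is only used for diagnostics on the way to a fatal error.  */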
114 static char *
115 cuda_error (CUresult r)
116 {
117 int i;
118
119 for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
120 {
121 if (cuda_errlist[i].r == r)
122 return &cuda_errlist[i].m[0];
123 }
124
125 sprintf (&errmsg[0], "unknown result code: %5d", r);
126
127 return &errmsg[0];
128 }
129
130 static unsigned int instantiated_devices = 0;
131 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
132
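/* A CUDA stream together with a page of page-locked host memory (H, with
   device alias D) used as a circular staging buffer for kernel argument
   blocks: H_BEGIN/H_END bound the buffer, H_NEXT is where the next block is
   carved out, H_PREV is the most recently pushed block, and H_TAIL is the
   oldest block still in flight (see map_push/map_pop below).  */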
133 struct ptx_stream
134 {
135 CUstream stream;
136 pthread_t host_thread;
137 bool multithreaded;
138
139 CUdeviceptr d;
140 void *h;
141 void *h_begin;
142 void *h_end;
143 void *h_next;
144 void *h_prev;
145 void *h_tail;
146
147 struct ptx_stream *next;
148 };
149
150 /* Thread-specific data for PTX. */
151
152 struct nvptx_thread
153 {
154 struct ptx_stream *current_stream;
155 struct ptx_device *ptx_dev;
156 };
157
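/* Header of one entry in a stream's staging buffer: the async queue it was
   pushed for, its total size (header included), and the payload itself.  */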
158 struct map
159 {
160 int async;
161 size_t size;
162 char mappings[0];
163 };
164
165 static void
166 map_init (struct ptx_stream *s)
167 {
168 CUresult r;
169
170 int size = getpagesize ();
171
172 assert (s);
173 assert (!s->d);
174 assert (!s->h);
175
176 r = cuMemAllocHost (&s->h, size);
177 if (r != CUDA_SUCCESS)
178 GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));
179
180 r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
181 if (r != CUDA_SUCCESS)
182 GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));
183
184 assert (s->h);
185
186 s->h_begin = s->h;
187 s->h_end = s->h_begin + size;
188 s->h_next = s->h_prev = s->h_tail = s->h_begin;
189
190 assert (s->h_next);
191 assert (s->h_end);
192 }
193
194 static void
195 map_fini (struct ptx_stream *s)
196 {
197 CUresult r;
198
199 r = cuMemFreeHost (s->h);
200 if (r != CUDA_SUCCESS)
201 GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
202 }
203
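/* Retire the oldest entry in stream S's staging buffer by advancing the tail
   pointer past it, wrapping back to the start of the buffer if needed.  */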
204 static void
205 map_pop (struct ptx_stream *s)
206 {
207 struct map *m;
208
209 assert (s != NULL);
210 assert (s->h_next);
211 assert (s->h_prev);
212 assert (s->h_tail);
213
214 m = s->h_tail;
215
216 s->h_tail += m->size;
217
218 if (s->h_tail >= s->h_end)
219 s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
220
221 if (s->h_next == s->h_tail)
222 s->h_prev = s->h_next;
223
224 assert (s->h_next >= s->h_begin);
225 assert (s->h_tail >= s->h_begin);
226 assert (s->h_prev >= s->h_begin);
227
228 assert (s->h_next <= s->h_end);
229 assert (s->h_tail <= s->h_end);
230 assert (s->h_prev <= s->h_end);
231 }
232
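/* Reserve SIZE bytes (plus a struct map header) in stream S's staging buffer
   on behalf of async queue ASYNC, wrapping to the start of the buffer when
   the remaining space is too small, and return host/device pointers to the
   payload in *H and *D.  */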
233 static void
234 map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
235 {
236 int left;
237 int offset;
238 struct map *m;
239
240 assert (s != NULL);
241
242 left = s->h_end - s->h_next;
243 size += sizeof (struct map);
244
245 assert (s->h_prev);
246 assert (s->h_next);
247
248 if (size >= left)
249 {
250 m = s->h_prev;
251 m->size += left;
252 s->h_next = s->h_begin;
253
254 if (s->h_next + size > s->h_end)
255 GOMP_PLUGIN_fatal ("unable to push map");
256 }
257
258 assert (s->h_next);
259
260 m = s->h_next;
261 m->async = async;
262 m->size = size;
263
264 offset = (void *)&m->mappings[0] - s->h;
265
266 *d = (void *)(s->d + offset);
267 *h = (void *)(s->h + offset);
268
269 s->h_prev = s->h_next;
270 s->h_next += size;
271
272 assert (s->h_prev);
273 assert (s->h_next);
274
275 assert (s->h_next >= s->h_begin);
276 assert (s->h_tail >= s->h_begin);
277 assert (s->h_prev >= s->h_begin);
278 assert (s->h_next <= s->h_end);
279 assert (s->h_tail <= s->h_end);
280 assert (s->h_prev <= s->h_end);
281
282 return;
283 }
284
285 /* Descriptor of a loaded function. */
286
287 struct targ_fn_descriptor
288 {
289 CUfunction fn;
290 const char *name;
291 };
292
293 /* A loaded PTX image. */
294 struct ptx_image_data
295 {
296 const void *target_data;
297 CUmodule module;
298
299 struct targ_fn_descriptor *fns; /* Array of functions. */
300
301 struct ptx_image_data *next;
302 };
303
304 struct ptx_device
305 {
306 CUcontext ctx;
307 bool ctx_shared;
308 CUdevice dev;
309 struct ptx_stream *null_stream;
310 /* All non-null streams associated with this device (actually context),
311 either created implicitly or passed in from the user (via
312 acc_set_cuda_stream). */
313 struct ptx_stream *active_streams;
314 struct {
315 struct ptx_stream **arr;
316 int size;
317 } async_streams;
318 /* A lock for use when manipulating the above stream list and array. */
319 pthread_mutex_t stream_lock;
320 int ord;
321 bool overlap;
322 bool map;
323 bool concur;
324 int mode;
325 bool mkern;
326
327 struct ptx_image_data *images; /* Images loaded on device. */
328 pthread_mutex_t image_lock; /* Lock for above list. */
329
330 struct ptx_device *next;
331 };
332
333 enum ptx_event_type
334 {
335 PTX_EVT_MEM,
336 PTX_EVT_KNL,
337 PTX_EVT_SYNC,
338 PTX_EVT_ASYNC_CLEANUP
339 };
340
341 struct ptx_event
342 {
343 CUevent *evt;
344 int type;
345 void *addr;
346 int ord;
347
348 struct ptx_event *next;
349 };
350
351 static pthread_mutex_t ptx_event_lock;
352 static struct ptx_event *ptx_events;
353
354 static struct ptx_device **ptx_devices;
355
356 #define _XSTR(s) _STR(s)
357 #define _STR(s) #s
358
359 static struct _synames
360 {
361 char *n;
362 } cuda_symnames[] =
363 {
364 { _XSTR (cuCtxCreate) },
365 { _XSTR (cuCtxDestroy) },
366 { _XSTR (cuCtxGetCurrent) },
367 { _XSTR (cuCtxPushCurrent) },
368 { _XSTR (cuCtxSynchronize) },
369 { _XSTR (cuDeviceGet) },
370 { _XSTR (cuDeviceGetAttribute) },
371 { _XSTR (cuDeviceGetCount) },
372 { _XSTR (cuEventCreate) },
373 { _XSTR (cuEventDestroy) },
374 { _XSTR (cuEventQuery) },
375 { _XSTR (cuEventRecord) },
376 { _XSTR (cuInit) },
377 { _XSTR (cuLaunchKernel) },
378 { _XSTR (cuLinkAddData) },
379 { _XSTR (cuLinkComplete) },
380 { _XSTR (cuLinkCreate) },
381 { _XSTR (cuMemAlloc) },
382 { _XSTR (cuMemAllocHost) },
383 { _XSTR (cuMemcpy) },
384 { _XSTR (cuMemcpyDtoH) },
385 { _XSTR (cuMemcpyDtoHAsync) },
386 { _XSTR (cuMemcpyHtoD) },
387 { _XSTR (cuMemcpyHtoDAsync) },
388 { _XSTR (cuMemFree) },
389 { _XSTR (cuMemFreeHost) },
390 { _XSTR (cuMemGetAddressRange) },
391 { _XSTR (cuMemHostGetDevicePointer) },
392 { _XSTR (cuMemHostRegister) },
393 { _XSTR (cuMemHostUnregister) },
394 { _XSTR (cuModuleGetFunction) },
395 { _XSTR (cuModuleLoadData) },
396 { _XSTR (cuStreamDestroy) },
397 { _XSTR (cuStreamQuery) },
398 { _XSTR (cuStreamSynchronize) },
399 { _XSTR (cuStreamWaitEvent) }
400 };
401
402 static int
403 verify_device_library (void)
404 {
405 int i;
406 void *dh, *ds;
407
408 dh = dlopen ("libcuda.so", RTLD_LAZY);
409 if (!dh)
410 return -1;
411
412 for (i = 0; i < ARRAYSIZE (cuda_symnames); i++)
413 {
414 ds = dlsym (dh, cuda_symnames[i].n);
415 if (!ds)
416 return -1;
417 }
418
419 dlclose (dh);
420
421 return 0;
422 }
423
424 static inline struct nvptx_thread *
425 nvptx_thread (void)
426 {
427 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
428 }
429
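/* Set up the null stream and an (initially empty) table of CONCURRENCY async
   stream slots for PTX_DEV.  */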
430 static void
431 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
432 {
433 int i;
434 struct ptx_stream *null_stream
435 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
436
437 null_stream->stream = NULL;
438 null_stream->host_thread = pthread_self ();
439 null_stream->multithreaded = true;
440 null_stream->d = (CUdeviceptr) NULL;
441 null_stream->h = NULL;
442 map_init (null_stream);
443 ptx_dev->null_stream = null_stream;
444
445 ptx_dev->active_streams = NULL;
446 pthread_mutex_init (&ptx_dev->stream_lock, NULL);
447
448 if (concurrency < 1)
449 concurrency = 1;
450
451 /* This is just a guess -- make space for as many async streams as the
452 current device is capable of concurrently executing. This can grow
453 later as necessary. No streams are created yet. */
454 ptx_dev->async_streams.arr
455 = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
456 ptx_dev->async_streams.size = concurrency;
457
458 for (i = 0; i < concurrency; i++)
459 ptx_dev->async_streams.arr[i] = NULL;
460 }
461
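/* Tear down all streams of PTX_DEV, including the null stream, releasing
   their staging buffers.  */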
462 static void
463 fini_streams_for_device (struct ptx_device *ptx_dev)
464 {
465 free (ptx_dev->async_streams.arr);
466
467 while (ptx_dev->active_streams != NULL)
468 {
469 struct ptx_stream *s = ptx_dev->active_streams;
470 ptx_dev->active_streams = ptx_dev->active_streams->next;
471
472 map_fini (s);
473 cuStreamDestroy (s->stream);
474 free (s);
475 }
476
477 map_fini (ptx_dev->null_stream);
478 free (ptx_dev->null_stream);
479 }
480
481 /* Select a stream for the (OpenACC-semantics) ASYNC argument on behalf of
482 the current thread THREAD (and the current device/context).  If CREATE is
483 true, create the stream if it does not yet exist (or use EXISTING if that
484 is non-NULL), and associate the stream with THREAD.  Return the stream to
485 use.  */
486
487 static struct ptx_stream *
488 select_stream_for_async (int async, pthread_t thread, bool create,
489 CUstream existing)
490 {
491 struct nvptx_thread *nvthd = nvptx_thread ();
492 /* Local copy of TLS variable. */
493 struct ptx_device *ptx_dev = nvthd->ptx_dev;
494 struct ptx_stream *stream = NULL;
495 int orig_async = async;
496
497 /* The special value acc_async_noval (-1) maps (for now) to an
498 implicitly-created stream, which is then handled the same as any other
499 numbered async stream. Other options are available, e.g. using the null
500 stream for anonymous async operations, or choosing an idle stream from an
501 active set. But, stick with this for now. */
502 if (async > acc_async_sync)
503 async++;
504
505 if (create)
506 pthread_mutex_lock (&ptx_dev->stream_lock);
507
508 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
509 null stream, and in fact better performance may be obtainable if it doesn't
510 (because the null stream enforces overly-strict synchronisation with
511 respect to other streams for legacy reasons, and that's probably not
512 needed with OpenACC). Maybe investigate later. */
513 if (async == acc_async_sync)
514 stream = ptx_dev->null_stream;
515 else if (async >= 0 && async < ptx_dev->async_streams.size
516 && ptx_dev->async_streams.arr[async] && !(create && existing))
517 stream = ptx_dev->async_streams.arr[async];
518 else if (async >= 0 && create)
519 {
520 if (async >= ptx_dev->async_streams.size)
521 {
522 int i, newsize = ptx_dev->async_streams.size * 2;
523
524 if (async >= newsize)
525 newsize = async + 1;
526
527 ptx_dev->async_streams.arr
528 = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
529 newsize * sizeof (struct ptx_stream *));
530
531 for (i = ptx_dev->async_streams.size; i < newsize; i++)
532 ptx_dev->async_streams.arr[i] = NULL;
533
534 ptx_dev->async_streams.size = newsize;
535 }
536
537 /* Create a new stream on-demand if there isn't one already, or if we're
538 setting a particular async value to an existing (externally-provided)
539 stream. */
540 if (!ptx_dev->async_streams.arr[async] || existing)
541 {
542 CUresult r;
543 struct ptx_stream *s
544 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
545
546 if (existing)
547 s->stream = existing;
548 else
549 {
550 r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
551 if (r != CUDA_SUCCESS)
552 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
553 }
554
555 /* If CREATE is true, we're going to be queueing some work on this
556 stream. Associate it with the current host thread. */
557 s->host_thread = thread;
558 s->multithreaded = false;
559
560 s->d = (CUdeviceptr) NULL;
561 s->h = NULL;
562 map_init (s);
563
564 s->next = ptx_dev->active_streams;
565 ptx_dev->active_streams = s;
566 ptx_dev->async_streams.arr[async] = s;
567 }
568
569 stream = ptx_dev->async_streams.arr[async];
570 }
571 else if (async < 0)
572 GOMP_PLUGIN_fatal ("bad async %d", async);
573
574 if (create)
575 {
576 assert (stream != NULL);
577
578 /* If we're trying to use the same stream from different threads
579 simultaneously, set stream->multithreaded to true. This affects the
580 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
581 only wait for asynchronous launches from the same host thread they are
582 invoked on. If multiple threads use the same async value, we make note
583 of that here and fall back to testing/waiting for all threads in those
584 functions. */
585 if (thread != stream->host_thread)
586 stream->multithreaded = true;
587
588 pthread_mutex_unlock (&ptx_dev->stream_lock);
589 }
590 else if (stream && !stream->multithreaded
591 && !pthread_equal (stream->host_thread, thread))
592 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
593
594 return stream;
595 }
596
597 /* Initialize the CUDA driver and the table of devices.  Return TRUE on
598 success, else FALSE.  PTX_DEV_LOCK should be locked on entry and remains locked on exit.  */
599
600 static bool
601 nvptx_init (void)
602 {
603 CUresult r;
604 int rc;
605 int ndevs;
606
607 if (instantiated_devices != 0)
608 return true;
609
610 rc = verify_device_library ();
611 if (rc < 0)
612 return false;
613
614 r = cuInit (0);
615 if (r != CUDA_SUCCESS)
616 GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));
617
618 ptx_events = NULL;
619
620 pthread_mutex_init (&ptx_event_lock, NULL);
621
622 r = cuDeviceGetCount (&ndevs);
623 if (r != CUDA_SUCCESS)
624 GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
625
626 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
627 * ndevs);
628
629 return true;
630 }
631
632 /* Select the N'th PTX device for the current host thread.  The device must
633 have been opened before this function is called.  */
634
635 static void
636 nvptx_attach_host_thread_to_device (int n)
637 {
638 CUdevice dev;
639 CUresult r;
640 struct ptx_device *ptx_dev;
641 CUcontext thd_ctx;
642
643 r = cuCtxGetDevice (&dev);
644 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
645 GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
646
647 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
648 return;
649 else
650 {
651 CUcontext old_ctx;
652
653 ptx_dev = ptx_devices[n];
654 assert (ptx_dev);
655
656 r = cuCtxGetCurrent (&thd_ctx);
657 if (r != CUDA_SUCCESS)
658 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
659
660 /* We don't necessarily have a current context (e.g. if it has been
661 destroyed).  Pop it if we do, though.  */
662 if (thd_ctx != NULL)
663 {
664 r = cuCtxPopCurrent (&old_ctx);
665 if (r != CUDA_SUCCESS)
666 GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
667 }
668
669 r = cuCtxPushCurrent (ptx_dev->ctx);
670 if (r != CUDA_SUCCESS)
671 GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
672 }
673 }
674
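/* Open PTX device N: reuse the host thread's current CUDA context if there is
   one (marking it shared), otherwise create a new context; then query the
   device attributes of interest and set up stream bookkeeping.  */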
675 static struct ptx_device *
676 nvptx_open_device (int n)
677 {
678 struct ptx_device *ptx_dev;
679 CUdevice dev, ctx_dev;
680 CUresult r;
681 int async_engines, pi;
682
683 r = cuDeviceGet (&dev, n);
684 if (r != CUDA_SUCCESS)
685 GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));
686
687 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
688
689 ptx_dev->ord = n;
690 ptx_dev->dev = dev;
691 ptx_dev->ctx_shared = false;
692
693 r = cuCtxGetDevice (&ctx_dev);
694 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
695 GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
696
697 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
698 {
699 /* The current host thread has an active context for a different device.
700 Detach it. */
701 CUcontext old_ctx;
702
703 r = cuCtxPopCurrent (&old_ctx);
704 if (r != CUDA_SUCCESS)
705 GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
706 }
707
708 r = cuCtxGetCurrent (&ptx_dev->ctx);
709 if (r != CUDA_SUCCESS)
710 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
711
712 if (!ptx_dev->ctx)
713 {
714 r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
715 if (r != CUDA_SUCCESS)
716 GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
717 }
718 else
719 ptx_dev->ctx_shared = true;
720
721 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
722 if (r != CUDA_SUCCESS)
723 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
724
725 ptx_dev->overlap = pi;
726
727 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
728 if (r != CUDA_SUCCESS)
729 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
730
731 ptx_dev->map = pi;
732
733 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
734 if (r != CUDA_SUCCESS)
735 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
736
737 ptx_dev->concur = pi;
738
739 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
740 if (r != CUDA_SUCCESS)
741 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
742
743 ptx_dev->mode = pi;
744
745 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
746 if (r != CUDA_SUCCESS)
747 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
748
749 ptx_dev->mkern = pi;
750
751 r = cuDeviceGetAttribute (&async_engines,
752 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
753 if (r != CUDA_SUCCESS)
754 async_engines = 1;
755
756 ptx_dev->images = NULL;
757 pthread_mutex_init (&ptx_dev->image_lock, NULL);
758
759 init_streams_for_device (ptx_dev, async_engines);
760
761 return ptx_dev;
762 }
763
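/* Close PTX_DEV: tear down its streams and, unless its context was shared
   with the application, destroy the CUDA context.  */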
764 static void
765 nvptx_close_device (struct ptx_device *ptx_dev)
766 {
767 CUresult r;
768
769 if (!ptx_dev)
770 return;
771
772 fini_streams_for_device (ptx_dev);
773
774 pthread_mutex_destroy (&ptx_dev->image_lock);
775
776 if (!ptx_dev->ctx_shared)
777 {
778 r = cuCtxDestroy (ptx_dev->ctx);
779 if (r != CUDA_SUCCESS)
780 GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
781 }
782
783 free (ptx_dev);
784 }
785
786 static int
787 nvptx_get_num_devices (void)
788 {
789 int n;
790 CUresult r;
791
792 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
793 configurations. */
794 if (sizeof (void *) != 8)
795 return 0;
796
797 /* This function will be called before the plugin has been initialized in
798 order to enumerate available devices, but CUDA API routines can't be used
799 until cuInit has been called. Just call it now (but don't yet do any
800 further initialization). */
801 if (instantiated_devices == 0)
802 {
803 r = cuInit (0);
804 /* This is not an error: e.g. we may have CUDA libraries installed but
805 no devices available. */
806 if (r != CUDA_SUCCESS)
807 return 0;
808 }
809
810 r = cuDeviceGetCount (&n);
811 if (r != CUDA_SUCCESS)
812 GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
813
814 return n;
815 }
816
817
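/* JIT-link the PTX fragments provided by libgomp (abort, acc_on_device and
   the GOACC internal helpers) together with PTX_CODE, targeting compute
   capability 3.0, and load the result as *MODULE.  */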
818 static void
819 link_ptx (CUmodule *module, const char *ptx_code)
820 {
821 CUjit_option opts[7];
822 void *optvals[7];
823 float elapsed = 0.0;
824 #define LOGSIZE 8192
825 char elog[LOGSIZE];
826 char ilog[LOGSIZE];
827 unsigned long logsize = LOGSIZE;
828 CUlinkState linkstate;
829 CUresult r;
830 void *linkout;
831 size_t linkoutsize __attribute__ ((unused));
832
833 GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);
834
835 opts[0] = CU_JIT_WALL_TIME;
836 optvals[0] = &elapsed;
837
838 opts[1] = CU_JIT_INFO_LOG_BUFFER;
839 optvals[1] = &ilog[0];
840
841 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
842 optvals[2] = (void *) logsize;
843
844 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
845 optvals[3] = &elog[0];
846
847 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
848 optvals[4] = (void *) logsize;
849
850 opts[5] = CU_JIT_LOG_VERBOSE;
851 optvals[5] = (void *) 1;
852
853 opts[6] = CU_JIT_TARGET;
854 optvals[6] = (void *) CU_TARGET_COMPUTE_30;
855
856 r = cuLinkCreate (7, opts, optvals, &linkstate);
857 if (r != CUDA_SUCCESS)
858 GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));
859
860 char *abort_ptx = ABORT_PTX;
861 r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
862 strlen (abort_ptx) + 1, 0, 0, 0, 0);
863 if (r != CUDA_SUCCESS)
864 {
865 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
866 GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
867 }
868
869 char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
870 r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
871 strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
872 if (r != CUDA_SUCCESS)
873 {
874 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
875 GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
876 cuda_error (r));
877 }
878
879 char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
880 r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
881 strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
882 if (r != CUDA_SUCCESS)
883 {
884 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
885 GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
886 cuda_error (r));
887 }
888
889 /* cuLinkAddData's 'data' argument erroneously omits the const qualifier. */
890 r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *)ptx_code,
891 strlen (ptx_code) + 1, 0, 0, 0, 0);
892 if (r != CUDA_SUCCESS)
893 {
894 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
895 GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
896 }
897
898 r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
899 if (r != CUDA_SUCCESS)
900 GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));
901
902 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
903 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
904
905 r = cuModuleLoadData (module, linkout);
906 if (r != CUDA_SUCCESS)
907 GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
908 }
909
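/* Garbage-collect completed events belonging to the current device: destroy
   each completed CUDA event, retire the kernel-argument mapping or perform
   the deferred unmap it refers to, and unlink it from the global list.
   MEMMAP_LOCKABLE says whether it is safe to take the memory-map lock, which
   processing PTX_EVT_ASYNC_CLEANUP events requires.  */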
910 static void
911 event_gc (bool memmap_lockable)
912 {
913 struct ptx_event *ptx_event = ptx_events;
914 struct nvptx_thread *nvthd = nvptx_thread ();
915
916 pthread_mutex_lock (&ptx_event_lock);
917
918 while (ptx_event != NULL)
919 {
920 CUresult r;
921 struct ptx_event *e = ptx_event;
922
923 ptx_event = ptx_event->next;
924
925 if (e->ord != nvthd->ptx_dev->ord)
926 continue;
927
928 r = cuEventQuery (*e->evt);
929 if (r == CUDA_SUCCESS)
930 {
931 CUevent *te;
932
933 te = e->evt;
934
935 switch (e->type)
936 {
937 case PTX_EVT_MEM:
938 case PTX_EVT_SYNC:
939 break;
940
941 case PTX_EVT_KNL:
942 map_pop (e->addr);
943 break;
944
945 case PTX_EVT_ASYNC_CLEANUP:
946 {
947 /* The function gomp_plugin_async_unmap_vars needs to claim the
948 memory-map splay tree lock for the current device, so we
949 can't call it when one of our callers has already claimed
950 the lock. In that case, just delay the GC for this event
951 until later. */
952 if (!memmap_lockable)
953 continue;
954
955 GOMP_PLUGIN_async_unmap_vars (e->addr);
956 }
957 break;
958 }
959
960 cuEventDestroy (*te);
961 free ((void *)te);
962
963 if (ptx_events == e)
964 ptx_events = ptx_events->next;
965 else
966 {
967 struct ptx_event *e_ = ptx_events;
968 while (e_->next != e)
969 e_ = e_->next;
970 e_->next = e_->next->next;
971 }
972
973 free (e);
974 }
975 }
976
977 pthread_mutex_unlock (&ptx_event_lock);
978 }
979
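/* Record a pending event E of kind TYPE, with associated address H, on the
   global event list, tagged with the current device's ordinal.  */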
980 static void
981 event_add (enum ptx_event_type type, CUevent *e, void *h)
982 {
983 struct ptx_event *ptx_event;
984 struct nvptx_thread *nvthd = nvptx_thread ();
985
986 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
987 || type == PTX_EVT_ASYNC_CLEANUP);
988
989 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
990 ptx_event->type = type;
991 ptx_event->evt = e;
992 ptx_event->addr = h;
993 ptx_event->ord = nvthd->ptx_dev->ord;
994
995 pthread_mutex_lock (&ptx_event_lock);
996
997 ptx_event->next = ptx_events;
998 ptx_events = ptx_event;
999
1000 pthread_mutex_unlock (&ptx_event_lock);
1001 }
1002
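/* Launch the offloaded function FN on the stream selected by ASYNC, passing
   it the MAPNUM device pointers in DEVADDRS via a block reserved in that
   stream's staging buffer.  NUM_GANGS maps to the CUDA grid size and
   VECTOR_LENGTH to the block size; synchronous launches block here until the
   kernel finishes, otherwise an event is recorded so the staging block can be
   reclaimed once the kernel completes.  */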
1003 void
1004 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
1005 size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
1006 int vector_length, int async, void *targ_mem_desc)
1007 {
1008 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
1009 CUfunction function;
1010 CUresult r;
1011 int i;
1012 struct ptx_stream *dev_str;
1013 void *kargs[1];
1014 void *hp, *dp;
1015 unsigned int nthreads_in_block;
1016 struct nvptx_thread *nvthd = nvptx_thread ();
1017 const char *maybe_abort_msg = "(perhaps abort was called)";
1018
1019 function = targ_fn->fn;
1020
1021 dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
1022 assert (dev_str == nvthd->current_stream);
1023
1024 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1025 the host and the device. HP is a host pointer to the new chunk, and DP is
1026 the corresponding device pointer. */
1027 map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
1028
1029 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1030
1031 /* Copy the array of arguments to the mapped page. */
1032 for (i = 0; i < mapnum; i++)
1033 ((void **) hp)[i] = devaddrs[i];
1034
1035 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1036 fact have the same value on a unified-memory system). */
1037 r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
1038 if (r != CUDA_SUCCESS)
1039 GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));
1040
1041 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);
1042
1043 // OpenACC CUDA
1044 //
1045 // num_gangs blocks
1046 // num_workers warps (where a warp is equivalent to 32 threads)
1047 // vector length threads
1048 //
1049
1050 /* The OpenACC vector_length clause 'determines the vector length to use for
1051 vector or SIMD operations'.  The question is how to map this to CUDA.
1052 
1053 In CUDA, the warp size is the vector length of a CUDA device.  However, the
1054 CUDA interface abstracts away from that, and only shows us warp size
1055 indirectly in the maximum number of threads per block, which is a product of
1056 warp size and the number of hyperthreads of a multiprocessor.
1057 
1058 We choose to map OpenACC vector_length directly onto the number of threads
1059 in a block, in the x dimension.  This is reflected in GCC code generation
1060 that uses threadIdx.x to access vector elements.
1061 
1062 Attempting to use an OpenACC vector_length of more than the maximum number
1063 of threads per block will result in a CUDA error.  */
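/* For example (illustrative values only): a region compiled with
   num_gangs(32) and vector_length(128) is launched below as a 32x1x1 grid of
   128x1x1-thread blocks.  */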
1064 nthreads_in_block = vector_length;
1065
1066 kargs[0] = &dp;
1067 r = cuLaunchKernel (function,
1068 num_gangs, 1, 1,
1069 nthreads_in_block, 1, 1,
1070 0, dev_str->stream, kargs, 0);
1071 if (r != CUDA_SUCCESS)
1072 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
1073
1074 #ifndef DISABLE_ASYNC
1075 if (async < acc_async_noval)
1076 {
1077 r = cuStreamSynchronize (dev_str->stream);
1078 if (r == CUDA_ERROR_LAUNCH_FAILED)
1079 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1080 maybe_abort_msg);
1081 else if (r != CUDA_SUCCESS)
1082 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1083 }
1084 else
1085 {
1086 CUevent *e;
1087
1088 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1089
1090 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1091 if (r == CUDA_ERROR_LAUNCH_FAILED)
1092 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1093 maybe_abort_msg);
1094 else if (r != CUDA_SUCCESS)
1095 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1096
1097 event_gc (true);
1098
1099 r = cuEventRecord (*e, dev_str->stream);
1100 if (r != CUDA_SUCCESS)
1101 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1102
1103 event_add (PTX_EVT_KNL, e, (void *)dev_str);
1104 }
1105 #else
1106 r = cuCtxSynchronize ();
1107 if (r == CUDA_ERROR_LAUNCH_FAILED)
1108 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1109 maybe_abort_msg);
1110 else if (r != CUDA_SUCCESS)
1111 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1112 #endif
1113
1114 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1115 targ_fn->name);
1116
1117 #ifndef DISABLE_ASYNC
1118 if (async < acc_async_noval)
1119 #endif
1120 map_pop (dev_str);
1121 }
1122
1123 void * openacc_get_current_cuda_context (void);
1124
1125 static void *
1126 nvptx_alloc (size_t s)
1127 {
1128 CUdeviceptr d;
1129 CUresult r;
1130
1131 r = cuMemAlloc (&d, s);
1132 if (r == CUDA_ERROR_OUT_OF_MEMORY)
1133 return 0;
1134 if (r != CUDA_SUCCESS)
1135 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1136 return (void *)d;
1137 }
1138
1139 static void
1140 nvptx_free (void *p)
1141 {
1142 CUresult r;
1143 CUdeviceptr pb;
1144 size_t ps;
1145
1146 r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
1147 if (r != CUDA_SUCCESS)
1148 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
1149
1150 if ((CUdeviceptr)p != pb)
1151 GOMP_PLUGIN_fatal ("invalid device address");
1152
1153 r = cuMemFree ((CUdeviceptr)p);
1154 if (r != CUDA_SUCCESS)
1155 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1156 }
1157
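/* Copy S bytes from host address H to device address D.  When the current
   stream is not the null stream the copy is issued asynchronously on it and
   an event is recorded for later cleanup; otherwise a blocking copy is
   used.  */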
1158 static void *
1159 nvptx_host2dev (void *d, const void *h, size_t s)
1160 {
1161 CUresult r;
1162 CUdeviceptr pb;
1163 size_t ps;
1164 struct nvptx_thread *nvthd = nvptx_thread ();
1165
1166 if (!s)
1167 return 0;
1168
1169 if (!d)
1170 GOMP_PLUGIN_fatal ("invalid device address");
1171
1172 r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
1173 if (r != CUDA_SUCCESS)
1174 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
1175
1176 if (!pb)
1177 GOMP_PLUGIN_fatal ("invalid device address");
1178
1179 if (!h)
1180 GOMP_PLUGIN_fatal ("invalid host address");
1181
1182 if (d == h)
1183 GOMP_PLUGIN_fatal ("invalid host or device address");
1184
1185 if ((void *)(d + s) > (void *)(pb + ps))
1186 GOMP_PLUGIN_fatal ("invalid size");
1187
1188 #ifndef DISABLE_ASYNC
1189 if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
1190 {
1191 CUevent *e;
1192
1193 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1194
1195 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1196 if (r != CUDA_SUCCESS)
1197 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1198
1199 event_gc (false);
1200
1201 r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
1202 nvthd->current_stream->stream);
1203 if (r != CUDA_SUCCESS)
1204 GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));
1205
1206 r = cuEventRecord (*e, nvthd->current_stream->stream);
1207 if (r != CUDA_SUCCESS)
1208 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1209
1210 event_add (PTX_EVT_MEM, e, (void *)h);
1211 }
1212 else
1213 #endif
1214 {
1215 r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
1216 if (r != CUDA_SUCCESS)
1217 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1218 }
1219
1220 return 0;
1221 }
1222
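/* Copy S bytes from device address D to host address H, asynchronously on the
   current stream when one is selected (with an event recorded for tracking),
   blocking otherwise.  */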
1223 static void *
1224 nvptx_dev2host (void *h, const void *d, size_t s)
1225 {
1226 CUresult r;
1227 CUdeviceptr pb;
1228 size_t ps;
1229 struct nvptx_thread *nvthd = nvptx_thread ();
1230
1231 if (!s)
1232 return 0;
1233
1234 if (!d)
1235 GOMP_PLUGIN_fatal ("invalid device address");
1236
1237 r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
1238 if (r != CUDA_SUCCESS)
1239 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
1240
1241 if (!pb)
1242 GOMP_PLUGIN_fatal ("invalid device address");
1243
1244 if (!h)
1245 GOMP_PLUGIN_fatal ("invalid host address");
1246
1247 if (d == h)
1248 GOMP_PLUGIN_fatal ("invalid host or device address");
1249
1250 if ((void *)(d + s) > (void *)(pb + ps))
1251 GOMP_PLUGIN_fatal ("invalid size");
1252
1253 #ifndef DISABLE_ASYNC
1254 if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
1255 {
1256 CUevent *e;
1257
1258 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1259
1260 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1261 if (r != CUDA_SUCCESS)
1262 GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));
1263
1264 event_gc (false);
1265
1266 r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
1267 nvthd->current_stream->stream);
1268 if (r != CUDA_SUCCESS)
1269 GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));
1270
1271 r = cuEventRecord (*e, nvthd->current_stream->stream);
1272 if (r != CUDA_SUCCESS)
1273 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1274
1275 event_add (PTX_EVT_MEM, e, (void *)h);
1276 }
1277 else
1278 #endif
1279 {
1280 r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
1281 if (r != CUDA_SUCCESS)
1282 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1283 }
1284
1285 return 0;
1286 }
1287
1288 static void
1289 nvptx_set_async (int async)
1290 {
1291 struct nvptx_thread *nvthd = nvptx_thread ();
1292 nvthd->current_stream
1293 = select_stream_for_async (async, pthread_self (), true, NULL);
1294 }
1295
1296 static int
1297 nvptx_async_test (int async)
1298 {
1299 CUresult r;
1300 struct ptx_stream *s;
1301
1302 s = select_stream_for_async (async, pthread_self (), false, NULL);
1303
1304 if (!s)
1305 GOMP_PLUGIN_fatal ("unknown async %d", async);
1306
1307 r = cuStreamQuery (s->stream);
1308 if (r == CUDA_SUCCESS)
1309 {
1310 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1311 whether all work has completed on this stream, and if so omits the call
1312 to the wait hook. If that happens, event_gc might not get called
1313 (which prevents variables from getting unmapped and their associated
1314 device storage freed), so call it here. */
1315 event_gc (true);
1316 return 1;
1317 }
1318 else if (r == CUDA_ERROR_NOT_READY)
1319 return 0;
1320
1321 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1322
1323 return 0;
1324 }
1325
1326 static int
1327 nvptx_async_test_all (void)
1328 {
1329 struct ptx_stream *s;
1330 pthread_t self = pthread_self ();
1331 struct nvptx_thread *nvthd = nvptx_thread ();
1332
1333 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1334
1335 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1336 {
1337 if ((s->multithreaded || pthread_equal (s->host_thread, self))
1338 && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
1339 {
1340 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1341 return 0;
1342 }
1343 }
1344
1345 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1346
1347 event_gc (true);
1348
1349 return 1;
1350 }
1351
1352 static void
1353 nvptx_wait (int async)
1354 {
1355 CUresult r;
1356 struct ptx_stream *s;
1357
1358 s = select_stream_for_async (async, pthread_self (), false, NULL);
1359
1360 if (!s)
1361 GOMP_PLUGIN_fatal ("unknown async %d", async);
1362
1363 r = cuStreamSynchronize (s->stream);
1364 if (r != CUDA_SUCCESS)
1365 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1366
1367 event_gc (true);
1368 }
1369
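/* Make async queue ASYNC2 wait for the work currently queued on async queue
   ASYNC1, by recording an event on ASYNC1's stream and making ASYNC2's stream
   wait for it.  */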
1370 static void
1371 nvptx_wait_async (int async1, int async2)
1372 {
1373 CUresult r;
1374 CUevent *e;
1375 struct ptx_stream *s1, *s2;
1376 pthread_t self = pthread_self ();
1377
1378 /* The stream that is waiting (rather than being waited for) doesn't
1379 necessarily have to exist already. */
1380 s2 = select_stream_for_async (async2, self, true, NULL);
1381
1382 s1 = select_stream_for_async (async1, self, false, NULL);
1383 if (!s1)
1384 GOMP_PLUGIN_fatal ("invalid async 1\n");
1385
1386 if (s1 == s2)
1387 GOMP_PLUGIN_fatal ("identical parameters");
1388
1389 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1390
1391 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1392 if (r != CUDA_SUCCESS)
1393 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1394
1395 event_gc (true);
1396
1397 r = cuEventRecord (*e, s1->stream);
1398 if (r != CUDA_SUCCESS)
1399 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1400
1401 event_add (PTX_EVT_SYNC, e, NULL);
1402
1403 r = cuStreamWaitEvent (s2->stream, *e, 0);
1404 if (r != CUDA_SUCCESS)
1405 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
1406 }
1407
1408 static void
1409 nvptx_wait_all (void)
1410 {
1411 CUresult r;
1412 struct ptx_stream *s;
1413 pthread_t self = pthread_self ();
1414 struct nvptx_thread *nvthd = nvptx_thread ();
1415
1416 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1417
1418 /* Wait for active streams initiated by this thread (or by multiple threads)
1419 to complete. */
1420 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1421 {
1422 if (s->multithreaded || pthread_equal (s->host_thread, self))
1423 {
1424 r = cuStreamQuery (s->stream);
1425 if (r == CUDA_SUCCESS)
1426 continue;
1427 else if (r != CUDA_ERROR_NOT_READY)
1428 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1429
1430 r = cuStreamSynchronize (s->stream);
1431 if (r != CUDA_SUCCESS)
1432 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1433 }
1434 }
1435
1436 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1437
1438 event_gc (true);
1439 }
1440
1441 static void
1442 nvptx_wait_all_async (int async)
1443 {
1444 CUresult r;
1445 struct ptx_stream *waiting_stream, *other_stream;
1446 CUevent *e;
1447 struct nvptx_thread *nvthd = nvptx_thread ();
1448 pthread_t self = pthread_self ();
1449
1450 /* The stream doing the waiting. This could be the first mention of the
1451 stream, so create it if necessary. */
1452 waiting_stream
1453 = select_stream_for_async (async, pthread_self (), true, NULL);
1454
1455 /* Launches on the null stream already block on other streams in the
1456 context. */
1457 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1458 return;
1459
1460 event_gc (true);
1461
1462 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1463
1464 for (other_stream = nvthd->ptx_dev->active_streams;
1465 other_stream != NULL;
1466 other_stream = other_stream->next)
1467 {
1468 if (!other_stream->multithreaded
1469 && !pthread_equal (other_stream->host_thread, self))
1470 continue;
1471
1472 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1473
1474 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1475 if (r != CUDA_SUCCESS)
1476 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1477
1478 /* Record an event on the waited-for stream. */
1479 r = cuEventRecord (*e, other_stream->stream);
1480 if (r != CUDA_SUCCESS)
1481 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1482
1483 event_add (PTX_EVT_SYNC, e, NULL);
1484
1485 r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
1486 if (r != CUDA_SUCCESS)
1487 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
1488 }
1489
1490 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1491 }
1492
1493 static void *
1494 nvptx_get_current_cuda_device (void)
1495 {
1496 struct nvptx_thread *nvthd = nvptx_thread ();
1497
1498 if (!nvthd || !nvthd->ptx_dev)
1499 return NULL;
1500
1501 return &nvthd->ptx_dev->dev;
1502 }
1503
1504 static void *
1505 nvptx_get_current_cuda_context (void)
1506 {
1507 struct nvptx_thread *nvthd = nvptx_thread ();
1508
1509 if (!nvthd || !nvthd->ptx_dev)
1510 return NULL;
1511
1512 return nvthd->ptx_dev->ctx;
1513 }
1514
1515 static void *
1516 nvptx_get_cuda_stream (int async)
1517 {
1518 struct ptx_stream *s;
1519 struct nvptx_thread *nvthd = nvptx_thread ();
1520
1521 if (!nvthd || !nvthd->ptx_dev)
1522 return NULL;
1523
1524 s = select_stream_for_async (async, pthread_self (), false, NULL);
1525
1526 return s ? s->stream : NULL;
1527 }
1528
1529 static int
1530 nvptx_set_cuda_stream (int async, void *stream)
1531 {
1532 struct ptx_stream *oldstream;
1533 pthread_t self = pthread_self ();
1534 struct nvptx_thread *nvthd = nvptx_thread ();
1535
1536 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1537
1538 if (async < 0)
1539 GOMP_PLUGIN_fatal ("bad async %d", async);
1540
1541 /* We have a list of active streams and an array mapping async values to
1542 entries of that list. We need to take "ownership" of the passed-in stream,
1543 and add it to our list, removing the previous entry also (if there was one)
1544 in order to prevent resource leaks. Note the potential for surprise
1545 here: maybe we should keep track of passed-in streams and leave it up to
1546 the user to tidy those up, but that doesn't work for stream handles
1547 returned from acc_get_cuda_stream above... */
1548
1549 oldstream = select_stream_for_async (async, self, false, NULL);
1550
1551 if (oldstream)
1552 {
1553 if (nvthd->ptx_dev->active_streams == oldstream)
1554 nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1555 else
1556 {
1557 struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1558 while (s->next != oldstream)
1559 s = s->next;
1560 s->next = s->next->next;
1561 }
1562
1563 cuStreamDestroy (oldstream->stream);
1564 map_fini (oldstream);
1565 free (oldstream);
1566 }
1567
1568 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1569
1570 (void) select_stream_for_async (async, self, true, (CUstream) stream);
1571
1572 return 1;
1573 }
1574
1575 /* Plugin entry points. */
1576
1577 const char *
1578 GOMP_OFFLOAD_get_name (void)
1579 {
1580 return "nvptx";
1581 }
1582
1583 unsigned int
1584 GOMP_OFFLOAD_get_caps (void)
1585 {
1586 return GOMP_OFFLOAD_CAP_OPENACC_200;
1587 }
1588
1589 int
1590 GOMP_OFFLOAD_get_type (void)
1591 {
1592 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1593 }
1594
1595 int
1596 GOMP_OFFLOAD_get_num_devices (void)
1597 {
1598 return nvptx_get_num_devices ();
1599 }
1600
1601 void
1602 GOMP_OFFLOAD_init_device (int n)
1603 {
1604 pthread_mutex_lock (&ptx_dev_lock);
1605
1606 if (!nvptx_init () || ptx_devices[n] != NULL)
1607 {
1608 pthread_mutex_unlock (&ptx_dev_lock);
1609 return;
1610 }
1611
1612 ptx_devices[n] = nvptx_open_device (n);
1613 instantiated_devices++;
1614
1615 pthread_mutex_unlock (&ptx_dev_lock);
1616 }
1617
1618 void
1619 GOMP_OFFLOAD_fini_device (int n)
1620 {
1621 pthread_mutex_lock (&ptx_dev_lock);
1622
1623 if (ptx_devices[n] != NULL)
1624 {
1625 nvptx_attach_host_thread_to_device (n);
1626 nvptx_close_device (ptx_devices[n]);
1627 ptx_devices[n] = NULL;
1628 instantiated_devices--;
1629 }
1630
1631 pthread_mutex_unlock (&ptx_dev_lock);
1632 }
1633
1634 /* Data emitted by mkoffload. */
1635
1636 typedef struct nvptx_tdata
1637 {
1638 const char *ptx_src;
1639
1640 const char *const *var_names;
1641 size_t var_num;
1642
1643 const char *const *fn_names;
1644 size_t fn_num;
1645 } nvptx_tdata_t;
1646
1647 /* Load the (partial) program described by TARGET_DATA to device
1648 number ORD. Allocate and return TARGET_TABLE. */
1649
1650 int
1651 GOMP_OFFLOAD_load_image (int ord, const void *target_data,
1652 struct addr_pair **target_table)
1653 {
1654 CUmodule module;
1655 const char *const *fn_names, *const *var_names;
1656 unsigned int fn_entries, var_entries, i, j;
1657 CUresult r;
1658 struct targ_fn_descriptor *targ_fns;
1659 struct addr_pair *targ_tbl;
1660 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1661 struct ptx_image_data *new_image;
1662 struct ptx_device *dev;
1663
1664 GOMP_OFFLOAD_init_device (ord);
1665
1666 dev = ptx_devices[ord];
1667
1668 nvptx_attach_host_thread_to_device (ord);
1669
1670 link_ptx (&module, img_header->ptx_src);
1671
1672 /* The mkoffload utility emits a struct of pointers/integers at the
1673 start of each offload image.  The arrays of kernel names and of
1674 function addresses are in one-to-one correspondence.  */
1675
1676 var_entries = img_header->var_num;
1677 var_names = img_header->var_names;
1678 fn_entries = img_header->fn_num;
1679 fn_names = img_header->fn_names;
1680
1681 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1682 * (fn_entries + var_entries));
1683 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1684 * fn_entries);
1685
1686 *target_table = targ_tbl;
1687
1688 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1689 new_image->target_data = target_data;
1690 new_image->module = module;
1691 new_image->fns = targ_fns;
1692
1693 pthread_mutex_lock (&dev->image_lock);
1694 new_image->next = dev->images;
1695 dev->images = new_image;
1696 pthread_mutex_unlock (&dev->image_lock);
1697
1698 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1699 {
1700 CUfunction function;
1701
1702 r = cuModuleGetFunction (&function, module, fn_names[i]);
1703 if (r != CUDA_SUCCESS)
1704 GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
1705
1706 targ_fns->fn = function;
1707 targ_fns->name = (const char *) fn_names[i];
1708
1709 targ_tbl->start = (uintptr_t) targ_fns;
1710 targ_tbl->end = targ_tbl->start + 1;
1711 }
1712
1713 for (j = 0; j < var_entries; j++, targ_tbl++)
1714 {
1715 CUdeviceptr var;
1716 size_t bytes;
1717
1718 r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
1719 if (r != CUDA_SUCCESS)
1720 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1721
1722 targ_tbl->start = (uintptr_t) var;
1723 targ_tbl->end = targ_tbl->start + bytes;
1724 }
1725
1726 return fn_entries + var_entries;
1727 }
1728
1729 /* Unload the program described by TARGET_DATA from device ORD, freeing the
1730 function descriptors allocated for it by GOMP_OFFLOAD_load_image.  */
1731
1732 void
1733 GOMP_OFFLOAD_unload_image (int ord, const void *target_data)
1734 {
1735 struct ptx_image_data *image, **prev_p;
1736 struct ptx_device *dev = ptx_devices[ord];
1737
1738 pthread_mutex_lock (&dev->image_lock);
1739 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1740 if (image->target_data == target_data)
1741 {
1742 *prev_p = image->next;
1743 cuModuleUnload (image->module);
1744 free (image->fns);
1745 free (image);
1746 break;
1747 }
1748 pthread_mutex_unlock (&dev->image_lock);
1749 }
1750
1751 void *
1752 GOMP_OFFLOAD_alloc (int ord, size_t size)
1753 {
1754 nvptx_attach_host_thread_to_device (ord);
1755 return nvptx_alloc (size);
1756 }
1757
1758 void
1759 GOMP_OFFLOAD_free (int ord, void *ptr)
1760 {
1761 nvptx_attach_host_thread_to_device (ord);
1762 nvptx_free (ptr);
1763 }
1764
1765 void *
1766 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1767 {
1768 nvptx_attach_host_thread_to_device (ord);
1769 return nvptx_dev2host (dst, src, n);
1770 }
1771
1772 void *
1773 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1774 {
1775 nvptx_attach_host_thread_to_device (ord);
1776 return nvptx_host2dev (dst, src, n);
1777 }
1778
1779 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
1780
1781 void
1782 GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
1783 void **hostaddrs, void **devaddrs, size_t *sizes,
1784 unsigned short *kinds, int num_gangs,
1785 int num_workers, int vector_length, int async,
1786 void *targ_mem_desc)
1787 {
1788 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
1789 num_workers, vector_length, async, targ_mem_desc);
1790 }
1791
1792 void
1793 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
1794 {
1795 CUevent *e;
1796 CUresult r;
1797 struct nvptx_thread *nvthd = nvptx_thread ();
1798
1799 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1800
1801 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1802 if (r != CUDA_SUCCESS)
1803 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1804
1805 r = cuEventRecord (*e, nvthd->current_stream->stream);
1806 if (r != CUDA_SUCCESS)
1807 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1808
1809 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
1810 }
1811
1812 int
1813 GOMP_OFFLOAD_openacc_async_test (int async)
1814 {
1815 return nvptx_async_test (async);
1816 }
1817
1818 int
1819 GOMP_OFFLOAD_openacc_async_test_all (void)
1820 {
1821 return nvptx_async_test_all ();
1822 }
1823
1824 void
1825 GOMP_OFFLOAD_openacc_async_wait (int async)
1826 {
1827 nvptx_wait (async);
1828 }
1829
1830 void
1831 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
1832 {
1833 nvptx_wait_async (async1, async2);
1834 }
1835
1836 void
1837 GOMP_OFFLOAD_openacc_async_wait_all (void)
1838 {
1839 nvptx_wait_all ();
1840 }
1841
1842 void
1843 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
1844 {
1845 nvptx_wait_all_async (async);
1846 }
1847
1848 void
1849 GOMP_OFFLOAD_openacc_async_set_async (int async)
1850 {
1851 nvptx_set_async (async);
1852 }
1853
1854 void *
1855 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1856 {
1857 struct ptx_device *ptx_dev;
1858 struct nvptx_thread *nvthd
1859 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1860 CUresult r;
1861 CUcontext thd_ctx;
1862
1863 ptx_dev = ptx_devices[ord];
1864
1865 assert (ptx_dev);
1866
1867 r = cuCtxGetCurrent (&thd_ctx);
1868 if (r != CUDA_SUCCESS)
1869 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
1870
1871 assert (ptx_dev->ctx);
1872
1873 if (!thd_ctx)
1874 {
1875 r = cuCtxPushCurrent (ptx_dev->ctx);
1876 if (r != CUDA_SUCCESS)
1877 GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
1878 }
1879
1880 nvthd->current_stream = ptx_dev->null_stream;
1881 nvthd->ptx_dev = ptx_dev;
1882
1883 return (void *) nvthd;
1884 }
1885
1886 void
1887 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1888 {
1889 free (data);
1890 }
1891
1892 void *
1893 GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
1894 {
1895 return nvptx_get_current_cuda_device ();
1896 }
1897
1898 void *
1899 GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
1900 {
1901 return nvptx_get_current_cuda_context ();
1902 }
1903
1904 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
1905
1906 void *
1907 GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
1908 {
1909 return nvptx_get_cuda_stream (async);
1910 }
1911
1912 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
1913
1914 int
1915 GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
1916 {
1917 return nvptx_set_cuda_stream (async, stream);
1918 }