/* Plugin for NVPTX execution.

   Copyright (C) 2013-2015 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */

#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-ptx.h"
#include "oacc-plugin.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <unistd.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <dlfcn.h>

#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
static struct
{
  CUresult r;
  char *m;
} cuda_errlist[] =
{
  { CUDA_ERROR_INVALID_VALUE, "invalid value" },
  { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
  { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
  { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
  { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
  { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
  { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
  { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
  { CUDA_ERROR_NO_DEVICE, "no device" },
  { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
  { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
  { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
  { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
  { CUDA_ERROR_MAP_FAILED, "map error" },
  { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
  { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
  { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
  { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
  { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
  { CUDA_ERROR_NOT_MAPPED, "not mapped" },
  { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
  { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
  { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
  { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
  { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
  { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
  { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
  { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
  { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
    "shared object symbol not found" },
  { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
  { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
  { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
  { CUDA_ERROR_NOT_FOUND, "not found" },
  { CUDA_ERROR_NOT_READY, "not ready" },
  { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
  { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
  { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
  { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
    "launch incompatible texturing" },
  { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
  { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
  { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
  { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
  { CUDA_ERROR_ASSERT, "assert" },
  { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
  { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
    "host memory already registered" },
  { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
  { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
  { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
  { CUDA_ERROR_UNKNOWN, "unknown" }
};
static char errmsg[128];

static const char *
cuda_error (CUresult r)
{
  int i;

  for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
    if (cuda_errlist[i].r == r)
      return &cuda_errlist[i].m[0];

  sprintf (&errmsg[0], "unknown result code: %5d", r);

  return &errmsg[0];
}
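
/* A minimal sketch, not compiled into the plugin (the function name here is
   invented), of the check-and-report pattern that every CUDA driver call in
   this file follows; cuda_error translates the CUresult into message text.  */
#if 0
static void
example_cuda_error_usage (void)
{
  CUresult r = cuCtxSynchronize ();
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
}
#endif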
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

struct map
{
  int async;
  size_t size;
  char mappings[0];
};
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;
}

static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}

static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
	GOMP_PLUGIN_fatal ("unable to push map");
    }

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *) &m->mappings[0] - s->h;

  *d = (void *) (s->d + offset);
  *h = (void *) (s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
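
/* Illustrative sketch (not compiled) of the caller's view of the circular
   argument buffer implemented by map_init/map_push/map_pop above: each push
   reserves the payload plus a struct map header at h_next, wrapping to
   h_begin when the page-sized buffer runs out, and each pop advances h_tail
   past the oldest reservation.  The stream S and ASYNC value here are
   hypothetical.  */
#if 0
static void
example_map_usage (struct ptx_stream *s, int async)
{
  void *hp, *dp;

  /* Reserve room for 8 pointers; HP and DP address the same chunk from the
     host and the device respectively.  */
  map_push (s, async, 8 * sizeof (void *), &hp, &dp);

  /* ... fill ((void **) hp)[0..7], launch a kernel reading from DP ...  */

  /* Once the work using the chunk has completed, release it.  */
  map_pop (s);
}
#endif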
/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const char *name;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};

enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;
#define _XSTR(s) _STR(s)
#define _STR(s) #s

static struct _synames
{
  char *n;
} cuda_symnames[] =
{
  { _XSTR (cuCtxCreate) },
  { _XSTR (cuCtxDestroy) },
  { _XSTR (cuCtxGetCurrent) },
  { _XSTR (cuCtxPushCurrent) },
  { _XSTR (cuCtxSynchronize) },
  { _XSTR (cuDeviceGet) },
  { _XSTR (cuDeviceGetAttribute) },
  { _XSTR (cuDeviceGetCount) },
  { _XSTR (cuEventCreate) },
  { _XSTR (cuEventDestroy) },
  { _XSTR (cuEventQuery) },
  { _XSTR (cuEventRecord) },
  { _XSTR (cuLaunchKernel) },
  { _XSTR (cuLinkAddData) },
  { _XSTR (cuLinkComplete) },
  { _XSTR (cuLinkCreate) },
  { _XSTR (cuMemAlloc) },
  { _XSTR (cuMemAllocHost) },
  { _XSTR (cuMemcpy) },
  { _XSTR (cuMemcpyDtoH) },
  { _XSTR (cuMemcpyDtoHAsync) },
  { _XSTR (cuMemcpyHtoD) },
  { _XSTR (cuMemcpyHtoDAsync) },
  { _XSTR (cuMemFree) },
  { _XSTR (cuMemFreeHost) },
  { _XSTR (cuMemGetAddressRange) },
  { _XSTR (cuMemHostGetDevicePointer) },
  { _XSTR (cuMemHostRegister) },
  { _XSTR (cuMemHostUnregister) },
  { _XSTR (cuModuleGetFunction) },
  { _XSTR (cuModuleLoadData) },
  { _XSTR (cuStreamDestroy) },
  { _XSTR (cuStreamQuery) },
  { _XSTR (cuStreamSynchronize) },
  { _XSTR (cuStreamWaitEvent) }
};
static bool
verify_device_library (void)
{
  int i;
  void *dh, *ds;

  dh = dlopen ("libcuda.so", RTLD_LAZY);
  if (!dh)
    return false;

  for (i = 0; i < ARRAYSIZE (cuda_symnames); i++)
    {
      ds = dlsym (dh, cuda_symnames[i].n);
      if (!ds)
	return false;
    }

  dlclose (dh);

  return true;
}

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}

static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns stream to use
   as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct ptx_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct ptx_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  map_init (s);

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
	 only wait for asynchronous launches from the same host thread they are
	 invoked on.  If multiple threads use the same async value, we make note
	 of that here and fall back to testing/waiting for all threads in those
	 functions.  */
      if (thread != stream->host_thread)
	stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
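
/* Illustrative sketch (not compiled; the function name is invented) of the
   async-to-stream mapping above: acc_async_sync selects the null stream,
   while acc_async_noval and all non-negative handles are shifted up by one
   and index async_streams.arr, growing the array and creating streams on
   demand when CREATE is true.  */
#if 0
static void
example_stream_selection (void)
{
  /* acc_async_sync -> the context's null stream.  */
  struct ptx_stream *s_sync
    = select_stream_for_async (acc_async_sync, pthread_self (), false, NULL);

  /* async handle 5 -> async_streams.arr[6], created here on first use.  */
  struct ptx_stream *s5
    = select_stream_for_async (5, pthread_self (), true, NULL);

  (void) s_sync;
  (void) s5;
}
#endif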
/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  bool rc;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  rc = verify_device_library ();
  if (!rc)
    return false;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}
/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;

  ptx_dev = ptx_devices[n];

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  /* We don't necessarily have a current context (e.g. if it has been
     destroyed).  Pop it if we do though.  */
  if (thd_ctx != NULL)
    {
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxPushCurrent (ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
}
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}
static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}

static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
static void
link_ptx (CUmodule *module, const char *ptx_code)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  char *abort_ptx = ABORT_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
		     strlen (abort_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
    }

  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
		     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
			 cuda_error (r));
    }

  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
		     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
			 cuda_error (r));
    }

  /* cuLinkAddData's 'data' argument erroneously omits the const qualifier.  */
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *) ptx_code,
		     strlen (ptx_code) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
    }

  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
}
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
	continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
	{
	  CUevent *te;

	  te = e->evt;

	  switch (e->type)
	    {
	    case PTX_EVT_MEM:
	    case PTX_EVT_SYNC:
	      break;

	    case PTX_EVT_KNL:
	      map_pop (e->addr);
	      break;

	    case PTX_EVT_ASYNC_CLEANUP:
	      {
		/* The function gomp_plugin_async_unmap_vars needs to claim the
		   memory-map splay tree lock for the current device, so we
		   can't call it when one of our callers has already claimed
		   the lock.  In that case, just delay the GC for this event
		   until later.  */
		if (!memmap_lockable)
		  continue;

		GOMP_PLUGIN_async_unmap_vars (e->addr);
	      }
	      break;
	    }

	  cuEventDestroy (*te);
	  free ((void *) te);

	  if (ptx_events == e)
	    ptx_events = ptx_events->next;
	  else
	    {
	      struct ptx_event *e_ = ptx_events;
	      while (e_->next != e)
		e_ = e_->next;
	      e_->next = e_->next->next;
	    }

	  free (e);
	}
    }

  pthread_mutex_unlock (&ptx_event_lock);
}

static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
	  || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}
static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    size_t *sizes, unsigned short *kinds, int num_gangs,
	    int num_workers, int vector_length, int async, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  unsigned int nthreads_in_block;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr) dp, (CUdeviceptr) hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__,
		     targ_fn->name);

  // OpenACC		CUDA
  //
  // num_gangs		blocks
  // num_workers	warps (where a warp is equivalent to 32 threads)
  // vector length	threads
  //

  /* The openacc vector_length clause 'determines the vector length to use for
     vector or SIMD operations'.  The question is how to map this to CUDA.

     In CUDA, the warp size is the vector length of a CUDA device.  However, the
     CUDA interface abstracts away from that, and only shows us warp size
     indirectly in maximum number of threads per block, which is a product of
     warp size and the number of hyperthreads of a multiprocessor.

     We choose to map openacc vector_length directly onto the number of threads
     in a block, in the x dimension.  This is reflected in gcc code generation
     that uses ThreadIdx.x to access vector elements.

     Attempting to use an openacc vector_length of more than the maximum number
     of threads per block will result in a cuda error.  */
  nthreads_in_block = vector_length;

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
		      num_gangs, 1, 1,
		      nthreads_in_block, 1, 1,
		      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *) dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->name);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
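
/* Illustrative note on the launch geometry used by nvptx_exec: an OpenACC
   construct such as

     #pragma acc parallel num_gangs(32) vector_length(128)

   is launched above as a 32x1x1 grid of 128x1x1-thread blocks, i.e.

     cuLaunchKernel (function, 32, 1, 1, 128, 1, 1, 0, stream, kargs, 0);

   so a vector_length exceeding the device's maximum threads per block makes
   the launch fail with a CUDA error, as the comment in the function notes.  */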
void * openacc_get_current_cuda_context (void);

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) d;
}

static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr) p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *) (d + s) > (void *) (pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr) d, h, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr) d, h, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}

static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *) (d + s) > (void *) (pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr) d, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr) d, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}
static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}

static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
	 whether all work has completed on this stream, and if so omits the call
	 to the wait hook.  If that happens, event_gc might not get called
	 (which prevents variables from getting unmapped and their associated
	 device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}

static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
	  && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
	{
	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
	  return 0;
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}

static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}
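
/* A minimal sketch (not compiled; the function name and stream parameters
   are invented) of the event-based cross-stream dependency that
   nvptx_wait_async builds: the consumer stream is told to wait for an event
   recorded on the producer stream, without blocking the host.  */
#if 0
static void
example_cross_stream_wait (CUstream producer, CUstream consumer)
{
  CUevent e;

  cuEventCreate (&e, CU_EVENT_DISABLE_TIMING);
  /* Mark the work queued on PRODUCER so far...  */
  cuEventRecord (e, producer);
  /* ...and make CONSUMER wait for that mark before running later work.  */
  cuStreamWaitEvent (consumer, e, 0);
}
#endif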
static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
	{
	  r = cuStreamQuery (s->stream);
	  if (r == CUDA_SUCCESS)
	    continue;
	  else if (r != CUDA_ERROR_NOT_READY)
	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

	  r = cuStreamSynchronize (s->stream);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}

static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
	  && !pthread_equal (other_stream->host_thread, self))
	continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
	{
	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
	  while (s->next != oldstream)
	    s = s->next;
	  s->next = s->next->next;
	}

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
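
/* Illustrative sketch (not compiled; the function name is invented) of how
   an application hands a stream to this plugin through the OpenACC runtime;
   nvptx_set_cuda_stream above then takes ownership of it for the given async
   handle.  */
#if 0
static void
example_user_stream (void)
{
  CUstream st;

  cuStreamCreate (&st, CU_STREAM_DEFAULT);
  /* From here on, work queued with async(5) runs on ST.  */
  acc_set_cuda_stream (5, st);
}
#endif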
/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}

void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}
/* Data emitted by mkoffload.  */

typedef struct nvptx_tdata
{
  const char *ptx_src;

  const char *const *var_names;
  size_t var_num;

  const char *const *fn_names;
  size_t fn_num;
} nvptx_tdata_t;
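
/* Illustrative sketch (not compiled) of an image header as mkoffload might
   emit it for one global variable and one kernel; all names and the PTX
   fragment here are invented for the example.  */
#if 0
static const char *const example_var_names[] = { "a_global" };
static const char *const example_fn_names[] = { "main$_omp_fn$0" };
static const nvptx_tdata_t example_image = {
  ".version 3.1\n.target sm_30\n// ... PTX text ...\n",  /* ptx_src */
  example_var_names, 1,					 /* var_names/num */
  example_fn_names, 1					 /* fn_names/num */
};
#endif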
/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *fn_names, *const *var_names;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  GOMP_OFFLOAD_init_device (ord);

  dev = ptx_devices[ord];

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header->ptx_src);

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_names = img_header->fn_names;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_names[i]);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns->fn = function;
      targ_fns->name = (const char *) fn_names[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}
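
/* Illustrative note: for an image with two kernels and one 16-byte global,
   the table built above has the function entries first and then the
   variables, e.g.

     target_table[0] = { &targ_fns[0], &targ_fns[0] + 1 }
     target_table[1] = { &targ_fns[1], &targ_fns[1] + 1 }
     target_table[2] = { <device address of global>, <address> + 16 }

   and GOMP_OFFLOAD_load_image returns 3.  */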
/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

void
GOMP_OFFLOAD_unload_image (int ord, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	cuModuleUnload (image->module);
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
}

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
			       void **hostaddrs, void **devaddrs, size_t *sizes,
			       unsigned short *kinds, int num_gangs,
			       int num_workers, int vector_length, int async,
			       void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
	      num_workers, vector_length, async, targ_mem_desc);
}

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}
int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}