/* Plugin for NVPTX execution.

   Copyright (C) 2013-2021 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
52 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
53 block to cache between kernel invocations. For soft-stacks blocks bigger
54 than this, we will free the block before attempting another GPU memory
55 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
56 we will free the cached soft-stacks block anyway then retry the
57 allocation. If that fails too, we lose. */
59 #define SOFTSTACK_CACHE_LIMIT 134217728
61 #if CUDA_VERSION < 6000
62 extern CUresult
cuGetErrorString (CUresult
, const char **);
63 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
66 #if CUDA_VERSION >= 6050
69 CUresult
cuLinkAddData (CUlinkState
, CUjitInputType
, void *, size_t,
70 const char *, unsigned, CUjit_option
*, void **);
71 CUresult
cuLinkCreate (unsigned, CUjit_option
*, void **, CUlinkState
*);
73 typedef size_t (*CUoccupancyB2DSize
)(int);
74 CUresult
cuLinkAddData_v2 (CUlinkState
, CUjitInputType
, void *, size_t,
75 const char *, unsigned, CUjit_option
*, void **);
76 CUresult
cuLinkCreate_v2 (unsigned, CUjit_option
*, void **, CUlinkState
*);
77 CUresult
cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction
,
78 CUoccupancyB2DSize
, size_t, int);
81 #define DO_PRAGMA(x) _Pragma (#x)
83 #if PLUGIN_NVPTX_DYNAMIC
88 # define CUDA_ONE_CALL(call) \
89 __typeof (call) *call;
90 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
92 #include "cuda-lib.def"
94 # undef CUDA_ONE_CALL_MAYBE_NULL
98 /* -1 if init_cuda_lib has not been called yet, false
99 if it has been and failed, true if it has been and succeeded. */
100 static signed char cuda_lib_inited
= -1;
102 /* Dynamically load the CUDA runtime library and initialize function
103 pointers, return false if unsuccessful, true if successful. */
107 if (cuda_lib_inited
!= -1)
108 return cuda_lib_inited
;
109 const char *cuda_runtime_lib
= "libcuda.so.1";
110 void *h
= dlopen (cuda_runtime_lib
, RTLD_LAZY
);
111 cuda_lib_inited
= false;
115 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
116 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
117 # define CUDA_ONE_CALL_1(call, allow_null) \
118 cuda_lib.call = dlsym (h, #call); \
119 if (!allow_null && cuda_lib.call == NULL) \
121 #include "cuda-lib.def"
122 # undef CUDA_ONE_CALL
123 # undef CUDA_ONE_CALL_1
124 # undef CUDA_ONE_CALL_MAYBE_NULL
126 cuda_lib_inited
= true;
129 # define CUDA_CALL_PREFIX cuda_lib.
132 # define CUDA_ONE_CALL(call)
133 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
134 #include "cuda-lib.def"
135 #undef CUDA_ONE_CALL_MAYBE_NULL
138 # define CUDA_CALL_PREFIX
139 # define init_cuda_lib() true
142 #include "secure_getenv.h"
146 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
147 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
149 /* Convenience macros for the frequently used CUDA library call and
150 error handling sequence as well as CUDA library calls that
151 do the error checking themselves or don't do it at all. */
153 #define CUDA_CALL_ERET(ERET, FN, ...) \
156 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
157 if (__r != CUDA_SUCCESS) \
159 GOMP_PLUGIN_error (#FN " error: %s", \
165 #define CUDA_CALL(FN, ...) \
166 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
168 #define CUDA_CALL_ASSERT(FN, ...) \
171 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
172 if (__r != CUDA_SUCCESS) \
174 GOMP_PLUGIN_fatal (#FN " error: %s", \
179 #define CUDA_CALL_NOCHECK(FN, ...) \
180 CUDA_CALL_PREFIX FN (__VA_ARGS__)
182 #define CUDA_CALL_EXISTS(FN) \
186 cuda_error (CUresult r
)
188 const char *fallback
= "unknown cuda error";
191 if (!CUDA_CALL_EXISTS (cuGetErrorString
))
194 r
= CUDA_CALL_NOCHECK (cuGetErrorString
, r
, &desc
);
195 if (r
== CUDA_SUCCESS
)
201 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
202 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
203 static char cuda_driver_version_s
[30];
205 static unsigned int instantiated_devices
= 0;
206 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
208 /* NVPTX/CUDA specific definition of asynchronous queues. */
209 struct goacc_asyncqueue
211 CUstream cuda_stream
;
214 struct nvptx_callback
218 struct goacc_asyncqueue
*aq
;
219 struct nvptx_callback
*next
;
222 /* Thread-specific data for PTX. */
226 /* We currently have this embedded inside the plugin because libgomp manages
227 devices through integer target_ids. This might be better if using an
228 opaque target-specific pointer directly from gomp_device_descr. */
229 struct ptx_device
*ptx_dev
;
232 /* Target data function launch information. */
234 struct targ_fn_launch
237 unsigned short dim
[GOMP_DIM_MAX
];
240 /* Target PTX object information. */
248 /* Target data image information. */
250 typedef struct nvptx_tdata
252 const struct targ_ptx_obj
*ptx_objs
;
255 const char *const *var_names
;
258 const struct targ_fn_launch
*fn_descs
;
262 /* Descriptor of a loaded function. */
264 struct targ_fn_descriptor
267 const struct targ_fn_launch
*launch
;
269 int max_threads_per_block
;
272 /* A loaded PTX image. */
273 struct ptx_image_data
275 const void *target_data
;
278 struct targ_fn_descriptor
*fns
; /* Array of functions. */
280 struct ptx_image_data
*next
;
283 struct ptx_free_block
286 struct ptx_free_block
*next
;
306 int max_threads_per_block
;
307 int max_threads_per_multiprocessor
;
308 int default_dims
[GOMP_DIM_MAX
];
310 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
313 struct ptx_image_data
*images
; /* Images loaded on device. */
314 pthread_mutex_t image_lock
; /* Lock for above list. */
316 struct ptx_free_block
*free_blocks
;
317 pthread_mutex_t free_blocks_lock
;
319 /* OpenMP stacks, cached between kernel invocations. */
324 pthread_mutex_t lock
;
327 struct ptx_device
*next
;
330 static struct ptx_device
**ptx_devices
;
/* Return the NVPTX-specific per-thread data for the current OpenACC
   host thread, as maintained by libgomp.  */

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
338 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
339 should be locked on entry and remains locked on exit. */
346 if (instantiated_devices
!= 0)
349 if (!init_cuda_lib ())
352 CUDA_CALL (cuInit
, 0);
354 int cuda_driver_version
;
355 CUDA_CALL_ERET (NULL
, cuDriverGetVersion
, &cuda_driver_version
);
356 snprintf (cuda_driver_version_s
, sizeof cuda_driver_version_s
,
358 cuda_driver_version
/ 1000, cuda_driver_version
% 1000 / 10);
360 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
361 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
367 /* Select the N'th PTX device for the current host thread. The device must
368 have been previously opened before calling this function. */
371 nvptx_attach_host_thread_to_device (int n
)
375 struct ptx_device
*ptx_dev
;
378 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &dev
);
379 if (r
== CUDA_ERROR_NOT_PERMITTED
)
381 /* Assume we're in a CUDA callback, just return true. */
384 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
386 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
390 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
396 ptx_dev
= ptx_devices
[n
];
399 GOMP_PLUGIN_error ("device %d not found", n
);
403 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
405 /* We don't necessarily have a current context (e.g. if it has been
406 destroyed. Pop it if we do though. */
408 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
410 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
415 static struct ptx_device
*
416 nvptx_open_device (int n
)
418 struct ptx_device
*ptx_dev
;
419 CUdevice dev
, ctx_dev
;
421 int async_engines
, pi
;
423 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
425 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
429 ptx_dev
->ctx_shared
= false;
431 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &ctx_dev
);
432 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
434 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
438 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
440 /* The current host thread has an active context for a different device.
443 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
446 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
449 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
451 ptx_dev
->ctx_shared
= true;
453 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
454 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
455 ptx_dev
->overlap
= pi
;
457 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
458 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
461 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
462 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
463 ptx_dev
->concur
= pi
;
465 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
466 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
469 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
470 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
473 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
474 &pi
, CU_DEVICE_ATTRIBUTE_CLOCK_RATE
, dev
);
475 ptx_dev
->clock_khz
= pi
;
477 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
478 &pi
, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
, dev
);
479 ptx_dev
->num_sms
= pi
;
481 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
482 &pi
, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
, dev
);
483 ptx_dev
->regs_per_block
= pi
;
485 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
486 in CUDA 6.0 and newer. */
487 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
488 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR
,
490 /* Fallback: use limit of registers per block, which is usually equal. */
491 if (r
== CUDA_ERROR_INVALID_VALUE
)
492 pi
= ptx_dev
->regs_per_block
;
493 else if (r
!= CUDA_SUCCESS
)
495 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r
));
498 ptx_dev
->regs_per_sm
= pi
;
500 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
501 &pi
, CU_DEVICE_ATTRIBUTE_WARP_SIZE
, dev
);
504 GOMP_PLUGIN_error ("Only warp size 32 is supported");
507 ptx_dev
->warp_size
= pi
;
509 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
510 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, dev
);
511 ptx_dev
->max_threads_per_block
= pi
;
513 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
514 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR
, dev
);
515 ptx_dev
->max_threads_per_multiprocessor
= pi
;
517 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &async_engines
,
518 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
519 if (r
!= CUDA_SUCCESS
)
522 for (int i
= 0; i
!= GOMP_DIM_MAX
; i
++)
523 ptx_dev
->default_dims
[i
] = 0;
525 CUDA_CALL_ERET (NULL
, cuDeviceGetName
, ptx_dev
->name
, sizeof ptx_dev
->name
,
528 ptx_dev
->images
= NULL
;
529 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
531 ptx_dev
->free_blocks
= NULL
;
532 pthread_mutex_init (&ptx_dev
->free_blocks_lock
, NULL
);
534 ptx_dev
->omp_stacks
.ptr
= 0;
535 ptx_dev
->omp_stacks
.size
= 0;
536 pthread_mutex_init (&ptx_dev
->omp_stacks
.lock
, NULL
);
542 nvptx_close_device (struct ptx_device
*ptx_dev
)
547 for (struct ptx_free_block
*b
= ptx_dev
->free_blocks
; b
;)
549 struct ptx_free_block
*b_next
= b
->next
;
550 CUDA_CALL (cuMemFree
, (CUdeviceptr
) b
->ptr
);
555 pthread_mutex_destroy (&ptx_dev
->free_blocks_lock
);
556 pthread_mutex_destroy (&ptx_dev
->image_lock
);
558 pthread_mutex_destroy (&ptx_dev
->omp_stacks
.lock
);
560 if (ptx_dev
->omp_stacks
.ptr
)
561 CUDA_CALL (cuMemFree
, ptx_dev
->omp_stacks
.ptr
);
563 if (!ptx_dev
->ctx_shared
)
564 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
571 nvptx_get_num_devices (void)
575 /* This function will be called before the plugin has been initialized in
576 order to enumerate available devices, but CUDA API routines can't be used
577 until cuInit has been called. Just call it now (but don't yet do any
578 further initialization). */
579 if (instantiated_devices
== 0)
581 if (!init_cuda_lib ())
583 CUresult r
= CUDA_CALL_NOCHECK (cuInit
, 0);
584 /* This is not an error: e.g. we may have CUDA libraries installed but
585 no devices available. */
586 if (r
!= CUDA_SUCCESS
)
588 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
594 CUDA_CALL_ERET (-1, cuDeviceGetCount
, &n
);
/* Emit a debug line reporting the value of environment variable VAR_NAME,
   whose looked-up value (possibly NULL) is ENV_VAR.  */

static void
notify_var (const char *var_name, const char *env_var)
{
  if (env_var == NULL)
    GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
  else
    GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
}
/* Parse the GOMP_NVPTX_JIT environment variable.  Currently the only
   recognized token is "-O<n>" with 0 <= n <= 4, which stores n into
   *GOMP_NVPTX_O (the PTX JIT optimization level).  Tokens are separated
   by spaces; anything unrecognized reports a parse error and stops.  */

static void
process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
{
  const char *var_name = "GOMP_NVPTX_JIT";
  const char *env_var = secure_getenv (var_name);
  notify_var (var_name, env_var);

  if (env_var == NULL)
    return;

  const char *c = env_var;
  while (*c != '\0')
    {
      /* Skip token separators.  */
      while (*c == ' ')
	c++;

      if (c[0] == '-' && c[1] == 'O'
	  && '0' <= c[2] && c[2] <= '4'
	  && (c[3] == '\0' || c[3] == ' '))
	{
	  *gomp_nvptx_o = c[2] - '0';
	  c += 3;
	  continue;
	}

      GOMP_PLUGIN_error ("Error parsing %s", var_name);
      break;
    }
}
638 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
641 CUjit_option opts
[7];
646 CUlinkState linkstate
;
649 size_t linkoutsize
__attribute__ ((unused
));
651 opts
[0] = CU_JIT_WALL_TIME
;
652 optvals
[0] = &elapsed
;
654 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
655 optvals
[1] = &ilog
[0];
657 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
658 optvals
[2] = (void *) sizeof ilog
;
660 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
661 optvals
[3] = &elog
[0];
663 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
664 optvals
[4] = (void *) sizeof elog
;
666 opts
[5] = CU_JIT_LOG_VERBOSE
;
667 optvals
[5] = (void *) 1;
669 static intptr_t gomp_nvptx_o
= -1;
671 static bool init_done
= false;
674 process_GOMP_NVPTX_JIT (&gomp_nvptx_o
);
679 if (gomp_nvptx_o
!= -1)
681 opts
[nopts
] = CU_JIT_OPTIMIZATION_LEVEL
;
682 optvals
[nopts
] = (void *) gomp_nvptx_o
;
686 if (CUDA_CALL_EXISTS (cuLinkCreate_v2
))
687 CUDA_CALL (cuLinkCreate_v2
, nopts
, opts
, optvals
, &linkstate
);
689 CUDA_CALL (cuLinkCreate
, nopts
, opts
, optvals
, &linkstate
);
691 for (; num_objs
--; ptx_objs
++)
693 /* cuLinkAddData's 'data' argument erroneously omits the const
695 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
696 if (CUDA_CALL_EXISTS (cuLinkAddData_v2
))
697 r
= CUDA_CALL_NOCHECK (cuLinkAddData_v2
, linkstate
, CU_JIT_INPUT_PTX
,
698 (char *) ptx_objs
->code
, ptx_objs
->size
,
701 r
= CUDA_CALL_NOCHECK (cuLinkAddData
, linkstate
, CU_JIT_INPUT_PTX
,
702 (char *) ptx_objs
->code
, ptx_objs
->size
,
704 if (r
!= CUDA_SUCCESS
)
706 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
707 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
713 GOMP_PLUGIN_debug (0, "Linking\n");
714 r
= CUDA_CALL_NOCHECK (cuLinkComplete
, linkstate
, &linkout
, &linkoutsize
);
716 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
717 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
719 if (r
!= CUDA_SUCCESS
)
721 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
722 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
726 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
727 CUDA_CALL (cuLinkDestroy
, linkstate
);
732 nvptx_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
733 unsigned *dims
, void *targ_mem_desc
,
734 CUdeviceptr dp
, CUstream stream
)
736 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
740 struct nvptx_thread
*nvthd
= nvptx_thread ();
741 int warp_size
= nvthd
->ptx_dev
->warp_size
;
743 function
= targ_fn
->fn
;
745 /* Initialize the launch dimensions. Typically this is constant,
746 provided by the device compiler, but we must permit runtime
749 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
751 if (targ_fn
->launch
->dim
[i
])
752 dims
[i
] = targ_fn
->launch
->dim
[i
];
759 pthread_mutex_lock (&ptx_dev_lock
);
761 static int gomp_openacc_dims
[GOMP_DIM_MAX
];
762 if (!gomp_openacc_dims
[0])
764 /* See if the user provided GOMP_OPENACC_DIM environment
765 variable to specify runtime defaults. */
766 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
767 gomp_openacc_dims
[i
] = GOMP_PLUGIN_acc_default_dim (i
);
770 if (!nvthd
->ptx_dev
->default_dims
[0])
772 int default_dims
[GOMP_DIM_MAX
];
773 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
774 default_dims
[i
] = gomp_openacc_dims
[i
];
776 int gang
, worker
, vector
;
778 int block_size
= nvthd
->ptx_dev
->max_threads_per_block
;
779 int cpu_size
= nvthd
->ptx_dev
->max_threads_per_multiprocessor
;
780 int dev_size
= nvthd
->ptx_dev
->num_sms
;
781 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
782 " dev_size=%d, cpu_size=%d\n",
783 warp_size
, block_size
, dev_size
, cpu_size
);
785 gang
= (cpu_size
/ block_size
) * dev_size
;
786 worker
= block_size
/ warp_size
;
790 /* There is no upper bound on the gang size. The best size
791 matches the hardware configuration. Logical gangs are
792 scheduled onto physical hardware. To maximize usage, we
793 should guess a large number. */
794 if (default_dims
[GOMP_DIM_GANG
] < 1)
795 default_dims
[GOMP_DIM_GANG
] = gang
? gang
: 1024;
796 /* The worker size must not exceed the hardware. */
797 if (default_dims
[GOMP_DIM_WORKER
] < 1
798 || (default_dims
[GOMP_DIM_WORKER
] > worker
&& gang
))
799 default_dims
[GOMP_DIM_WORKER
] = worker
;
800 /* The vector size must exactly match the hardware. */
801 if (default_dims
[GOMP_DIM_VECTOR
] < 1
802 || (default_dims
[GOMP_DIM_VECTOR
] != vector
&& gang
))
803 default_dims
[GOMP_DIM_VECTOR
] = vector
;
805 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
806 default_dims
[GOMP_DIM_GANG
],
807 default_dims
[GOMP_DIM_WORKER
],
808 default_dims
[GOMP_DIM_VECTOR
]);
810 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
811 nvthd
->ptx_dev
->default_dims
[i
] = default_dims
[i
];
813 pthread_mutex_unlock (&ptx_dev_lock
);
816 bool default_dim_p
[GOMP_DIM_MAX
];
817 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
818 default_dim_p
[i
] = !dims
[i
];
820 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize
))
822 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
823 if (default_dim_p
[i
])
824 dims
[i
] = nvthd
->ptx_dev
->default_dims
[i
];
826 if (default_dim_p
[GOMP_DIM_VECTOR
])
827 dims
[GOMP_DIM_VECTOR
]
828 = MIN (dims
[GOMP_DIM_VECTOR
],
829 (targ_fn
->max_threads_per_block
/ warp_size
832 if (default_dim_p
[GOMP_DIM_WORKER
])
833 dims
[GOMP_DIM_WORKER
]
834 = MIN (dims
[GOMP_DIM_WORKER
],
835 targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
]);
839 /* Handle the case that the compiler allows the runtime to choose
840 the vector-length conservatively, by ignoring
841 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
844 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
845 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
846 exceed targ_fn->max_threads_per_block. */
847 int workers
= gomp_openacc_dims
[GOMP_DIM_WORKER
];
848 int gangs
= gomp_openacc_dims
[GOMP_DIM_GANG
];
851 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize
, &grids
,
852 &blocks
, function
, NULL
, 0,
853 dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]);
854 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
855 "grid = %d, block = %d\n", grids
, blocks
);
857 /* Keep the num_gangs proportional to the block size. In
858 the case were a block size is limited by shared-memory
859 or the register file capacity, the runtime will not
860 excessively over assign gangs to the multiprocessor
861 units if their state is going to be swapped out even
862 more than necessary. The constant factor 2 is there to
863 prevent threads from idling when there is insufficient
866 gangs
= 2 * grids
* (blocks
/ warp_size
);
873 int actual_vectors
= (default_dim_p
[GOMP_DIM_VECTOR
]
875 : dims
[GOMP_DIM_VECTOR
]);
876 workers
= blocks
/ actual_vectors
;
877 workers
= MAX (workers
, 1);
878 /* If we need a per-worker barrier ... . */
879 if (actual_vectors
> 32)
880 /* Don't use more barriers than available. */
881 workers
= MIN (workers
, 15);
884 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
885 if (default_dim_p
[i
])
888 case GOMP_DIM_GANG
: dims
[i
] = gangs
; break;
889 case GOMP_DIM_WORKER
: dims
[i
] = workers
; break;
890 case GOMP_DIM_VECTOR
: dims
[i
] = vectors
; break;
891 default: GOMP_PLUGIN_fatal ("invalid dim");
897 /* Check if the accelerator has sufficient hardware resources to
898 launch the offloaded kernel. */
899 if (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]
900 > targ_fn
->max_threads_per_block
)
903 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
904 " with num_workers = %d and vector_length = %d"
906 "recompile the program with 'num_workers = x and vector_length = y'"
907 " on that offloaded region or '-fopenacc-dim=:x:y' where"
910 GOMP_PLUGIN_fatal (msg
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
911 dims
[GOMP_DIM_VECTOR
], targ_fn
->max_threads_per_block
);
914 /* Check if the accelerator has sufficient barrier resources to
915 launch the offloaded kernel. */
916 if (dims
[GOMP_DIM_WORKER
] > 15 && dims
[GOMP_DIM_VECTOR
] > 32)
919 = ("The Nvidia accelerator has insufficient barrier resources to launch"
920 " '%s' with num_workers = %d and vector_length = %d"
922 "recompile the program with 'num_workers = x' on that offloaded"
923 " region or '-fopenacc-dim=:x:' where x <= 15"
925 "or, recompile the program with 'vector_length = 32' on that"
926 " offloaded region or '-fopenacc-dim=::32'"
928 GOMP_PLUGIN_fatal (msg
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
929 dims
[GOMP_DIM_VECTOR
]);
932 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
933 " gangs=%u, workers=%u, vectors=%u\n",
934 __FUNCTION__
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_GANG
],
935 dims
[GOMP_DIM_WORKER
], dims
[GOMP_DIM_VECTOR
]);
939 // num_gangs nctaid.x
940 // num_workers ntid.y
941 // vector length ntid.x
943 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
944 acc_prof_info
*prof_info
= thr
->prof_info
;
945 acc_event_info enqueue_launch_event_info
;
946 acc_api_info
*api_info
= thr
->api_info
;
947 bool profiling_p
= __builtin_expect (prof_info
!= NULL
, false);
950 prof_info
->event_type
= acc_ev_enqueue_launch_start
;
952 enqueue_launch_event_info
.launch_event
.event_type
953 = prof_info
->event_type
;
954 enqueue_launch_event_info
.launch_event
.valid_bytes
955 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES
;
956 enqueue_launch_event_info
.launch_event
.parent_construct
957 = acc_construct_parallel
;
958 enqueue_launch_event_info
.launch_event
.implicit
= 1;
959 enqueue_launch_event_info
.launch_event
.tool_info
= NULL
;
960 enqueue_launch_event_info
.launch_event
.kernel_name
= targ_fn
->launch
->fn
;
961 enqueue_launch_event_info
.launch_event
.num_gangs
962 = dims
[GOMP_DIM_GANG
];
963 enqueue_launch_event_info
.launch_event
.num_workers
964 = dims
[GOMP_DIM_WORKER
];
965 enqueue_launch_event_info
.launch_event
.vector_length
966 = dims
[GOMP_DIM_VECTOR
];
968 api_info
->device_api
= acc_device_api_cuda
;
970 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &enqueue_launch_event_info
,
975 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
976 dims
[GOMP_DIM_GANG
], 1, 1,
977 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
978 0, stream
, kargs
, 0);
982 prof_info
->event_type
= acc_ev_enqueue_launch_end
;
983 enqueue_launch_event_info
.launch_event
.event_type
984 = prof_info
->event_type
;
985 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &enqueue_launch_event_info
,
989 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
990 targ_fn
->launch
->fn
);
993 void * openacc_get_current_cuda_context (void);
996 goacc_profiling_acc_ev_alloc (struct goacc_thread
*thr
, void *dp
, size_t s
)
998 acc_prof_info
*prof_info
= thr
->prof_info
;
999 acc_event_info data_event_info
;
1000 acc_api_info
*api_info
= thr
->api_info
;
1002 prof_info
->event_type
= acc_ev_alloc
;
1004 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1005 data_event_info
.data_event
.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1006 data_event_info
.data_event
.parent_construct
= acc_construct_parallel
;
1007 data_event_info
.data_event
.implicit
= 1;
1008 data_event_info
.data_event
.tool_info
= NULL
;
1009 data_event_info
.data_event
.var_name
= NULL
;
1010 data_event_info
.data_event
.bytes
= s
;
1011 data_event_info
.data_event
.host_ptr
= NULL
;
1012 data_event_info
.data_event
.device_ptr
= dp
;
1014 api_info
->device_api
= acc_device_api_cuda
;
1016 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
, api_info
);
1019 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1020 size threshold, or if FORCE is true. */
1023 nvptx_stacks_free (struct ptx_device
*ptx_dev
, bool force
)
1025 pthread_mutex_lock (&ptx_dev
->omp_stacks
.lock
);
1026 if (ptx_dev
->omp_stacks
.ptr
1027 && (force
|| ptx_dev
->omp_stacks
.size
> SOFTSTACK_CACHE_LIMIT
))
1029 CUresult r
= CUDA_CALL_NOCHECK (cuMemFree
, ptx_dev
->omp_stacks
.ptr
);
1030 if (r
!= CUDA_SUCCESS
)
1031 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
1032 ptx_dev
->omp_stacks
.ptr
= 0;
1033 ptx_dev
->omp_stacks
.size
= 0;
1035 pthread_mutex_unlock (&ptx_dev
->omp_stacks
.lock
);
1039 nvptx_alloc (size_t s
, bool suppress_errors
)
1043 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &d
, s
);
1044 if (suppress_errors
&& r
== CUDA_ERROR_OUT_OF_MEMORY
)
1046 else if (r
!= CUDA_SUCCESS
)
1048 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r
));
1052 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
1053 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1055 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1057 goacc_profiling_acc_ev_alloc (thr
, (void *) d
, s
);
1063 goacc_profiling_acc_ev_free (struct goacc_thread
*thr
, void *p
)
1065 acc_prof_info
*prof_info
= thr
->prof_info
;
1066 acc_event_info data_event_info
;
1067 acc_api_info
*api_info
= thr
->api_info
;
1069 prof_info
->event_type
= acc_ev_free
;
1071 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1072 data_event_info
.data_event
.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1073 data_event_info
.data_event
.parent_construct
= acc_construct_parallel
;
1074 data_event_info
.data_event
.implicit
= 1;
1075 data_event_info
.data_event
.tool_info
= NULL
;
1076 data_event_info
.data_event
.var_name
= NULL
;
1077 data_event_info
.data_event
.bytes
= -1;
1078 data_event_info
.data_event
.host_ptr
= NULL
;
1079 data_event_info
.data_event
.device_ptr
= p
;
1081 api_info
->device_api
= acc_device_api_cuda
;
1083 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
, api_info
);
1087 nvptx_free (void *p
, struct ptx_device
*ptx_dev
)
1092 CUresult r
= CUDA_CALL_NOCHECK (cuMemGetAddressRange
, &pb
, &ps
,
1094 if (r
== CUDA_ERROR_NOT_PERMITTED
)
1096 /* We assume that this error indicates we are in a CUDA callback context,
1097 where all CUDA calls are not allowed (see cuStreamAddCallback
1098 documentation for description). Arrange to free this piece of device
1100 struct ptx_free_block
*n
1101 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block
));
1103 pthread_mutex_lock (&ptx_dev
->free_blocks_lock
);
1104 n
->next
= ptx_dev
->free_blocks
;
1105 ptx_dev
->free_blocks
= n
;
1106 pthread_mutex_unlock (&ptx_dev
->free_blocks_lock
);
1109 else if (r
!= CUDA_SUCCESS
)
1111 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r
));
1114 if ((CUdeviceptr
) p
!= pb
)
1116 GOMP_PLUGIN_error ("invalid device address");
1120 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
1121 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1123 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1125 goacc_profiling_acc_ev_free (thr
, p
);
1131 nvptx_get_current_cuda_device (void)
1133 struct nvptx_thread
*nvthd
= nvptx_thread ();
1135 if (!nvthd
|| !nvthd
->ptx_dev
)
1138 return &nvthd
->ptx_dev
->dev
;
1142 nvptx_get_current_cuda_context (void)
1144 struct nvptx_thread
*nvthd
= nvptx_thread ();
1146 if (!nvthd
|| !nvthd
->ptx_dev
)
1149 return nvthd
->ptx_dev
->ctx
;
1152 /* Plugin entry points. */
1155 GOMP_OFFLOAD_get_name (void)
1161 GOMP_OFFLOAD_get_caps (void)
1163 return GOMP_OFFLOAD_CAP_OPENACC_200
| GOMP_OFFLOAD_CAP_OPENMP_400
;
1167 GOMP_OFFLOAD_get_type (void)
1169 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
/* Plugin entry point: return the number of available NVPTX devices.  */

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
1179 GOMP_OFFLOAD_init_device (int n
)
1181 struct ptx_device
*dev
;
1183 pthread_mutex_lock (&ptx_dev_lock
);
1185 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1187 pthread_mutex_unlock (&ptx_dev_lock
);
1191 dev
= nvptx_open_device (n
);
1194 ptx_devices
[n
] = dev
;
1195 instantiated_devices
++;
1198 pthread_mutex_unlock (&ptx_dev_lock
);
1204 GOMP_OFFLOAD_fini_device (int n
)
1206 pthread_mutex_lock (&ptx_dev_lock
);
1208 if (ptx_devices
[n
] != NULL
)
1210 if (!nvptx_attach_host_thread_to_device (n
)
1211 || !nvptx_close_device (ptx_devices
[n
]))
1213 pthread_mutex_unlock (&ptx_dev_lock
);
1216 ptx_devices
[n
] = NULL
;
1217 instantiated_devices
--;
1220 if (instantiated_devices
== 0)
1226 pthread_mutex_unlock (&ptx_dev_lock
);
1230 /* Return the libgomp version number we're compatible with. There is
1231 no requirement for cross-version compatibility. */
1234 GOMP_OFFLOAD_version (void)
1236 return GOMP_VERSION
;
1239 /* Initialize __nvptx_clocktick, if present in MODULE. */
1242 nvptx_set_clocktick (CUmodule module
, struct ptx_device
*dev
)
1245 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &dptr
, NULL
,
1246 module
, "__nvptx_clocktick");
1247 if (r
== CUDA_ERROR_NOT_FOUND
)
1249 if (r
!= CUDA_SUCCESS
)
1250 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1251 double __nvptx_clocktick
= 1e-3 / dev
->clock_khz
;
1252 r
= CUDA_CALL_NOCHECK (cuMemcpyHtoD
, dptr
, &__nvptx_clocktick
,
1253 sizeof (__nvptx_clocktick
));
1254 if (r
!= CUDA_SUCCESS
)
1255 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1258 /* Load the (partial) program described by TARGET_DATA to device
1259 number ORD. Allocate and return TARGET_TABLE. */
1262 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1263 struct addr_pair
**target_table
)
1266 const char *const *var_names
;
1267 const struct targ_fn_launch
*fn_descs
;
1268 unsigned int fn_entries
, var_entries
, i
, j
;
1269 struct targ_fn_descriptor
*targ_fns
;
1270 struct addr_pair
*targ_tbl
;
1271 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1272 struct ptx_image_data
*new_image
;
1273 struct ptx_device
*dev
;
1275 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1277 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1278 " (expected %u, received %u)",
1279 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1283 if (!nvptx_attach_host_thread_to_device (ord
)
1284 || !link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
))
1287 dev
= ptx_devices
[ord
];
1289 /* The mkoffload utility emits a struct of pointers/integers at the
1290 start of each offload image. The array of kernel names and the
1291 functions addresses form a one-to-one correspondence. */
1293 var_entries
= img_header
->var_num
;
1294 var_names
= img_header
->var_names
;
1295 fn_entries
= img_header
->fn_num
;
1296 fn_descs
= img_header
->fn_descs
;
1298 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1299 * (fn_entries
+ var_entries
));
1300 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1303 *target_table
= targ_tbl
;
1305 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1306 new_image
->target_data
= target_data
;
1307 new_image
->module
= module
;
1308 new_image
->fns
= targ_fns
;
1310 pthread_mutex_lock (&dev
->image_lock
);
1311 new_image
->next
= dev
->images
;
1312 dev
->images
= new_image
;
1313 pthread_mutex_unlock (&dev
->image_lock
);
1315 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1317 CUfunction function
;
1320 CUDA_CALL_ERET (-1, cuModuleGetFunction
, &function
, module
,
1322 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &nregs
,
1323 CU_FUNC_ATTRIBUTE_NUM_REGS
, function
);
1324 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &mthrs
,
1325 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, function
);
1327 targ_fns
->fn
= function
;
1328 targ_fns
->launch
= &fn_descs
[i
];
1329 targ_fns
->regs_per_thread
= nregs
;
1330 targ_fns
->max_threads_per_block
= mthrs
;
1332 targ_tbl
->start
= (uintptr_t) targ_fns
;
1333 targ_tbl
->end
= targ_tbl
->start
+ 1;
1336 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1341 CUDA_CALL_ERET (-1, cuModuleGetGlobal
,
1342 &var
, &bytes
, module
, var_names
[j
]);
1344 targ_tbl
->start
= (uintptr_t) var
;
1345 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1348 nvptx_set_clocktick (module
, dev
);
1350 return fn_entries
+ var_entries
;
1353 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1354 function descriptors allocated by G_O_load_image. */
1357 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1359 struct ptx_image_data
*image
, **prev_p
;
1360 struct ptx_device
*dev
= ptx_devices
[ord
];
1362 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1364 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1365 " (expected %u, received %u)",
1366 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1371 pthread_mutex_lock (&dev
->image_lock
);
1372 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
1373 if (image
->target_data
== target_data
)
1375 *prev_p
= image
->next
;
1376 if (CUDA_CALL_NOCHECK (cuModuleUnload
, image
->module
) != CUDA_SUCCESS
)
1382 pthread_mutex_unlock (&dev
->image_lock
);
1387 GOMP_OFFLOAD_alloc (int ord
, size_t size
)
1389 if (!nvptx_attach_host_thread_to_device (ord
))
1392 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
1393 struct ptx_free_block
*blocks
, *tmp
;
1395 pthread_mutex_lock (&ptx_dev
->free_blocks_lock
);
1396 blocks
= ptx_dev
->free_blocks
;
1397 ptx_dev
->free_blocks
= NULL
;
1398 pthread_mutex_unlock (&ptx_dev
->free_blocks_lock
);
1400 nvptx_stacks_free (ptx_dev
, false);
1405 nvptx_free (blocks
->ptr
, ptx_dev
);
1410 void *d
= nvptx_alloc (size
, true);
1415 /* Memory allocation failed. Try freeing the stacks block, and
1417 nvptx_stacks_free (ptx_dev
, true);
1418 return nvptx_alloc (size
, false);
1423 GOMP_OFFLOAD_free (int ord
, void *ptr
)
1425 return (nvptx_attach_host_thread_to_device (ord
)
1426 && nvptx_free (ptr
, ptx_devices
[ord
]));
1430 GOMP_OFFLOAD_openacc_exec (void (*fn
) (void *), size_t mapnum
,
1431 void **hostaddrs
, void **devaddrs
,
1432 unsigned *dims
, void *targ_mem_desc
)
1434 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
1436 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1437 acc_prof_info
*prof_info
= thr
->prof_info
;
1438 acc_event_info data_event_info
;
1439 acc_api_info
*api_info
= thr
->api_info
;
1440 bool profiling_p
= __builtin_expect (prof_info
!= NULL
, false);
1447 size_t s
= mapnum
* sizeof (void *);
1449 for (int i
= 0; i
< mapnum
; i
++)
1450 hp
[i
] = (devaddrs
[i
] ? devaddrs
[i
] : hostaddrs
[i
]);
1451 CUDA_CALL_ASSERT (cuMemAlloc
, &dp
, s
);
1453 goacc_profiling_acc_ev_alloc (thr
, (void *) dp
, s
);
1456 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1457 fact have the same value on a unified-memory system). */
1462 prof_info
->event_type
= acc_ev_enqueue_upload_start
;
1464 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1465 data_event_info
.data_event
.valid_bytes
1466 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1467 data_event_info
.data_event
.parent_construct
1468 = acc_construct_parallel
;
1469 data_event_info
.data_event
.implicit
= 1; /* Always implicit. */
1470 data_event_info
.data_event
.tool_info
= NULL
;
1471 data_event_info
.data_event
.var_name
= NULL
;
1472 data_event_info
.data_event
.bytes
= mapnum
* sizeof (void *);
1473 data_event_info
.data_event
.host_ptr
= hp
;
1474 data_event_info
.data_event
.device_ptr
= (const void *) dp
;
1476 api_info
->device_api
= acc_device_api_cuda
;
1478 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1481 CUDA_CALL_ASSERT (cuMemcpyHtoD
, dp
, (void *) hp
,
1482 mapnum
* sizeof (void *));
1485 prof_info
->event_type
= acc_ev_enqueue_upload_end
;
1486 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1487 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1492 nvptx_exec (fn
, mapnum
, hostaddrs
, devaddrs
, dims
, targ_mem_desc
,
1495 CUresult r
= CUDA_CALL_NOCHECK (cuStreamSynchronize
, NULL
);
1496 const char *maybe_abort_msg
= "(perhaps abort was called)";
1497 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1498 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
1500 else if (r
!= CUDA_SUCCESS
)
1501 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1503 CUDA_CALL_ASSERT (cuMemFree
, dp
);
1505 goacc_profiling_acc_ev_free (thr
, (void *) dp
);
/* Asyncqueue callback: release an argument block created by
   GOMP_OFFLOAD_openacc_async_exec.  block[0] is the device pointer,
   block[1] the owning ptx_device.  */

static void
cuda_free_argmem (void *ptr)
{
  void **block = (void **) ptr;
  nvptx_free (block[0], (struct ptx_device *) block[1]);
  free (block);
}
1517 GOMP_OFFLOAD_openacc_async_exec (void (*fn
) (void *), size_t mapnum
,
1518 void **hostaddrs
, void **devaddrs
,
1519 unsigned *dims
, void *targ_mem_desc
,
1520 struct goacc_asyncqueue
*aq
)
1522 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
1524 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1525 acc_prof_info
*prof_info
= thr
->prof_info
;
1526 acc_event_info data_event_info
;
1527 acc_api_info
*api_info
= thr
->api_info
;
1528 bool profiling_p
= __builtin_expect (prof_info
!= NULL
, false);
1532 void **block
= NULL
;
1536 size_t s
= mapnum
* sizeof (void *);
1537 block
= (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s
);
1539 for (int i
= 0; i
< mapnum
; i
++)
1540 hp
[i
] = (devaddrs
[i
] ? devaddrs
[i
] : hostaddrs
[i
]);
1541 CUDA_CALL_ASSERT (cuMemAlloc
, &dp
, s
);
1543 goacc_profiling_acc_ev_alloc (thr
, (void *) dp
, s
);
1546 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1547 fact have the same value on a unified-memory system). */
1552 prof_info
->event_type
= acc_ev_enqueue_upload_start
;
1554 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1555 data_event_info
.data_event
.valid_bytes
1556 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1557 data_event_info
.data_event
.parent_construct
1558 = acc_construct_parallel
;
1559 data_event_info
.data_event
.implicit
= 1; /* Always implicit. */
1560 data_event_info
.data_event
.tool_info
= NULL
;
1561 data_event_info
.data_event
.var_name
= NULL
;
1562 data_event_info
.data_event
.bytes
= mapnum
* sizeof (void *);
1563 data_event_info
.data_event
.host_ptr
= hp
;
1564 data_event_info
.data_event
.device_ptr
= (const void *) dp
;
1566 api_info
->device_api
= acc_device_api_cuda
;
1568 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1572 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync
, dp
, (void *) hp
,
1573 mapnum
* sizeof (void *), aq
->cuda_stream
);
1574 block
[0] = (void *) dp
;
1576 struct nvptx_thread
*nvthd
=
1577 (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
1578 block
[1] = (void *) nvthd
->ptx_dev
;
1582 prof_info
->event_type
= acc_ev_enqueue_upload_end
;
1583 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1584 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1589 nvptx_exec (fn
, mapnum
, hostaddrs
, devaddrs
, dims
, targ_mem_desc
,
1590 dp
, aq
->cuda_stream
);
1593 GOMP_OFFLOAD_openacc_async_queue_callback (aq
, cuda_free_argmem
, block
);
1597 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
1599 struct ptx_device
*ptx_dev
;
1600 struct nvptx_thread
*nvthd
1601 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
1604 ptx_dev
= ptx_devices
[ord
];
1608 CUDA_CALL_ASSERT (cuCtxGetCurrent
, &thd_ctx
);
1610 assert (ptx_dev
->ctx
);
1613 CUDA_CALL_ASSERT (cuCtxPushCurrent
, ptx_dev
->ctx
);
1615 nvthd
->ptx_dev
= ptx_dev
;
1617 return (void *) nvthd
;
/* Release the per-thread data created by
   GOMP_OFFLOAD_openacc_create_thread_data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
/* Expose the current CUDA device handle to the OpenACC runtime.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}
/* Expose the current CUDA context handle to the OpenACC runtime.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}
1638 /* This returns a CUstream. */
1640 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue
*aq
)
1642 return (void *) aq
->cuda_stream
;
1645 /* This takes a CUstream. */
1647 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue
*aq
, void *stream
)
1649 if (aq
->cuda_stream
)
1651 CUDA_CALL_ASSERT (cuStreamSynchronize
, aq
->cuda_stream
);
1652 CUDA_CALL_ASSERT (cuStreamDestroy
, aq
->cuda_stream
);
1655 aq
->cuda_stream
= (CUstream
) stream
;
1659 struct goacc_asyncqueue
*
1660 GOMP_OFFLOAD_openacc_async_construct (int device
__attribute__((unused
)))
1662 CUstream stream
= NULL
;
1663 CUDA_CALL_ERET (NULL
, cuStreamCreate
, &stream
, CU_STREAM_DEFAULT
);
1665 struct goacc_asyncqueue
*aq
1666 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue
));
1667 aq
->cuda_stream
= stream
;
1672 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue
*aq
)
1674 CUDA_CALL_ERET (false, cuStreamDestroy
, aq
->cuda_stream
);
1680 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue
*aq
)
1682 CUresult r
= CUDA_CALL_NOCHECK (cuStreamQuery
, aq
->cuda_stream
);
1683 if (r
== CUDA_SUCCESS
)
1685 if (r
== CUDA_ERROR_NOT_READY
)
1688 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r
));
1693 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue
*aq
)
1695 CUDA_CALL_ERET (false, cuStreamSynchronize
, aq
->cuda_stream
);
1700 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue
*aq1
,
1701 struct goacc_asyncqueue
*aq2
)
1704 CUDA_CALL_ERET (false, cuEventCreate
, &e
, CU_EVENT_DISABLE_TIMING
);
1705 CUDA_CALL_ERET (false, cuEventRecord
, e
, aq1
->cuda_stream
);
1706 CUDA_CALL_ERET (false, cuStreamWaitEvent
, aq2
->cuda_stream
, e
, 0);
1711 cuda_callback_wrapper (CUstream stream
, CUresult res
, void *ptr
)
1713 if (res
!= CUDA_SUCCESS
)
1714 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__
, cuda_error (res
));
1715 struct nvptx_callback
*cb
= (struct nvptx_callback
*) ptr
;
1721 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue
*aq
,
1722 void (*callback_fn
)(void *),
1725 struct nvptx_callback
*b
= GOMP_PLUGIN_malloc (sizeof (*b
));
1726 b
->fn
= callback_fn
;
1729 CUDA_CALL_ASSERT (cuStreamAddCallback
, aq
->cuda_stream
,
1730 cuda_callback_wrapper
, (void *) b
, 0);
1734 cuda_memcpy_sanity_check (const void *h
, const void *d
, size_t s
)
1742 GOMP_PLUGIN_error ("invalid device address");
1745 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1748 GOMP_PLUGIN_error ("invalid device address");
1753 GOMP_PLUGIN_error ("invalid host address");
1758 GOMP_PLUGIN_error ("invalid host or device address");
1761 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1763 GOMP_PLUGIN_error ("invalid size");
1770 GOMP_OFFLOAD_host2dev (int ord
, void *dst
, const void *src
, size_t n
)
1772 if (!nvptx_attach_host_thread_to_device (ord
)
1773 || !cuda_memcpy_sanity_check (src
, dst
, n
))
1775 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) dst
, src
, n
);
1780 GOMP_OFFLOAD_dev2host (int ord
, void *dst
, const void *src
, size_t n
)
1782 if (!nvptx_attach_host_thread_to_device (ord
)
1783 || !cuda_memcpy_sanity_check (dst
, src
, n
))
1785 CUDA_CALL (cuMemcpyDtoH
, dst
, (CUdeviceptr
) src
, n
);
1790 GOMP_OFFLOAD_dev2dev (int ord
, void *dst
, const void *src
, size_t n
)
1792 CUDA_CALL (cuMemcpyDtoDAsync
, (CUdeviceptr
) dst
, (CUdeviceptr
) src
, n
, NULL
);
1797 GOMP_OFFLOAD_openacc_async_host2dev (int ord
, void *dst
, const void *src
,
1798 size_t n
, struct goacc_asyncqueue
*aq
)
1800 if (!nvptx_attach_host_thread_to_device (ord
)
1801 || !cuda_memcpy_sanity_check (src
, dst
, n
))
1803 CUDA_CALL (cuMemcpyHtoDAsync
, (CUdeviceptr
) dst
, src
, n
, aq
->cuda_stream
);
1808 GOMP_OFFLOAD_openacc_async_dev2host (int ord
, void *dst
, const void *src
,
1809 size_t n
, struct goacc_asyncqueue
*aq
)
1811 if (!nvptx_attach_host_thread_to_device (ord
)
1812 || !cuda_memcpy_sanity_check (dst
, src
, n
))
1814 CUDA_CALL (cuMemcpyDtoHAsync
, dst
, (CUdeviceptr
) src
, n
, aq
->cuda_stream
);
1818 union goacc_property_value
1819 GOMP_OFFLOAD_openacc_get_property (int n
, enum goacc_property prop
)
1821 union goacc_property_value propval
= { .val
= 0 };
1823 pthread_mutex_lock (&ptx_dev_lock
);
1825 if (n
>= nvptx_get_num_devices () || n
< 0 || ptx_devices
[n
] == NULL
)
1827 pthread_mutex_unlock (&ptx_dev_lock
);
1831 struct ptx_device
*ptx_dev
= ptx_devices
[n
];
1834 case GOACC_PROPERTY_MEMORY
:
1838 CUDA_CALL_ERET (propval
, cuDeviceTotalMem
, &total_mem
, ptx_dev
->dev
);
1839 propval
.val
= total_mem
;
1842 case GOACC_PROPERTY_FREE_MEMORY
:
1848 CUDA_CALL_ERET (propval
, cuCtxGetDevice
, &ctxdev
);
1849 if (ptx_dev
->dev
== ctxdev
)
1850 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
1851 else if (ptx_dev
->ctx
)
1855 CUDA_CALL_ERET (propval
, cuCtxPushCurrent
, ptx_dev
->ctx
);
1856 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
1857 CUDA_CALL_ASSERT (cuCtxPopCurrent
, &old_ctx
);
1863 CUDA_CALL_ERET (propval
, cuCtxCreate
, &new_ctx
, CU_CTX_SCHED_AUTO
,
1865 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
1866 CUDA_CALL_ASSERT (cuCtxDestroy
, new_ctx
);
1868 propval
.val
= free_mem
;
1871 case GOACC_PROPERTY_NAME
:
1872 propval
.ptr
= ptx_dev
->name
;
1874 case GOACC_PROPERTY_VENDOR
:
1875 propval
.ptr
= "Nvidia";
1877 case GOACC_PROPERTY_DRIVER
:
1878 propval
.ptr
= cuda_driver_version_s
;
1884 pthread_mutex_unlock (&ptx_dev_lock
);
1888 /* Adjust launch dimensions: pick good values for number of blocks and warps
1889 and ensure that number of warps does not exceed CUDA limits as well as GCC's
1893 nvptx_adjust_launch_bounds (struct targ_fn_descriptor
*fn
,
1894 struct ptx_device
*ptx_dev
,
1895 int *teams_p
, int *threads_p
)
1897 int max_warps_block
= fn
->max_threads_per_block
/ 32;
1898 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
1899 and libgcc, which matches documented limit of all GPUs as of 2015. */
1900 if (max_warps_block
> 32)
1901 max_warps_block
= 32;
1902 if (*threads_p
<= 0)
1904 if (*threads_p
> max_warps_block
)
1905 *threads_p
= max_warps_block
;
1907 int regs_per_block
= fn
->regs_per_thread
* 32 * *threads_p
;
1908 /* This is an estimate of how many blocks the device can host simultaneously.
1909 Actual limit, which may be lower, can be queried with "occupancy control"
1910 driver interface (since CUDA 6.0). */
1911 int max_blocks
= ptx_dev
->regs_per_sm
/ regs_per_block
* ptx_dev
->num_sms
;
1912 if (*teams_p
<= 0 || *teams_p
> max_blocks
)
1913 *teams_p
= max_blocks
;
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  */

static size_t
nvptx_stacks_size (void)
{
  return 128 * 1024;
}
1925 /* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
1926 the storage should be held on entry, and remains held on exit. */
1929 nvptx_stacks_acquire (struct ptx_device
*ptx_dev
, size_t size
, int num
)
1931 if (ptx_dev
->omp_stacks
.ptr
&& ptx_dev
->omp_stacks
.size
>= size
* num
)
1932 return (void *) ptx_dev
->omp_stacks
.ptr
;
1934 /* Free the old, too-small stacks. */
1935 if (ptx_dev
->omp_stacks
.ptr
)
1937 CUresult r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
1938 if (r
!= CUDA_SUCCESS
)
1939 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r
));
1940 r
= CUDA_CALL_NOCHECK (cuMemFree
, ptx_dev
->omp_stacks
.ptr
);
1941 if (r
!= CUDA_SUCCESS
)
1942 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
1945 /* Make new and bigger stacks, and remember where we put them and how big
1947 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &ptx_dev
->omp_stacks
.ptr
,
1949 if (r
!= CUDA_SUCCESS
)
1950 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r
));
1952 ptx_dev
->omp_stacks
.size
= size
* num
;
1954 return (void *) ptx_dev
->omp_stacks
.ptr
;
1958 GOMP_OFFLOAD_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
)
1960 struct targ_fn_descriptor
*tgt_fn_desc
1961 = (struct targ_fn_descriptor
*) tgt_fn
;
1962 CUfunction function
= tgt_fn_desc
->fn
;
1963 const struct targ_fn_launch
*launch
= tgt_fn_desc
->launch
;
1964 const char *fn_name
= launch
->fn
;
1966 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
1967 const char *maybe_abort_msg
= "(perhaps abort was called)";
1968 int teams
= 0, threads
= 0;
1971 GOMP_PLUGIN_fatal ("No target arguments provided");
1974 intptr_t id
= (intptr_t) *args
++, val
;
1975 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
1976 val
= (intptr_t) *args
++;
1978 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
1979 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) != GOMP_TARGET_ARG_DEVICE_ALL
)
1981 val
= val
> INT_MAX
? INT_MAX
: val
;
1982 id
&= GOMP_TARGET_ARG_ID_MASK
;
1983 if (id
== GOMP_TARGET_ARG_NUM_TEAMS
)
1985 else if (id
== GOMP_TARGET_ARG_THREAD_LIMIT
)
1988 nvptx_adjust_launch_bounds (tgt_fn
, ptx_dev
, &teams
, &threads
);
1990 size_t stack_size
= nvptx_stacks_size ();
1992 pthread_mutex_lock (&ptx_dev
->omp_stacks
.lock
);
1993 void *stacks
= nvptx_stacks_acquire (ptx_dev
, stack_size
, teams
* threads
);
1994 void *fn_args
[] = {tgt_vars
, stacks
, (void *) stack_size
};
1995 size_t fn_args_size
= sizeof fn_args
;
1997 CU_LAUNCH_PARAM_BUFFER_POINTER
, fn_args
,
1998 CU_LAUNCH_PARAM_BUFFER_SIZE
, &fn_args_size
,
2001 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
2002 " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2003 __FUNCTION__
, fn_name
, teams
, threads
);
2004 r
= CUDA_CALL_NOCHECK (cuLaunchKernel
, function
, teams
, 1, 1,
2005 32, threads
, 1, 0, NULL
, NULL
, config
);
2006 if (r
!= CUDA_SUCCESS
)
2007 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r
));
2009 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
2010 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
2011 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
2013 else if (r
!= CUDA_SUCCESS
)
2014 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
2016 pthread_mutex_unlock (&ptx_dev
->omp_stacks
.lock
);
2019 /* TODO: Implement GOMP_OFFLOAD_async_run. */