2 Copyright (c) 2014-2015 Intel Corporation. All Rights Reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
8 * Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13 * Neither the name of Intel Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Forward declarations, as the following 2 functions are declared as
// friends in offload_engine.h.
// Clang does not like 'static' to appear after the friend declaration.
static void __offload_init_library_once(void);
static void __offload_fini_library(void);
37 #include "offload_host.h"
39 #include "offload_myo_host.h"
46 #endif // TARGET_WINNT
52 #include <sys/types.h>
// Path-list separator for the host OS.
#if defined(HOST_WINNT)
#define PATH_SEPARATOR ";"
#else
#define PATH_SEPARATOR ":"
#endif

// Safely extract the offload number from a (possibly NULL) host timer
// data pointer. Argument and expansion are fully parenthesized so the
// macro composes correctly in larger expressions.
#define GET_OFFLOAD_NUMBER(timer_data) \
    ((timer_data) ? (timer_data)->offload_number : 0)
69 // Windows does not support imports from libraries without actually
70 // including them as dependence. We don't want to include in the
71 // dependence since is it used only for Fortran when traceback is enabled.
72 // Chose to implement it with GetProcAddress.
73 #define FORTRAN_TRACE_BACK win_for__continue_traceback
74 int win_for__continue_traceback( _Offload_result coi_offload_result
)
77 int (* TraceBackRoutine
)(_Offload_result value
);
79 hDLL
= LoadLibrary("libifcoremd.dll");
81 TraceBackRoutine
= (int (*)(_Offload_result
)) GetProcAddress(hDLL
,
82 "for__continue_traceback");
83 if (TraceBackRoutine
!= 0) {
84 return TraceBackRoutine(coi_offload_result
);
88 "Cannot find for__continue_traceback routine in libifcorert.dll\n");
93 OFFLOAD_TRACE(3, "Cannot load libifcorert.dll\n");
101 #define FORTRAN_TRACE_BACK for__continue_traceback
103 // for__continue_traceback is provided as a dummy to resolve link time symbols
104 // for C/C++ programs. For Fortran the actual fortran library function in
105 // libifcore.so is used.
106 #pragma weak for__continue_traceback
107 int for__continue_traceback( _Offload_result coi_offload_result
)
110 "liboffload function for_continue_traceback should not be called.\n");
113 #endif //TARGET_WINNT
// Small subset of ELF declarations for Windows which is needed to compile
// this file. ELF header is used to understand what binary type is contained
// in the target image - shared library or executable.

typedef uint16_t Elf64_Half;
typedef uint32_t Elf64_Word;
typedef uint64_t Elf64_Addr;
typedef uint64_t Elf64_Off;

// Size of the e_ident identification field at the start of an ELF file.
#ifndef EI_NIDENT
#define EI_NIDENT 16
#endif

// ELF-64 file header, laid out per the System V ABI (64 bytes total).
typedef struct {
    unsigned char e_ident[EI_NIDENT];   // identification bytes (magic, class, ...)
    Elf64_Half    e_type;               // object file type (executable / shared lib)
    Elf64_Half    e_machine;            // target architecture
    Elf64_Word    e_version;            // object file version
    Elf64_Addr    e_entry;              // entry point virtual address
    Elf64_Off     e_phoff;              // program header table file offset
    Elf64_Off     e_shoff;              // section header table file offset
    Elf64_Word    e_flags;              // processor-specific flags
    Elf64_Half    e_ehsize;             // ELF header size in bytes
    Elf64_Half    e_phentsize;          // program header table entry size
    Elf64_Half    e_phnum;              // program header table entry count
    Elf64_Half    e_shentsize;          // section header table entry size
    Elf64_Half    e_shnum;              // section header table entry count
    Elf64_Half    e_shstrndx;           // section name string table index
} Elf64_Ehdr;
148 #endif // TARGET_WINNT
// Host console and file logging
int console_enabled = 0;
int offload_number = 0;

// Environment variable names controlling host tracing and timing.
static const char *htrace_envname = "H_TRACE";
static const char *offload_report_envname = "OFFLOAD_REPORT";
static const char *timer_envname = "H_TIME";

// location of offload_main executable
// To be used if the main application has no offload and is not built
// with -offload but dynamic library linked in has offload pragma
char* mic_device_main = 0;

// DMA channel count used by COI and set via
// OFFLOAD_DMA_CHANNEL_COUNT environment variable
uint32_t mic_dma_channel_count;
169 static const char* vardesc_direction_as_string
[] = {
175 static const char* vardesc_type_as_string
[] = {
196 Engine
* mic_engines
= 0;
197 uint32_t mic_engines_total
= 0;
198 pthread_key_t mic_thread_key
;
199 MicEnvVar mic_env_vars
;
200 uint64_t cpu_frequency
= 0;
203 uint32_t mic_stack_size
= 12 * 1024 * 1024;
206 uint64_t mic_buffer_size
= 0;
208 // Preallocated 4K page memory size for buffers on MIC
209 uint64_t mic_4k_buffer_size
= 0;
211 // Preallocated 2M page memory size for buffers on MIC
212 uint64_t mic_2m_buffer_size
= 0;
215 // MIC_LD_LIBRARY_PATH
216 char* mic_library_path
= 0;
219 bool mic_proxy_io
= true;
222 char* mic_proxy_fs_root
= 0;
224 // Threshold for creating buffers with large pages. Buffer is created
225 // with large pages hint if its size exceeds the threshold value.
226 // By default large pages are disabled right now (by setting default
227 // value for threshold to MAX) due to HSD 4114629.
228 uint64_t __offload_use_2mb_buffers
= 0xffffffffffffffffULL
;
229 static const char *mic_use_2mb_buffers_envname
=
230 "MIC_USE_2MB_BUFFERS";
232 static uint64_t __offload_use_async_buffer_write
= 2 * 1024 * 1024;
233 static const char *mic_use_async_buffer_write_envname
=
234 "MIC_USE_ASYNC_BUFFER_WRITE";
236 static uint64_t __offload_use_async_buffer_read
= 2 * 1024 * 1024;
237 static const char *mic_use_async_buffer_read_envname
=
238 "MIC_USE_ASYNC_BUFFER_READ";
240 // device initialization type
241 OffloadInitType __offload_init_type
= c_init_on_offload_all
;
242 static const char *offload_init_envname
= "OFFLOAD_INIT";
245 static bool __offload_active_wait
= true;
246 static const char *offload_active_wait_envname
= "OFFLOAD_ACTIVE_WAIT";
248 // OMP_DEFAULT_DEVICE
249 int __omp_device_num
= 0;
250 static const char *omp_device_num_envname
= "OMP_DEFAULT_DEVICE";
252 //OFFLOAD_PARALLEL_COPY
253 static bool __offload_parallel_copy
= false;
254 static const char *parallel_copy_envname
= "OFFLOAD_PARALLEL_COPY";
256 //Use COI interface for noncontiguous transfer if it exists.
257 static bool __offload_use_coi_noncontiguous_transfer
= false;
258 static const char *use_coi_noncontiguous_transfer_envname
=
259 "MIC_USE_COI_MULTI_D";
261 // The list of pending target libraries
262 static bool __target_libs
;
263 static TargetImageList __target_libs_list
;
264 static mutex_t __target_libs_lock
;
265 static mutex_t stack_alloc_lock
;
268 TargetImage
* __target_exe
;
270 // Print readable offload flags
271 static void trace_offload_flags(
272 OffloadHostTimerData
* timer_data
,
273 OffloadFlags offload_flags
276 // Sized big enough for all flag names
279 if (!OFFLOAD_DO_TRACE
&& (console_enabled
>= 1)) {
280 sprintf(fbuffer
, " OffloadFlags=(");
281 if (offload_flags
.bits
.fortran_traceback
) {
282 sprintf(fbuffer
+strlen(fbuffer
), "fortran_traceback");
285 if (offload_flags
.bits
.omp_async
) {
286 sprintf(fbuffer
+strlen(fbuffer
), first
? "omp_async" : ",omp_async");
289 OFFLOAD_DEBUG_TRACE_1(1,
290 GET_OFFLOAD_NUMBER(timer_data
), c_offload_init_func
,
295 // Print readable varDesc flags
296 static void trace_varDesc_flags(
297 OffloadHostTimerData
* timer_data
,
298 varDescFlags offload_flags
301 // SIzed big enough for all flag names
304 if (!OFFLOAD_DO_TRACE
&& (console_enabled
>= 1)) {
305 sprintf(fbuffer
, " varDescFlags=(");
306 if (offload_flags
.is_static
) {
307 sprintf(fbuffer
+strlen(fbuffer
), "is_static");
310 if (offload_flags
.is_static_dstn
) {
311 sprintf(fbuffer
+strlen(fbuffer
),
312 first
? "is_static_dstn" : ",is_static_dstn");
315 if (offload_flags
.has_length
) {
316 sprintf(fbuffer
+strlen(fbuffer
),
317 first
? "has_length" : ",has_length");
320 if (offload_flags
.is_stack_buf
) {
321 sprintf(fbuffer
+strlen(fbuffer
),
322 first
? "is_stack_buf" : ",is_stack_buf");
325 if (offload_flags
.targetptr
) {
326 sprintf(fbuffer
+strlen(fbuffer
),
327 first
? "targetptr" : ",targetptr");
330 if (offload_flags
.preallocated
) {
331 sprintf(fbuffer
+strlen(fbuffer
),
332 first
? "preallocated" : ",preallocated");
335 if (offload_flags
.is_pointer
) {
336 sprintf(fbuffer
+strlen(fbuffer
),
337 first
? "is_pointer" : ",is_pointer");
340 if (offload_flags
.sink_addr
) {
341 sprintf(fbuffer
+strlen(fbuffer
),
342 first
? "sink_addr" : ",sink_addr");
345 if (offload_flags
.alloc_disp
) {
346 sprintf(fbuffer
+strlen(fbuffer
),
347 first
? "alloc_disp" : ",alloc_disp");
350 if (offload_flags
.is_noncont_src
) {
351 sprintf(fbuffer
+strlen(fbuffer
),
352 first
? "is_noncont_src" : ",is_noncont_src");
355 if (offload_flags
.is_noncont_dst
) {
356 sprintf(fbuffer
+strlen(fbuffer
),
357 first
? "is_noncont_dst" : ",is_noncont_dst");
360 if (offload_flags
.always_copy
) {
361 sprintf(fbuffer
+strlen(fbuffer
),
362 first
? "always_copy" : ",always_copy");
365 if (offload_flags
.always_delete
) {
366 sprintf(fbuffer
+strlen(fbuffer
),
367 first
? "always_delete" : ",always_delete");
370 OFFLOAD_DEBUG_TRACE_1(1,
371 GET_OFFLOAD_NUMBER(timer_data
), c_offload_init_func
,
376 static char * offload_get_src_base(void * ptr
, uint8_t type
)
379 if (VAR_TYPE_IS_PTR(type
)) {
380 base
= *static_cast<char**>(ptr
);
382 else if (VAR_TYPE_IS_SCALAR(type
)) {
383 base
= static_cast<char*>(ptr
);
385 else if (VAR_TYPE_IS_DV_DATA_SLICE(type
) || VAR_TYPE_IS_DV_DATA(type
)) {
387 if (VAR_TYPE_IS_DV_DATA_SLICE(type
)) {
388 const Arr_Desc
*ap
= static_cast<const Arr_Desc
*>(ptr
);
389 dvp
= (type
== c_dv_data_slice
) ?
390 reinterpret_cast<ArrDesc
*>(ap
->base
) :
391 *reinterpret_cast<ArrDesc
**>(ap
->base
);
394 dvp
= (type
== c_dv_data
) ?
395 static_cast<ArrDesc
*>(ptr
) :
396 *static_cast<ArrDesc
**>(ptr
);
398 base
= reinterpret_cast<char*>(dvp
->Base
);
406 void OffloadDescriptor::report_coi_error(error_types msg
, COIRESULT res
)
408 // special case for the 'process died' error
409 if (res
== COI_PROCESS_DIED
) {
410 m_device
.fini_process(true);
415 if (res
== COI_OUT_OF_MEMORY
) {
416 msg
= c_buf_create_out_of_mem
;
420 case c_buf_create_from_mem
:
421 case c_buf_get_address
:
422 case c_pipeline_create
:
423 case c_pipeline_run_func
:
424 LIBOFFLOAD_ERROR(msg
, m_device
.get_logical_index(), res
);
433 case c_buf_set_state
:
434 LIBOFFLOAD_ERROR(msg
, res
);
445 _Offload_result
OffloadDescriptor::translate_coi_error(COIRESULT res
) const
449 return OFFLOAD_SUCCESS
;
451 case COI_PROCESS_DIED
:
452 return OFFLOAD_PROCESS_DIED
;
454 case COI_OUT_OF_MEMORY
:
455 return OFFLOAD_OUT_OF_MEMORY
;
458 return OFFLOAD_ERROR
;
462 // is_targetptr == 0 && is_prealloc == 0 - allocation of pointer data;
463 // is_targetptr == 1 && is_prealloc == 0 - allocation of target memory:
464 // allocate memory at target; use its value as base in target table.
465 // is_targetptr == 1 && is_prealloc == 1 - use preallocated target memory:
466 // base - is address at target of preallocated memory; use its value as
467 // base in target table.
469 bool OffloadDescriptor::alloc_ptr_data(
481 // total length of base
482 int64_t length
= size
;
484 COIBUFFER targptr_buf
;
486 uint32_t buffer_flags
= 0;
487 char * base_disp
= reinterpret_cast<char *>(base
) + disp
;
489 // create buffer with large pages if data length exceeds
490 // large page threshold
491 if (length
>= __offload_use_2mb_buffers
) {
492 buffer_flags
= COI_OPTIMIZE_HUGE_PAGE_SIZE
;
494 // Allocate memory at target for targetptr without preallocated as we need
495 // its address as base argument in call to m_device.insert_ptr_data
496 if (is_targptr
&& !is_prealloc
) {
497 length
= alloc_disp
? length
: size
+ disp
;
498 res
= COI::BufferCreate(
504 &m_device
.get_process(),
506 if (res
!= COI_SUCCESS
) {
508 m_status
->result
= translate_coi_error(res
);
510 else if (m_is_mandatory
) {
511 report_coi_error(c_buf_create
, res
);
516 res
= COI::BufferGetSinkAddress(
517 targptr_buf
, reinterpret_cast<uint64_t *>(&base
));
518 if (res
!= COI_SUCCESS
) {
520 m_status
->result
= translate_coi_error(res
);
522 else if (m_is_mandatory
) {
523 report_coi_error(c_buf_get_address
, res
);
529 OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
530 alloc_disp
? base
: base_disp
,
531 alloc_disp
? length
: size
+ disp
);
535 ptr_data
= is_targptr
?
536 m_device
.find_targetptr_data(base_disp
) :
537 m_device
.find_ptr_data(base_disp
);
538 // if ptr_data is found just need to check it for overlapping
544 // If association is not found we must create it.
545 length
= alloc_disp
? length
: size
+ disp
;
546 ptr_data
= is_targptr
?
547 m_device
.insert_targetptr_data(base
, length
, is_new
) :
548 m_device
.insert_ptr_data(base
, length
, is_new
);
552 OFFLOAD_TRACE(3, "Added new association\n");
555 OffloadTimer
timer(get_timer_data(), c_offload_host_alloc_buffers
);
557 // align should be a power of 2
558 if (!pin
&& !is_targptr
&&
559 align
> 0 && (align
& (align
- 1)) == 0) {
560 // offset within mic_buffer. Can do offset optimization
561 // only when source address alignment satisfies requested
562 // alignment on the target (cq172736).
563 if ((reinterpret_cast<intptr_t>(base
) & (align
- 1)) == 0) {
564 ptr_data
->mic_offset
=
565 reinterpret_cast<intptr_t>(base
) & 4095;
569 // buffer size and flags
570 uint64_t buffer_size
= length
+ ptr_data
->mic_offset
;
572 // For targetptr there is no CPU buffer
573 if (pin
|| !is_targptr
) {
575 OFFLOAD_DEBUG_TRACE_1(3,
576 GET_OFFLOAD_NUMBER(get_timer_data()),
577 c_offload_create_buf_host
,
578 "Creating buffer from source memory %p, "
579 "length %lld\n", base
, length
);
581 // result is not checked because we can continue without cpu
582 // buffer. In this case we will use COIBufferRead/Write
583 // instead of COIBufferCopy.
585 COI::BufferCreateFromMemory(length
,
590 &m_device
.get_process(),
596 OFFLOAD_DEBUG_TRACE_1(3,
597 GET_OFFLOAD_NUMBER(get_timer_data()),
598 c_offload_create_buf_mic
,
599 "Creating buffer from sink memory: size %lld, offset %d, "
600 "flags =0x%x\n", buffer_size
,
601 ptr_data
->mic_offset
, buffer_flags
);
602 res
= COI::BufferCreateFromMemory(ptr_data
->cpu_addr
.length(),
607 &m_device
.get_process(),
609 if (res
!= COI_SUCCESS
) {
611 m_status
->result
= translate_coi_error(res
);
613 else if (m_is_mandatory
) {
614 report_coi_error(c_buf_create
, res
);
616 ptr_data
->alloc_ptr_data_lock
.unlock();
620 else if (is_targptr
) {
621 ptr_data
->mic_buf
= targptr_buf
;
624 OFFLOAD_DEBUG_TRACE_1(3,
625 GET_OFFLOAD_NUMBER(get_timer_data()),
626 c_offload_create_buf_mic
,
627 "Creating buffer for sink: size %lld, offset %d, "
628 "flags =0x%x\n", buffer_size
,
629 ptr_data
->mic_offset
, buffer_flags
);
630 res
= COI::BufferCreate(buffer_size
,
635 &m_device
.get_process(),
637 if (res
!= COI_SUCCESS
) {
639 m_status
->result
= translate_coi_error(res
);
641 else if (m_is_mandatory
) {
642 report_coi_error(c_buf_create
, res
);
644 ptr_data
->alloc_ptr_data_lock
.unlock();
650 // make buffer valid on the device.
651 res
= COI::BufferSetState(ptr_data
->mic_buf
,
652 m_device
.get_process(),
656 if (res
!= COI_SUCCESS
) {
658 m_status
->result
= translate_coi_error(res
);
660 else if (m_is_mandatory
) {
661 report_coi_error(c_buf_set_state
, res
);
663 ptr_data
->alloc_ptr_data_lock
.unlock();
667 res
= COI::BufferSetState(ptr_data
->mic_buf
,
672 if (res
!= COI_SUCCESS
) {
674 m_status
->result
= translate_coi_error(res
);
676 else if (m_is_mandatory
) {
677 report_coi_error(c_buf_set_state
, res
);
679 ptr_data
->alloc_ptr_data_lock
.unlock();
684 ptr_data
->alloc_disp
= alloc_disp
;
685 ptr_data
->alloc_ptr_data_lock
.unlock();
688 mutex_locker_t
locker(ptr_data
->alloc_ptr_data_lock
);
690 OFFLOAD_TRACE(3, "Found existing association: addr %p, length %lld, "
692 ptr_data
->cpu_addr
.start(), ptr_data
->cpu_addr
.length(),
693 ptr_data
->is_static
);
695 // This is not a new entry. Make sure that provided address range fits
696 // into existing one.
697 MemRange
addr_range(base
, length
);
698 if (!ptr_data
->cpu_addr
.contains(addr_range
)) {
699 LIBOFFLOAD_ERROR(c_bad_ptr_mem_alloc
, base
, length
,
700 const_cast<void *>(ptr_data
->cpu_addr
.start()),
701 ptr_data
->cpu_addr
.length());
705 // if the entry is associated with static data it may not have buffers
706 // created because they are created on demand.
707 if (ptr_data
->is_static
&& !init_static_ptr_data(ptr_data
)) {
715 bool OffloadDescriptor::find_ptr_data(
724 // total length of base
725 int64_t length
= size
;
726 char *base
= reinterpret_cast<char *>(in_base
) + disp
;
728 OFFLOAD_TRACE(3, "Looking for association for data: addr %p, "
729 "length %lld\n", base
, length
);
731 // find existing association in pointer table
732 ptr_data
= is_targetptr
?
733 m_device
.find_targetptr_data(base
) :
734 m_device
.find_ptr_data(base
);
737 LIBOFFLOAD_ERROR(c_no_ptr_data
, base
);
740 OFFLOAD_TRACE(3, "Association does not exist\n");
744 OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
745 ptr_data
->cpu_addr
.start(), ptr_data
->cpu_addr
.length(),
746 ptr_data
->is_static
);
748 // make sure that provided address range fits into existing one
749 MemRange
addr_range(base
, length
);
750 if (!ptr_data
->cpu_addr
.contains(addr_range
)) {
752 LIBOFFLOAD_ERROR(c_bad_ptr_mem_range
, base
, length
,
753 const_cast<void *>(ptr_data
->cpu_addr
.start()),
754 ptr_data
->cpu_addr
.length());
757 OFFLOAD_TRACE(3, "Existing association partially overlaps with "
758 "data address range\n");
763 // if the entry is associated with static data it may not have buffers
764 // created because they are created on demand.
765 if (ptr_data
->is_static
&& !init_static_ptr_data(ptr_data
)) {
772 bool OffloadDescriptor::init_static_ptr_data(PtrData
*ptr_data
)
774 OffloadTimer
timer(get_timer_data(), c_offload_host_alloc_buffers
);
776 if (ptr_data
->cpu_buf
== 0) {
777 OFFLOAD_TRACE(3, "Creating buffer from source memory %llx\n",
778 ptr_data
->cpu_addr
.start());
780 COIRESULT res
= COI::BufferCreateFromMemory(
781 ptr_data
->cpu_addr
.length(),
784 const_cast<void*>(ptr_data
->cpu_addr
.start()),
785 1, &m_device
.get_process(),
788 if (res
!= COI_SUCCESS
) {
790 m_status
->result
= translate_coi_error(res
);
793 report_coi_error(c_buf_create_from_mem
, res
);
797 if (ptr_data
->mic_buf
== 0) {
798 OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n",
801 COIRESULT res
= COI::BufferCreateFromMemory(
802 ptr_data
->cpu_addr
.length(),
805 reinterpret_cast<void*>(ptr_data
->mic_addr
),
806 1, &m_device
.get_process(),
809 if (res
!= COI_SUCCESS
) {
811 m_status
->result
= translate_coi_error(res
);
814 report_coi_error(c_buf_create_from_mem
, res
);
821 bool OffloadDescriptor::init_mic_address(PtrData
*ptr_data
)
823 if (ptr_data
->mic_buf
!= 0 && ptr_data
->mic_addr
== 0) {
824 COIRESULT res
= COI::BufferGetSinkAddress(ptr_data
->mic_buf
,
825 &ptr_data
->mic_addr
);
826 if (res
!= COI_SUCCESS
) {
828 m_status
->result
= translate_coi_error(res
);
830 else if (m_is_mandatory
) {
831 report_coi_error(c_buf_get_address
, res
);
839 bool OffloadDescriptor::nullify_target_stack(
844 char * ptr
= (char*)malloc(size
);
846 LIBOFFLOAD_ERROR(c_malloc
);
849 memset(ptr
, 0, size
);
850 res
= COI::BufferWrite(
855 COI_COPY_UNSPECIFIED
,
858 if (res
!= COI_SUCCESS
) {
860 m_status
->result
= translate_coi_error(res
);
863 report_coi_error(c_buf_write
, res
);
868 bool OffloadDescriptor::offload_stack_memory_manager(
869 const void * stack_begin
,
875 mutex_locker_t
locker(stack_alloc_lock
);
877 PersistData
* new_el
;
878 PersistDataList::iterator it_begin
= m_device
.m_persist_list
.begin();
879 PersistDataList::iterator it_end
;
881 uint64_t cur_thread_id
= m_device
.get_thread_id();
885 for (PersistDataList::iterator it
= m_device
.m_persist_list
.begin();
886 it
!= m_device
.m_persist_list
.end(); it
++) {
887 PersistData cur_el
= *it
;
889 if (stack_begin
> it
->stack_cpu_addr
) {
890 // this stack data must be destroyed
891 if (cur_thread_id
== cur_el
.thread_id
) {
892 m_destroy_stack
.push_front(cur_el
.stack_ptr_data
);
897 else if (stack_begin
== it
->stack_cpu_addr
) {
898 if (routine_id
!= it
-> routine_id
) {
899 // this stack data must be destroyed
900 m_destroy_stack
.push_front(cur_el
.stack_ptr_data
);
906 // stack data is reused
907 m_stack_ptr_data
= it
->stack_ptr_data
;
909 // all obsolete stack sections must be erased from the list
910 m_device
.m_persist_list
.erase(it_begin
, ++it_end
);
913 erase
* sizeof(new_el
->stack_ptr_data
->mic_addr
);
915 OFFLOAD_TRACE(3, "Reuse of stack buffer with addr %p\n",
916 m_stack_ptr_data
->mic_addr
);
920 else if (stack_begin
< it
->stack_cpu_addr
&&
921 cur_thread_id
== cur_el
.thread_id
) {
927 // all obsolete stack sections must be erased from the list
928 m_device
.m_persist_list
.erase(it_begin
, ++it_end
);
929 m_in_datalen
+= erase
* sizeof(new_el
->stack_ptr_data
->mic_addr
);
931 // new stack table is created
932 new_el
= new PersistData(stack_begin
, routine_id
, buf_size
, cur_thread_id
);
935 uint32_t buffer_flags
= 0;
937 // create buffer with large pages if data length exceeds
938 // large page threshold
939 if (buf_size
>= __offload_use_2mb_buffers
) {
940 buffer_flags
= COI_OPTIMIZE_HUGE_PAGE_SIZE
;
942 res
= COI::BufferCreate(buf_size
,
947 &m_device
.get_process(),
948 &new_el
->stack_ptr_data
->mic_buf
);
949 if (res
!= COI_SUCCESS
) {
951 m_status
->result
= translate_coi_error(res
);
953 else if (m_is_mandatory
) {
954 report_coi_error(c_buf_create
, res
);
958 // make buffer valid on the device.
959 res
= COI::BufferSetState(new_el
->stack_ptr_data
->mic_buf
,
960 m_device
.get_process(),
964 if (res
!= COI_SUCCESS
) {
966 m_status
->result
= translate_coi_error(res
);
968 else if (m_is_mandatory
) {
969 report_coi_error(c_buf_set_state
, res
);
973 res
= COI::BufferSetState(new_el
->stack_ptr_data
->mic_buf
,
978 if (res
!= COI_SUCCESS
) {
980 m_status
->result
= translate_coi_error(res
);
982 else if (m_is_mandatory
) {
983 report_coi_error(c_buf_set_state
, res
);
987 // persistence algorithm requires target stack initialy to be nullified
988 if (!nullify_target_stack(new_el
->stack_ptr_data
->mic_buf
, buf_size
)) {
992 m_stack_ptr_data
= new_el
->stack_ptr_data
;
993 init_mic_address(m_stack_ptr_data
);
994 OFFLOAD_TRACE(3, "Allocating stack buffer with addr %p\n",
995 m_stack_ptr_data
->mic_addr
);
996 m_device
.m_persist_list
.push_front(*new_el
);
997 init_mic_address(new_el
->stack_ptr_data
);
1002 bool OffloadDescriptor::setup_descriptors(
1007 const void *stack_addr
1012 OffloadTimer
timer(get_timer_data(), c_offload_host_setup_buffers
);
1014 // make a copy of variable descriptors
1015 m_vars_total
= vars_total
;
1016 if (vars_total
> 0) {
1017 m_vars
= (VarDesc
*) malloc(m_vars_total
* sizeof(VarDesc
));
1019 LIBOFFLOAD_ERROR(c_malloc
);
1020 memcpy(m_vars
, vars
, m_vars_total
* sizeof(VarDesc
));
1021 m_vars_extra
= (VarExtra
*) malloc(m_vars_total
* sizeof(VarExtra
));
1022 if (m_vars_extra
== NULL
)
1023 LIBOFFLOAD_ERROR(c_malloc
);
1027 m_in_deps_allocated
= m_vars_total
+ 1;
1028 m_in_deps
= (COIEVENT
*) malloc(sizeof(COIEVENT
) * m_in_deps_allocated
);
1029 if (m_in_deps
== NULL
)
1030 LIBOFFLOAD_ERROR(c_malloc
);
1031 if (m_vars_total
> 0) {
1032 m_out_deps_allocated
= m_vars_total
;
1033 m_out_deps
= (COIEVENT
*) malloc(sizeof(COIEVENT
) * m_out_deps_allocated
);
1034 if (m_out_deps
== NULL
)
1035 LIBOFFLOAD_ERROR(c_malloc
);
1038 // copyin/copyout data length
1042 // First pass over variable descriptors
1043 // - Calculate size of the input and output non-pointer data
1044 // - Allocate buffers for input and output pointers
1045 for (int i
= 0; i
< m_vars_total
; i
++) {
1046 void* alloc_base
= NULL
;
1047 int64_t alloc_disp
= 0;
1048 int64_t alloc_size
= 0;
1049 bool src_is_for_mic
= (m_vars
[i
].direction
.out
||
1050 m_vars
[i
].into
== NULL
);
1052 const char *var_sname
= "";
1053 if (vars2
!= NULL
&& i
< vars_total
) {
1054 if (vars2
[i
].sname
!= NULL
) {
1055 var_sname
= vars2
[i
].sname
;
1058 OFFLOAD_TRACE(2, " VarDesc %d, var=%s, %s, %s\n",
1060 vardesc_direction_as_string
[m_vars
[i
].direction
.bits
],
1061 vardesc_type_as_string
[m_vars
[i
].type
.src
]);
1062 if (vars2
!= NULL
&& i
< vars_total
&& vars2
[i
].dname
!= NULL
) {
1063 OFFLOAD_TRACE(2, " into=%s, %s\n", vars2
[i
].dname
,
1064 vardesc_type_as_string
[m_vars
[i
].type
.dst
]);
1067 " type_src=%d, type_dstn=%d, direction=%d, "
1068 "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
1069 "offset=%lld, size=%lld, count/disp=%lld, ptr=%p, into=%p\n",
1072 m_vars
[i
].direction
.bits
,
1076 m_vars
[i
].mic_offset
,
1077 m_vars
[i
].flags
.bits
,
1083 // If any varDesc flags bits set, show them
1084 if (console_enabled
>= 1 && m_vars
[i
].flags
.bits
!= 0) {
1085 trace_varDesc_flags(get_timer_data(), m_vars
[i
].flags
);
1088 // preallocated implies targetptr
1089 if (m_vars
[i
].flags
.preallocated
) {
1090 // targetptr preallocated alloc_if(1) may not be used with
1092 if (m_vars
[i
].direction
.in
&& m_vars
[i
].alloc_if
) {
1093 LIBOFFLOAD_ERROR(c_in_with_preallocated
);
1096 m_vars
[i
].flags
.targetptr
= 1;
1098 if (m_vars
[i
].alloc
!= NULL
) {
1100 const Arr_Desc
*ap
=
1101 static_cast<const Arr_Desc
*>(m_vars
[i
].alloc
);
1104 ARRAY_DESC_DUMP(" ", "ALLOC", ap
, 0, 1);
1106 __arr_data_offset_and_length(ap
, alloc_disp
, alloc_size
);
1108 alloc_base
= reinterpret_cast<void*>(ap
->base
);
1111 m_vars_extra
[i
].alloc
= m_vars
[i
].alloc
;
1112 m_vars_extra
[i
].cpu_disp
= 0;
1113 m_vars_extra
[i
].cpu_offset
= 0;
1114 m_vars_extra
[i
].src_data
= 0;
1115 m_vars_extra
[i
].read_rng_src
= 0;
1116 m_vars_extra
[i
].read_rng_dst
= 0;
1117 m_vars_extra
[i
].omp_last_event_type
= c_last_not
;
1118 // flag is_arr_ptr_el is 1 only for var_descs generated
1119 // for c_data_ptr_array type
1120 if (i
< vars_total
) {
1121 m_vars_extra
[i
].is_arr_ptr_el
= 0;
1124 switch (m_vars
[i
].type
.src
) {
1125 case c_data_ptr_array
:
1128 const VarDesc3
*vd3
=
1129 static_cast<const VarDesc3
*>(m_vars
[i
].ptr
);
1130 int flags
= vd3
->array_fields
;
1132 " pointer array flags = %04x\n", flags
);
1134 " pointer array type is %s\n",
1135 vardesc_type_as_string
[flags
& 0x3f]);
1136 ap
= static_cast<const Arr_Desc
*>(vd3
->ptr_array
);
1137 ARRAY_DESC_DUMP(" ", "ptr array", ap
,
1138 m_vars
[i
].flags
.is_pointer
, 1);
1139 if (m_vars
[i
].into
) {
1140 ap
= static_cast<const Arr_Desc
*>(m_vars
[i
].into
);
1142 " ", "into array", ap
, 0, 1);
1144 if ((flags
& (1<<flag_align_is_array
)) != 0) {
1145 ap
= static_cast<const Arr_Desc
*>(vd3
->align_array
);
1147 " ", "align array", ap
, 0, 1);
1149 if ((flags
& (1<<flag_alloc_if_is_array
)) != 0) {
1150 ap
= static_cast<const Arr_Desc
*>(vd3
->alloc_if_array
);
1152 " ", "alloc_if array", ap
, 0, 1);
1154 if ((flags
& (1<<flag_free_if_is_array
)) != 0) {
1155 ap
= static_cast<const Arr_Desc
*>(vd3
->free_if_array
);
1157 " ", "free_if array", ap
, 0, 1);
1159 if ((flags
& (1<<flag_extent_start_is_array
)) != 0) {
1160 ap
= static_cast<const Arr_Desc
*>(vd3
->extent_start
);
1162 " ", "extent_start array", ap
, 0, 1);
1164 (1<<flag_extent_start_is_scalar
)) != 0) {
1166 " extent_start scalar = %d\n",
1167 (int64_t)vd3
->extent_start
);
1169 if ((flags
& (1<<flag_extent_elements_is_array
)) != 0) {
1170 ap
= static_cast<const Arr_Desc
*>
1171 (vd3
->extent_elements
);
1172 ARRAY_DESC_DUMP(" ",
1173 "extent_elements array", ap
, 0, 1);
1175 (1<<flag_extent_elements_is_scalar
)) != 0) {
1177 " extent_elements scalar = %d\n",
1178 (int64_t)vd3
->extent_elements
);
1180 if ((flags
& (1<<flag_into_start_is_array
)) != 0) {
1181 ap
= static_cast<const Arr_Desc
*>(vd3
->into_start
);
1183 " ", "into_start array", ap
, 0, 1);
1185 (1<<flag_into_start_is_scalar
)) != 0) {
1187 " into_start scalar = %d\n",
1188 (int64_t)vd3
->into_start
);
1190 if ((flags
& (1<<flag_into_elements_is_array
)) != 0) {
1191 ap
= static_cast<const Arr_Desc
*>(vd3
->into_elements
);
1193 " ", "into_elements array", ap
, 0, 1);
1195 (1<<flag_into_elements_is_scalar
)) != 0) {
1197 " into_elements scalar = %d\n",
1198 (int64_t)vd3
->into_elements
);
1200 if ((flags
& (1<<flag_alloc_start_is_array
)) != 0) {
1201 ap
= static_cast<const Arr_Desc
*>(vd3
->alloc_start
);
1203 " ", "alloc_start array", ap
, 0, 1);
1205 (1<<flag_alloc_start_is_scalar
)) != 0) {
1207 " alloc_start scalar = %d\n",
1208 (int64_t)vd3
->alloc_start
);
1210 if ((flags
& (1<<flag_alloc_elements_is_array
)) != 0) {
1211 ap
= static_cast<const Arr_Desc
*>(vd3
->alloc_elements
);
1212 ARRAY_DESC_DUMP(" ",
1213 "alloc_elements array", ap
, 0, 1);
1215 (1<<flag_alloc_elements_is_scalar
)) != 0) {
1217 " alloc_elements scalar = %d\n",
1218 (int64_t)vd3
->alloc_elements
);
1221 if (!gen_var_descs_for_pointer_array(i
)) {
1229 // In all uses later
1230 // VarDesc.size will have the length of the data to be
1232 // VarDesc.disp will have an offset from base
1233 if (m_vars
[i
].type
.src
== c_cean_var
) {
1235 const Arr_Desc
*ap
=
1236 static_cast<const Arr_Desc
*>(m_vars
[i
].ptr
);
1239 ARRAY_DESC_DUMP("", "IN/OUT", ap
, 0, !src_is_for_mic
);
1241 // offset and length are derived from the array descriptor
1242 __arr_data_offset_and_length(ap
, m_vars
[i
].disp
,
1244 if (!is_arr_desc_contiguous(ap
)) {
1245 m_vars
[i
].flags
.is_noncont_src
= 1;
1246 m_vars_extra
[i
].read_rng_src
=
1247 init_read_ranges_arr_desc(ap
);
1249 // all necessary information about length and offset is
1250 // transferred in var descriptor. There is no need to send
1251 // array descriptor to the target side.
1252 m_vars
[i
].ptr
= reinterpret_cast<void*>(ap
->base
);
1255 m_vars
[i
].size
*= m_vars
[i
].count
;
1259 if (m_vars
[i
].direction
.bits
) {
1260 // make sure that transfer size > 0
1261 if (m_vars
[i
].size
<= 0) {
1262 LIBOFFLOAD_ERROR(c_zero_or_neg_transfer_size
);
1266 if (m_vars
[i
].flags
.is_static
) {
1269 // find data associated with variable
1270 if (!find_ptr_data(ptr_data
,
1278 if (ptr_data
!= 0) {
1279 // offset to base from the beginning of the buffer
1282 (char*) m_vars
[i
].ptr
-
1283 (char*) ptr_data
->cpu_addr
.start();
1286 m_vars
[i
].flags
.is_static
= false;
1287 if (m_vars
[i
].into
== NULL
) {
1288 m_vars
[i
].flags
.is_static_dstn
= false;
1291 m_vars_extra
[i
].src_data
= ptr_data
;
1295 if (m_vars
[i
].flags
.is_static
) {
1296 // Static data is transferred either by omp target
1297 // update construct which passes zeros for
1298 // alloc_if and free_if or by always modifier.
1299 if (!m_vars
[i
].flags
.always_copy
&&
1300 (m_vars
[i
].alloc_if
|| m_vars
[i
].free_if
)) {
1301 m_vars
[i
].direction
.bits
= c_parameter_nocopy
;
1305 AutoData
*auto_data
;
1306 if (m_vars
[i
].alloc_if
) {
1307 auto_data
= m_device
.insert_auto_data(
1308 m_vars
[i
].ptr
, m_vars
[i
].size
);
1309 auto_data
->add_reference();
1312 // TODO: what should be done if var is not in
1314 auto_data
= m_device
.find_auto_data(
1318 // For automatic variables data is transferred:
1319 // - if always modifier is used OR
1320 // - if alloc_if == 0 && free_if == 0 OR
1321 // - if reference count is 1
1322 if (!m_vars
[i
].flags
.always_copy
&&
1323 (m_vars
[i
].alloc_if
|| m_vars
[i
].free_if
) &&
1325 auto_data
->get_reference() != 1) {
1326 m_vars
[i
].direction
.bits
= c_parameter_nocopy
;
1329 // save data for later use
1330 m_vars_extra
[i
].auto_data
= auto_data
;
1334 if (m_vars
[i
].direction
.in
&&
1335 !m_vars
[i
].flags
.is_static
) {
1336 m_in_datalen
+= m_vars
[i
].size
;
1338 // for non-static target destination defined as CEAN
1339 // expression we pass to target its size and dist
1340 if (m_vars
[i
].into
== NULL
&&
1341 m_vars
[i
].type
.src
== c_cean_var
) {
1342 m_in_datalen
+= 2 * sizeof(uint64_t);
1344 m_need_runfunction
= true;
1346 if (m_vars
[i
].direction
.out
&&
1347 !m_vars
[i
].flags
.is_static
) {
1348 m_out_datalen
+= m_vars
[i
].size
;
1349 m_need_runfunction
= true;
1355 if (m_vars
[i
].direction
.bits
||
1356 m_vars
[i
].alloc_if
||
1357 m_vars
[i
].free_if
) {
1358 ArrDesc
*dvp
= static_cast<ArrDesc
*>(m_vars
[i
].ptr
);
1361 __dv_desc_dump("IN/OUT", dvp
);
1363 // send dope vector contents excluding base
1364 m_in_datalen
+= m_vars
[i
].size
- sizeof(uint64_t);
1365 m_need_runfunction
= true;
1370 if ((m_vars
[i
].direction
.bits
||
1371 m_vars
[i
].alloc_if
||
1372 m_vars
[i
].free_if
) &&
1373 m_vars
[i
].size
== 0) {
1376 strlen(*static_cast<char**>(m_vars
[i
].ptr
)) + 1;
1381 if (m_vars
[i
].flags
.is_stack_buf
&&
1382 !m_vars
[i
].direction
.bits
&&
1383 m_vars
[i
].alloc_if
) {
1384 // this var_desc is for stack buffer
1387 if (!offload_stack_memory_manager(
1388 stack_addr
, entry_id
,
1389 m_vars
[i
].count
, m_vars
[i
].align
, &is_new
)) {
1393 m_compute_buffers
.push_back(
1394 m_stack_ptr_data
->mic_buf
);
1395 m_device
.m_persist_list
.front().cpu_stack_addr
=
1396 static_cast<char*>(m_vars
[i
].ptr
);
1399 m_vars
[i
].flags
.sink_addr
= 1;
1400 m_in_datalen
+= sizeof(m_stack_ptr_data
->mic_addr
);
1402 m_vars
[i
].size
= m_destroy_stack
.size();
1403 m_vars_extra
[i
].src_data
= m_stack_ptr_data
;
1405 // need to add or remove references for stack buffer at target
1406 if (is_new
|| m_destroy_stack
.size()) {
1407 m_need_runfunction
= true;
1414 case c_cean_var_ptr
:
1416 if (m_vars
[i
].type
.src
== c_cean_var_ptr
) {
1418 const Arr_Desc
*ap
=
1419 static_cast<const Arr_Desc
*>(m_vars
[i
].ptr
);
1422 ARRAY_DESC_DUMP("", "IN/OUT", ap
, 1, !src_is_for_mic
);
1424 // offset and length are derived from the array descriptor
1425 __arr_data_offset_and_length(ap
, m_vars
[i
].disp
,
1428 if (!is_arr_desc_contiguous(ap
)) {
1429 m_vars
[i
].flags
.is_noncont_src
= 1;
1430 m_vars_extra
[i
].read_rng_src
=
1431 init_read_ranges_arr_desc(ap
);
1433 // all necessary information about length and offset is
1434 // transferred in var descriptor. There is no need to send
1435 // array descriptor to the target side.
1436 m_vars
[i
].ptr
= reinterpret_cast<void*>(ap
->base
);
1438 else if (m_vars
[i
].type
.src
== c_dv_ptr
) {
1439 // need to send DV to the device unless it is 'nocopy'
1440 if (m_vars
[i
].direction
.bits
||
1441 m_vars
[i
].alloc_if
||
1442 m_vars
[i
].free_if
) {
1443 ArrDesc
*dvp
= *static_cast<ArrDesc
**>(m_vars
[i
].ptr
);
1446 __dv_desc_dump("IN/OUT", dvp
);
1448 m_vars
[i
].direction
.bits
= c_parameter_in
;
1455 // c_data_ptr or c_string_ptr
1456 m_vars
[i
].size
*= m_vars
[i
].count
;
1460 if (m_vars
[i
].direction
.bits
||
1461 m_vars
[i
].alloc_if
||
1462 m_vars
[i
].free_if
) {
1465 // check that buffer length > 0
1466 if (m_vars
[i
].alloc_if
&&
1467 m_vars
[i
].disp
+ m_vars
[i
].size
<
1468 (m_is_openmp
? 0 : 1)) {
1469 LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len
);
1474 void *base
= *static_cast<void**>(m_vars
[i
].ptr
);
1476 // allocate buffer if we have no INTO and don't need
1477 // allocation for the ptr at target
1478 if (src_is_for_mic
) {
1479 if (m_vars
[i
].flags
.is_stack_buf
) {
1480 // for stack persistent objects ptr data is created
1481 // by var_desc with number 0.
1482 // Its ptr_data is stored at m_stack_ptr_data
1483 ptr_data
= m_stack_ptr_data
;
1484 m_vars
[i
].flags
.sink_addr
= 1;
1486 else if (m_vars
[i
].alloc_if
) {
1487 if (m_vars
[i
].flags
.preallocated
) {
1488 m_out_datalen
+= sizeof(void*);
1489 m_need_runfunction
= true;
1493 if (!alloc_ptr_data(
1495 reinterpret_cast<char *>(base
) + alloc_disp
,
1496 (alloc_base
!= NULL
) ?
1497 alloc_disp
: m_vars
[i
].disp
,
1498 (alloc_base
!= NULL
) ?
1499 alloc_size
: m_vars
[i
].size
,
1501 (alloc_base
!= NULL
) ?
1502 0 : m_vars
[i
].align
,
1503 m_vars
[i
].flags
.targetptr
,
1505 m_vars
[i
].flags
.pin
)) {
1508 if (m_vars
[i
].flags
.targetptr
) {
1509 if (!init_mic_address(ptr_data
)) {
1512 *static_cast<void**>(m_vars
[i
].ptr
) = base
=
1513 reinterpret_cast<void*>(ptr_data
->mic_addr
);
1515 if (ptr_data
->add_reference() == 0 &&
1516 ptr_data
->mic_buf
!= 0) {
1517 // add buffer to the list of buffers that
1518 // are passed to dispatch call
1519 m_compute_buffers
.push_back(
1522 else if (!m_vars
[i
].flags
.pin
&&
1523 !m_vars
[i
].flags
.preallocated
) {
1524 // will send buffer address to device
1525 m_vars
[i
].flags
.sink_addr
= 1;
1528 if (!m_vars
[i
].flags
.pin
&&
1529 !ptr_data
->is_static
) {
1530 // need to add reference for buffer
1531 m_need_runfunction
= true;
1535 bool error_if_not_found
= true;
1537 // For omp target update variable is ignored
1538 // if it does not exist.
1539 if (m_vars
[i
].flags
.always_copy
||
1540 (!m_vars
[i
].alloc_if
&&
1541 !m_vars
[i
].free_if
)) {
1542 error_if_not_found
= false;
1546 // use existing association from pointer table
1547 if (!find_ptr_data(ptr_data
,
1551 m_vars
[i
].flags
.targetptr
,
1552 error_if_not_found
)) {
1557 // make var nocopy if it does not exist
1558 if (ptr_data
== 0) {
1559 m_vars
[i
].direction
.bits
=
1564 if (ptr_data
!= 0) {
1565 m_vars
[i
].flags
.sink_addr
= 1;
1569 if (ptr_data
!= 0) {
1571 // data is transferred only if
1572 // alloc_if == 0 && free_if == 0
1573 // or reference count is 1
1574 if (!m_vars
[i
].flags
.always_copy
&&
1575 ((m_vars
[i
].alloc_if
||
1576 m_vars
[i
].free_if
) &&
1577 ptr_data
->get_reference() != 1)) {
1578 m_vars
[i
].direction
.bits
=
1583 if (ptr_data
->alloc_disp
!= 0) {
1584 m_vars
[i
].flags
.alloc_disp
= 1;
1585 m_in_datalen
+= sizeof(alloc_disp
);
1588 if (m_vars
[i
].flags
.sink_addr
) {
1589 // get buffers's address on the sink
1590 if (!init_mic_address(ptr_data
)) {
1594 m_in_datalen
+= sizeof(ptr_data
->mic_addr
);
1597 if (!m_vars
[i
].flags
.pin
&&
1598 !ptr_data
->is_static
&& m_vars
[i
].free_if
) {
1599 // need to decrement buffer reference on target
1600 m_need_runfunction
= true;
1603 // offset to base from the beginning of the buffer
1605 m_vars
[i
].offset
= (char*) base
-
1606 (char*) ptr_data
->cpu_addr
.start();
1608 // copy other pointer properties to var descriptor
1609 m_vars
[i
].mic_offset
= ptr_data
->mic_offset
;
1610 m_vars
[i
].flags
.is_static
= ptr_data
->is_static
;
1614 if (!find_ptr_data(ptr_data
,
1624 (char*) ptr_data
->cpu_addr
.start();
1628 // save pointer data
1629 m_vars_extra
[i
].src_data
= ptr_data
;
1634 if (m_vars
[i
].direction
.in
) {
1635 m_in_datalen
+= __offload_funcs
.max_name_length();
1637 if (m_vars
[i
].direction
.out
) {
1638 m_out_datalen
+= __offload_funcs
.max_name_length();
1640 m_need_runfunction
= true;
1645 case c_dv_data_slice
:
1646 case c_dv_ptr_data_slice
:
1648 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.src
)) {
1650 ap
= static_cast<const Arr_Desc
*>(m_vars
[i
].ptr
);
1652 dvp
= (m_vars
[i
].type
.src
== c_dv_data_slice
) ?
1653 reinterpret_cast<ArrDesc
*>(ap
->base
) :
1654 *reinterpret_cast<ArrDesc
**>(ap
->base
);
1657 dvp
= (m_vars
[i
].type
.src
== c_dv_data
) ?
1658 static_cast<ArrDesc
*>(m_vars
[i
].ptr
) :
1659 *static_cast<ArrDesc
**>(m_vars
[i
].ptr
);
1662 // if allocatable dope vector isn't allocated don't
1663 // transfer its data
1664 if (!__dv_is_allocated(dvp
)) {
1665 m_vars
[i
].direction
.bits
= c_parameter_nocopy
;
1666 m_vars
[i
].alloc_if
= 0;
1667 m_vars
[i
].free_if
= 0;
1669 if (m_vars
[i
].direction
.bits
||
1670 m_vars
[i
].alloc_if
||
1671 m_vars
[i
].free_if
) {
1674 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.src
)) {
1675 ap
= static_cast<const Arr_Desc
*>(m_vars
[i
].ptr
);
1678 ARRAY_DESC_DUMP("", "IN/OUT", ap
, 0, !src_is_for_mic
);
1680 if (!__dv_is_contiguous(dvp
)) {
1681 m_vars
[i
].flags
.is_noncont_src
= 1;
1682 m_vars_extra
[i
].read_rng_src
=
1683 init_read_ranges_dv(dvp
);
1686 // size and displacement
1687 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.src
)) {
1688 // offset and length are derived from the
1690 __arr_data_offset_and_length(ap
,
1693 if (m_vars
[i
].direction
.bits
) {
1694 if (!is_arr_desc_contiguous(ap
)) {
1695 if (m_vars
[i
].flags
.is_noncont_src
) {
1696 LIBOFFLOAD_ERROR(c_slice_of_noncont_array
);
1699 m_vars
[i
].flags
.is_noncont_src
= 1;
1700 m_vars_extra
[i
].read_rng_src
=
1701 init_read_ranges_arr_desc(ap
);
1706 if (m_vars
[i
].flags
.has_length
) {
1708 __dv_data_length(dvp
, m_vars
[i
].count
);
1711 m_vars
[i
].size
= __dv_data_length(dvp
);
1716 // check that length >= 0
1717 if (m_vars
[i
].alloc_if
&&
1718 (m_vars
[i
].disp
+ m_vars
[i
].size
< 0)) {
1719 LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len
);
1724 void *base
= reinterpret_cast<void*>(dvp
->Base
);
1727 // allocate buffer if we have no INTO and don't need
1728 // allocation for the ptr at target
1729 if (src_is_for_mic
) {
1730 if (m_vars
[i
].alloc_if
) {
1732 if (!alloc_ptr_data(
1734 reinterpret_cast<char *>(base
) + alloc_disp
,
1735 (alloc_base
!= NULL
) ?
1736 alloc_disp
: m_vars
[i
].disp
,
1737 (alloc_base
!= NULL
) ?
1738 alloc_size
: m_vars
[i
].size
,
1740 (alloc_base
!= NULL
) ?
1741 0 : m_vars
[i
].align
,
1742 m_vars
[i
].flags
.targetptr
,
1743 m_vars
[i
].flags
.preallocated
,
1744 m_vars
[i
].flags
.pin
)) {
1748 if (ptr_data
->add_reference() == 0 &&
1749 ptr_data
->mic_buf
!= 0) {
1750 // add buffer to the list of buffers
1751 // that are passed to dispatch call
1752 m_compute_buffers
.push_back(
1756 // will send buffer address to device
1757 m_vars
[i
].flags
.sink_addr
= 1;
1760 if (!ptr_data
->is_static
) {
1761 // need to add reference for buffer
1762 m_need_runfunction
= true;
1766 bool error_if_not_found
= true;
1768 // For omp target update variable is ignored
1769 // if it does not exist.
1770 if (m_vars
[i
].flags
.always_copy
||
1771 (!m_vars
[i
].alloc_if
&&
1772 !m_vars
[i
].free_if
)) {
1773 error_if_not_found
= false;
1777 // use existing association from pointer table
1778 if (!find_ptr_data(ptr_data
,
1782 m_vars
[i
].flags
.targetptr
,
1783 error_if_not_found
)) {
1788 // make var nocopy if it does not exist
1789 if (ptr_data
== 0) {
1790 m_vars
[i
].direction
.bits
=
1795 if (ptr_data
!= 0) {
1796 // need to update base in dope vector on device
1797 m_vars
[i
].flags
.sink_addr
= 1;
1801 if (ptr_data
!= 0) {
1803 // data is transferred if
1804 // - if always modifier is used OR
1805 // - if alloc_if == 0 && free_if == 0 OR
1806 // - if reference count is 1
1807 if (!m_vars
[i
].flags
.always_copy
&&
1808 (m_vars
[i
].alloc_if
||
1809 m_vars
[i
].free_if
) &&
1810 ptr_data
->get_reference() != 1) {
1811 m_vars
[i
].direction
.bits
=
1816 if (ptr_data
->alloc_disp
!= 0) {
1817 m_vars
[i
].flags
.alloc_disp
= 1;
1818 m_in_datalen
+= sizeof(alloc_disp
);
1821 if (m_vars
[i
].flags
.sink_addr
) {
1822 // get buffers's address on the sink
1823 if (!init_mic_address(ptr_data
)) {
1827 m_in_datalen
+= sizeof(ptr_data
->mic_addr
);
1830 if (!ptr_data
->is_static
&& m_vars
[i
].free_if
) {
1831 // need to decrement buffer reference on target
1832 m_need_runfunction
= true;
1835 // offset to base from the beginning of the buffer
1839 (char*) ptr_data
->cpu_addr
.start();
1841 // copy other pointer properties to var descriptor
1842 m_vars
[i
].mic_offset
= ptr_data
->mic_offset
;
1843 m_vars
[i
].flags
.is_static
= ptr_data
->is_static
;
1846 else { // !src_is_for_mic
1847 if (!find_ptr_data(ptr_data
,
1854 m_vars
[i
].offset
= !ptr_data
? 0 :
1856 (char*) ptr_data
->cpu_addr
.start();
1859 // save pointer data
1860 m_vars_extra
[i
].src_data
= ptr_data
;
1865 LIBOFFLOAD_ERROR(c_unknown_var_type
, m_vars
[i
].type
.src
);
1868 if (m_vars
[i
].type
.src
== c_data_ptr_array
) {
1872 if (src_is_for_mic
&& m_vars
[i
].flags
.is_stack_buf
) {
1873 m_vars
[i
].offset
= static_cast<char*>(m_vars
[i
].ptr
) -
1874 m_device
.m_persist_list
.front().cpu_stack_addr
;
1876 // if source is used at CPU save its offset and disp
1877 if (m_vars
[i
].into
== NULL
|| m_vars
[i
].direction
.in
) {
1878 m_vars_extra
[i
].cpu_offset
= m_vars
[i
].offset
;
1879 m_vars_extra
[i
].cpu_disp
= m_vars
[i
].disp
;
1882 // If "into" is define we need to do the similar work for it
1883 if (!m_vars
[i
].into
) {
1887 int64_t into_disp
=0, into_offset
= 0;
1889 switch (m_vars
[i
].type
.dst
) {
1890 case c_data_ptr_array
:
1895 int64_t size
= m_vars
[i
].size
;
1897 if (m_vars
[i
].type
.dst
== c_cean_var
) {
1899 const Arr_Desc
*ap
=
1900 static_cast<const Arr_Desc
*>(m_vars
[i
].into
);
1903 ARRAY_DESC_DUMP(" ", "INTO", ap
, 0, src_is_for_mic
);
1905 // offset and length are derived from the array descriptor
1906 __arr_data_offset_and_length(ap
, into_disp
, size
);
1908 if (!is_arr_desc_contiguous(ap
)) {
1909 m_vars
[i
].flags
.is_noncont_dst
= 1;
1910 m_vars_extra
[i
].read_rng_dst
=
1911 init_read_ranges_arr_desc(ap
);
1912 if (!cean_ranges_match(
1913 m_vars_extra
[i
].read_rng_src
,
1914 m_vars_extra
[i
].read_rng_dst
)) {
1915 LIBOFFLOAD_ERROR(c_ranges_dont_match
);
1919 m_vars
[i
].into
= reinterpret_cast<void*>(ap
->base
);
1922 int64_t size_src
= m_vars_extra
[i
].read_rng_src
?
1923 cean_get_transf_size(m_vars_extra
[i
].read_rng_src
) :
1925 int64_t size_dst
= m_vars_extra
[i
].read_rng_dst
?
1926 cean_get_transf_size(m_vars_extra
[i
].read_rng_dst
) :
1928 // It's supposed that "into" size must be not less
1930 if (size_src
> size_dst
) {
1931 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes
,
1932 size_src
, size_dst
);
1936 if (m_vars
[i
].direction
.bits
) {
1937 if (m_vars
[i
].flags
.is_static_dstn
) {
1940 // find data associated with variable
1941 if (!find_ptr_data(ptr_data
, m_vars
[i
].into
,
1942 into_disp
, size
, false, false)) {
1945 if (ptr_data
!= 0) {
1946 // offset to base from the beginning of the buffer
1949 (char*) m_vars
[i
].into
-
1950 (char*) ptr_data
->cpu_addr
.start();
1953 m_vars
[i
].flags
.is_static_dstn
= false;
1955 m_vars_extra
[i
].dst_data
= ptr_data
;
1959 if (m_vars
[i
].direction
.in
&&
1960 !m_vars
[i
].flags
.is_static_dstn
) {
1961 m_in_datalen
+= m_vars
[i
].size
;
1963 // for non-static target destination defined as CEAN
1964 // expression we pass to target its size and dist
1965 if (m_vars
[i
].type
.dst
== c_cean_var
) {
1966 m_in_datalen
+= 2 * sizeof(uint64_t);
1968 m_need_runfunction
= true;
1974 if (m_vars
[i
].direction
.bits
||
1975 m_vars
[i
].alloc_if
||
1976 m_vars
[i
].free_if
) {
1977 ArrDesc
*dvp
= static_cast<ArrDesc
*>(m_vars
[i
].into
);
1980 __dv_desc_dump("INTO", dvp
);
1982 // send dope vector contents excluding base
1983 m_in_datalen
+= m_vars
[i
].size
- sizeof(uint64_t);
1984 m_need_runfunction
= true;
1990 case c_cean_var_ptr
:
1992 int64_t size
= m_vars
[i
].size
;
1994 if (m_vars
[i
].type
.dst
== c_cean_var_ptr
) {
1996 const Arr_Desc
*ap
=
1997 static_cast<const Arr_Desc
*>(m_vars
[i
].into
);
2000 ARRAY_DESC_DUMP(" ", "INTO", ap
, 1, src_is_for_mic
);
2002 // offset and length are derived from the array descriptor
2003 __arr_data_offset_and_length(ap
, into_disp
, size
);
2005 if (!is_arr_desc_contiguous(ap
)) {
2006 m_vars
[i
].flags
.is_noncont_src
= 1;
2007 m_vars_extra
[i
].read_rng_dst
=
2008 init_read_ranges_arr_desc(ap
);
2009 if (!cean_ranges_match(
2010 m_vars_extra
[i
].read_rng_src
,
2011 m_vars_extra
[i
].read_rng_dst
)) {
2012 LIBOFFLOAD_ERROR(c_ranges_dont_match
);
2015 m_vars
[i
].into
= reinterpret_cast<char**>(ap
->base
);
2017 else if (m_vars
[i
].type
.dst
== c_dv_ptr
) {
2018 // need to send DV to the device unless it is 'nocopy'
2019 if (m_vars
[i
].direction
.bits
||
2020 m_vars
[i
].alloc_if
||
2021 m_vars
[i
].free_if
) {
2022 ArrDesc
*dvp
= *static_cast<ArrDesc
**>(m_vars
[i
].into
);
2025 __dv_desc_dump("INTO", dvp
);
2027 m_vars
[i
].direction
.bits
= c_parameter_in
;
2031 int64_t size_src
= m_vars_extra
[i
].read_rng_src
?
2032 cean_get_transf_size(m_vars_extra
[i
].read_rng_src
) :
2034 int64_t size_dst
= m_vars_extra
[i
].read_rng_dst
?
2035 cean_get_transf_size(m_vars_extra
[i
].read_rng_dst
) :
2037 // It's supposed that "into" size must be not less than
2039 if (size_src
> size_dst
) {
2040 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes
,
2041 size_src
, size_dst
);
2045 if (m_vars
[i
].direction
.bits
) {
2049 void *base
= *static_cast<void**>(m_vars
[i
].into
);
2051 if (m_vars
[i
].direction
.in
) {
2053 if (m_vars
[i
].flags
.is_stack_buf
) {
2054 // for stack persistent objects ptr data is created
2055 // by var_desc with number 0.
2056 // Its ptr_data is stored at m_stack_ptr_data
2057 ptr_data
= m_stack_ptr_data
;
2058 m_vars
[i
].flags
.sink_addr
= 1;
2060 else if (m_vars
[i
].alloc_if
) {
2061 if (m_vars
[i
].flags
.preallocated
) {
2062 m_out_datalen
+= sizeof(void*);
2063 m_need_runfunction
= true;
2067 if (!alloc_ptr_data(
2069 reinterpret_cast<char *>(base
) + alloc_disp
,
2070 (alloc_base
!= NULL
) ?
2071 alloc_disp
: into_disp
,
2072 (alloc_base
!= NULL
) ?
2075 (alloc_base
!= NULL
) ?
2076 0 : m_vars
[i
].align
,
2077 m_vars
[i
].flags
.targetptr
,
2078 m_vars
[i
].flags
.preallocated
,
2079 m_vars
[i
].flags
.pin
)) {
2082 if (m_vars
[i
].flags
.targetptr
) {
2083 if (!init_mic_address(ptr_data
)) {
2086 *static_cast<void**>(m_vars
[i
].into
) = base
=
2087 reinterpret_cast<void*>(ptr_data
->mic_addr
);
2089 if (ptr_data
->add_reference() == 0 &&
2090 ptr_data
->mic_buf
!= 0) {
2091 // add buffer to the list of buffers that
2092 // are passed to dispatch call
2093 m_compute_buffers
.push_back(
2097 // will send buffer address to device
2098 m_vars
[i
].flags
.sink_addr
= 1;
2101 if (!ptr_data
->is_static
) {
2102 // need to add reference for buffer
2103 m_need_runfunction
= true;
2107 // use existing association from pointer table
2108 if (!find_ptr_data(ptr_data
, base
, into_disp
,
2109 size
, m_vars
[i
].flags
.targetptr
, true)) {
2112 m_vars
[i
].flags
.sink_addr
= 1;
2115 if (ptr_data
->alloc_disp
!= 0) {
2116 m_vars
[i
].flags
.alloc_disp
= 1;
2117 m_in_datalen
+= sizeof(alloc_disp
);
2120 if (m_vars
[i
].flags
.sink_addr
) {
2121 // get buffers's address on the sink
2122 if (!init_mic_address(ptr_data
)) {
2126 m_in_datalen
+= sizeof(ptr_data
->mic_addr
);
2129 if (!ptr_data
->is_static
&& m_vars
[i
].free_if
) {
2130 // need to decrement buffer reference on target
2131 m_need_runfunction
= true;
2134 // copy other pointer properties to var descriptor
2135 m_vars
[i
].mic_offset
= ptr_data
->mic_offset
;
2136 m_vars
[i
].flags
.is_static_dstn
= ptr_data
->is_static
;
2139 if (!find_ptr_data(ptr_data
,
2148 into_offset
= ptr_data
?
2150 (char*) ptr_data
->cpu_addr
.start() :
2153 // save pointer data
2154 m_vars_extra
[i
].dst_data
= ptr_data
;
2164 case c_dv_data_slice
:
2165 case c_dv_ptr_data_slice
:
2166 if (m_vars
[i
].direction
.bits
||
2167 m_vars
[i
].alloc_if
||
2168 m_vars
[i
].free_if
) {
2175 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.dst
)) {
2176 ap
= static_cast<const Arr_Desc
*>(m_vars
[i
].into
);
2179 ARRAY_DESC_DUMP(" ", "INTO", ap
, 0, src_is_for_mic
);
2181 dvp
= (m_vars
[i
].type
.dst
== c_dv_data_slice
) ?
2182 reinterpret_cast<ArrDesc
*>(ap
->base
) :
2183 *reinterpret_cast<ArrDesc
**>(ap
->base
);
2186 dvp
= (m_vars
[i
].type
.dst
== c_dv_data
) ?
2187 static_cast<ArrDesc
*>(m_vars
[i
].into
) :
2188 *static_cast<ArrDesc
**>(m_vars
[i
].into
);
2190 if (!__dv_is_contiguous(dvp
)) {
2191 m_vars
[i
].flags
.is_noncont_dst
= 1;
2192 m_vars_extra
[i
].read_rng_dst
=
2193 init_read_ranges_dv(dvp
);
2195 // size and displacement
2196 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.dst
)) {
2197 // offset and length are derived from the array
2199 __arr_data_offset_and_length(ap
, into_disp
, size
);
2200 if (m_vars
[i
].direction
.bits
) {
2201 if (!is_arr_desc_contiguous(ap
)) {
2202 if (m_vars
[i
].flags
.is_noncont_dst
) {
2203 LIBOFFLOAD_ERROR(c_slice_of_noncont_array
);
2206 m_vars
[i
].flags
.is_noncont_dst
= 1;
2207 m_vars_extra
[i
].read_rng_dst
=
2208 init_read_ranges_arr_desc(ap
);
2209 if (!cean_ranges_match(
2210 m_vars_extra
[i
].read_rng_src
,
2211 m_vars_extra
[i
].read_rng_dst
)) {
2212 LIBOFFLOAD_ERROR(c_ranges_dont_match
);
2218 if (m_vars
[i
].flags
.has_length
) {
2219 size
= __dv_data_length(dvp
, m_vars
[i
].count
);
2222 size
= __dv_data_length(dvp
);
2228 m_vars_extra
[i
].read_rng_src
?
2229 cean_get_transf_size(m_vars_extra
[i
].read_rng_src
) :
2232 m_vars_extra
[i
].read_rng_dst
?
2233 cean_get_transf_size(m_vars_extra
[i
].read_rng_dst
) :
2235 // It's supposed that "into" size must be not less
2237 if (size_src
> size_dst
) {
2238 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes
,
2239 size_src
, size_dst
);
2244 void *base
= reinterpret_cast<void*>(dvp
->Base
);
2247 if (m_vars
[i
].direction
.in
) {
2248 if (m_vars
[i
].alloc_if
) {
2250 if (!alloc_ptr_data(
2252 reinterpret_cast<char *>(base
) + alloc_disp
,
2253 (alloc_base
!= NULL
) ?
2254 alloc_disp
: into_disp
,
2255 (alloc_base
!= NULL
) ?
2258 (alloc_base
!= NULL
) ?
2259 0 : m_vars
[i
].align
,
2260 m_vars
[i
].flags
.targetptr
,
2261 m_vars
[i
].flags
.preallocated
,
2262 m_vars
[i
].flags
.pin
)) {
2265 if (ptr_data
->add_reference() == 0 &&
2266 ptr_data
->mic_buf
!=0) {
2267 // add buffer to the list of buffers
2268 // that are passed to dispatch call
2269 m_compute_buffers
.push_back(
2273 // will send buffer address to device
2274 m_vars
[i
].flags
.sink_addr
= 1;
2277 if (!ptr_data
->is_static
) {
2278 // need to add reference for buffer
2279 m_need_runfunction
= true;
2283 // use existing association from pointer table
2284 if (!find_ptr_data(ptr_data
, base
, into_disp
,
2285 size
, m_vars
[i
].flags
.targetptr
, true)) {
2289 // need to update base in dope vector on device
2290 m_vars
[i
].flags
.sink_addr
= 1;
2293 if (ptr_data
->alloc_disp
!= 0) {
2294 m_vars
[i
].flags
.alloc_disp
= 1;
2295 m_in_datalen
+= sizeof(alloc_disp
);
2298 if (m_vars
[i
].flags
.sink_addr
) {
2299 // get buffers's address on the sink
2300 if (!init_mic_address(ptr_data
)) {
2303 m_in_datalen
+= sizeof(ptr_data
->mic_addr
);
2306 if (!ptr_data
->is_static
&& m_vars
[i
].free_if
) {
2307 // need to decrement buffer reference on target
2308 m_need_runfunction
= true;
2311 // offset to base from the beginning of the buffer
2314 (char*) base
- (char*) ptr_data
->cpu_addr
.start();
2316 // copy other pointer properties to var descriptor
2317 m_vars
[i
].mic_offset
= ptr_data
->mic_offset
;
2318 m_vars
[i
].flags
.is_static_dstn
= ptr_data
->is_static
;
2320 else { // src_is_for_mic
2321 if (!find_ptr_data(ptr_data
,
2328 into_offset
= !ptr_data
?
2330 (char*) base
- (char*) ptr_data
->cpu_addr
.start();
2333 // save pointer data
2334 m_vars_extra
[i
].dst_data
= ptr_data
;
2339 LIBOFFLOAD_ERROR(c_unknown_var_type
, m_vars
[i
].type
.src
);
2342 // if into is used at CPU save its offset and disp
2343 if (m_vars
[i
].direction
.out
) {
2344 m_vars_extra
[i
].cpu_offset
= into_offset
;
2345 m_vars_extra
[i
].cpu_disp
= into_disp
;
2348 if (m_vars
[i
].flags
.is_stack_buf
) {
2349 into_offset
= static_cast<char*>(m_vars
[i
].into
) -
2350 m_device
.m_persist_list
.front().cpu_stack_addr
;
2352 m_vars
[i
].offset
= into_offset
;
2353 m_vars
[i
].disp
= into_disp
;
2360 bool OffloadDescriptor::setup_misc_data(const char *name
)
2362 OffloadTimer
timer(get_timer_data(), c_offload_host_setup_misc_data
);
2364 // we can skip run functon call together with wait if offloaded
2365 // region is empty and there is no user defined non-pointer IN/OUT data
2366 if (m_need_runfunction
) {
2367 // variable descriptors are sent as input data
2368 m_in_datalen
+= m_vars_total
* sizeof(VarDesc
);
2370 // timer data is sent as a part of the output data
2371 m_out_datalen
+= OFFLOAD_TIMER_DATALEN();
2373 // max from input data and output data length
2374 uint64_t data_len
= m_in_datalen
> m_out_datalen
? m_in_datalen
:
2377 // Misc data has the following layout
2378 // <Function Descriptor>
2380 // <In/Out Data> (optional)
2382 // We can transfer copyin/copyout data in misc/return data which can
2383 // be passed to run function call if its size does not exceed
2384 // COI_PIPELINE_MAX_IN_MISC_DATA_LEN. Otherwise we have to allocate
2387 m_func_desc_size
= sizeof(FunctionDescriptor
) + strlen(name
) + 1;
2388 m_func_desc_size
= (m_func_desc_size
+ 7) & ~7;
2390 int misc_data_offset
= 0;
2391 int misc_data_size
= 0;
2393 if (m_func_desc_size
+
2394 m_in_datalen
<= COI_PIPELINE_MAX_IN_MISC_DATA_LEN
&&
2395 m_out_datalen
<= COI_PIPELINE_MAX_IN_MISC_DATA_LEN
) {
2396 // use misc/return data for copyin/copyout
2397 misc_data_offset
= m_func_desc_size
;
2398 misc_data_size
= data_len
;
2401 OffloadTimer
timer_buf(get_timer_data(),
2402 c_offload_host_alloc_data_buffer
);
2404 // send/receive data using buffer
2405 COIRESULT res
= COI::BufferCreate(data_len
,
2408 1, &m_device
.get_process(),
2410 if (res
!= COI_SUCCESS
) {
2411 if (m_status
!= 0) {
2412 m_status
->result
= translate_coi_error(res
);
2415 report_coi_error(c_buf_create
, res
);
2418 m_compute_buffers
.push_back(m_inout_buf
);
2419 m_destroy_buffers
.push_back(m_inout_buf
);
2423 // initialize function descriptor
2424 m_func_desc
= (FunctionDescriptor
*) malloc(m_func_desc_size
+
2426 if (m_func_desc
== NULL
)
2427 LIBOFFLOAD_ERROR(c_malloc
);
2428 m_func_desc
->console_enabled
= console_enabled
;
2429 m_func_desc
->timer_enabled
= offload_report_enabled
&&
2430 (timer_enabled
|| offload_report_level
);
2431 m_func_desc
->offload_report_level
= offload_report_enabled
?
2432 offload_report_level
: 0;
2433 m_func_desc
->offload_number
= GET_OFFLOAD_NUMBER(get_timer_data());
2434 m_func_desc
->in_datalen
= m_in_datalen
;
2435 m_func_desc
->out_datalen
= m_out_datalen
;
2436 m_func_desc
->vars_num
= m_vars_total
;
2437 m_func_desc
->data_offset
= misc_data_offset
;
2439 // append entry name
2440 strcpy(m_func_desc
->data
, name
);
2446 void OffloadDescriptor::setup_omp_async_info()
2448 OFFLOAD_TRACE(2, "setup_omp_async_info\n");
2449 OmpAsyncLastEventType event_type
= m_need_runfunction
?
2450 c_last_runfunc
: c_last_write
;
2451 int last_in
= m_need_runfunction
? 0 : -1;
2454 for (i
= m_vars_total
- 1; i
>=0; i
--) {
2455 switch (m_vars
[i
].type
.dst
) {
2459 if (m_vars
[i
].direction
.out
&&
2460 m_vars
[i
].flags
.is_static_dstn
) {
2461 event_type
= c_last_read
;
2463 else if (last_in
< 0 && m_vars
[i
].direction
.in
&&
2464 m_vars
[i
].flags
.is_static_dstn
) {
2470 case c_cean_var_ptr
:
2474 case c_dv_data_slice
:
2475 case c_dv_ptr_data_slice
:
2477 if (m_vars
[i
].direction
.out
) {
2478 event_type
= c_last_read
;
2480 else if (last_in
< 0 && m_vars
[i
].direction
.in
) {
2487 if (event_type
== c_last_read
) {
2492 if (event_type
== c_last_read
) {
2493 m_vars_extra
[i
].omp_last_event_type
= c_last_read
;
2495 else if (event_type
== c_last_write
) {
2496 m_vars_extra
[last_in
].omp_last_event_type
= c_last_write
;
2498 m_omp_async_last_event_type
= event_type
;
2499 OFFLOAD_TRACE(2, "setup_omp_async_info: event_type=%d\n",
2500 m_omp_async_last_event_type
);
2504 void offload_proxy_task_completed_ooo(
2510 /* TODO: Call callback function, pass info. */
2514 void OffloadDescriptor::register_omp_event_call_back(
2515 const COIEVENT
*event
,
2518 OFFLOAD_TRACE(2, "register_omp_event_call_back(event=%p, info=%p)\n",
2520 if (COI::EventRegisterCallback
) {
2521 COI::EventRegisterCallback(
2523 &offload_proxy_task_completed_ooo
,
2526 "COI::EventRegisterCallback found; callback registered\n");
2530 bool OffloadDescriptor::wait_dependencies(
2533 _Offload_stream handle
2536 OffloadTimer
timer(get_timer_data(), c_offload_host_wait_deps
);
2538 OffloadDescriptor
*task
;
2539 if (num_waits
== 0) {
2544 if (num_waits
== -1) {
2546 // some specific stream of the device
2548 stream
= Stream::find_stream(handle
, false);
2550 // the stream was not created or was destroyed
2552 LIBOFFLOAD_ERROR(c_offload_no_stream
, m_device
.get_logical_index());
2555 task
= stream
->get_last_offload();
2557 // offload was completed by previous offload_wait pragma
2562 if (!task
->offload_finish(0)) { //arg is 0 for is_traceback
2566 stream
->set_last_offload(NULL
);
2569 // all streams of the device or over all devices
2571 StreamMap stream_map
= Stream::all_streams
;
2572 for (StreamMap::iterator it
= stream_map
.begin();
2573 it
!= stream_map
.end(); it
++) {
2574 Stream
* stream
= it
->second
;
2576 if (!m_wait_all_devices
&&
2577 stream
->get_device() != m_device
.get_logical_index()) {
2580 // get associated async task
2581 OffloadDescriptor
*task
= stream
->get_last_offload();
2583 // offload was completed by offload_wait pragma or wait clause
2587 if (!task
->offload_finish(0)) { //arg is 0 for is_traceback
2591 stream
->set_last_offload(NULL
);
2594 // no uncompleted streams
2599 // if handle is equal to no_stream it's wait for signals
2600 for (int i
= 0; i
< num_waits
; i
++) {
2601 _Offload_stream stream_handle
;
2603 task
= m_device
.find_signal(waits
[i
], true);
2605 LIBOFFLOAD_ERROR(c_offload1
, m_device
.get_logical_index(),
2609 else if (task
== SIGNAL_IS_REMOVED
) {
2612 if (!task
->offload_finish(0)) { //arg is 0 for is_traceback
2616 // if the offload both has signal and is last offload of its
2617 // stream, we must wipe out the "last_offload" reference as
2618 // the offload already is finished.
2619 stream_handle
= task
->m_stream
;
2620 if (stream_handle
!= -1) {
2621 stream
= Stream::find_stream(stream_handle
, false);
2622 if (stream
&& stream
->get_last_offload() == task
) {
2623 stream
->set_last_offload(NULL
);
2632 bool OffloadDescriptor::offload_wrap(
2640 const void **signal
,
2642 const void *stack_addr
,
2643 OffloadFlags offload_flags
2646 OffloadWaitKind wait_kind
= c_offload_wait_signal
;
2647 bool is_traceback
= offload_flags
.bits
.fortran_traceback
;
2649 // define kind of wait if any;
2650 // there can be one off the following kind:
2651 // 1. c_offload_wait_signal for "offload_wait wait(signal)"
2652 // 2. c_offload_wait_stream for "offload_wait stream(stream)"
2653 // 3. c_offload_wait_all_streams for "offload_wait stream(0)"
2654 if (num_waits
== -1) {
2655 wait_kind
= (m_stream
== 0) ?
2656 c_offload_wait_all_streams
:
2657 c_offload_wait_stream
;
2660 const char *stream_str
;
2662 if (m_stream
== no_stream
|| num_waits
>= 0) {
2663 stream_str
= "none";
2665 else if (m_stream
== 0) {
2669 sprintf(buf
, "%#llx", m_stream
);
2674 OFFLOAD_DEBUG_TRACE_1(1,
2675 GET_OFFLOAD_NUMBER(get_timer_data()),
2676 c_offload_init_func
,
2677 "Offload function %s, is_empty=%d, #varDescs=%d, "
2678 "signal=none, stream=%s, #waits=%d%c",
2679 name
, is_empty
, vars_total
, stream_str
, num_waits
,
2680 num_waits
== 0 ? '\n' : ' ');
2681 // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
2682 // since the number of waits is not fixed.
2683 if (!OFFLOAD_DO_TRACE
&& (console_enabled
>= 1)) {
2686 if (m_stream
== no_stream
) {
2687 printf("%p", waits
[0]);
2688 for (int i
= 1; i
< num_waits
; i
++) {
2689 printf(", %p", waits
[i
]);
2692 else if (m_stream
!= 0) {
2693 printf("%#x", m_stream
);
2696 printf(" all streams");
2703 // stream in wait is reported further in OFFLOAD_REPORT for waits
2704 if (m_stream
!= no_stream
&& num_waits
== 0) {
2705 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2709 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2714 OFFLOAD_DEBUG_TRACE_1(1,
2715 GET_OFFLOAD_NUMBER(get_timer_data()),
2716 c_offload_init_func
,
2717 "Offload function %s, is_empty=%d, #varDescs=%d, "
2718 "signal=%p, stream=%s, #waits=%d%c",
2719 name
, is_empty
, vars_total
, *signal
, stream_str
, num_waits
,
2720 num_waits
== 0 ? '\n' : ' ');
2721 // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
2722 // since the number of waits is not fixed.
2723 if (!OFFLOAD_DO_TRACE
&& (console_enabled
>= 1)) {
2726 if (m_stream
== no_stream
) {
2727 printf("%p", waits
[0]);
2728 for (int i
= 1; i
< num_waits
; i
++) {
2729 printf(", %p", waits
[i
]);
2733 else if (m_stream
!= 0) {
2734 printf("%#x", m_stream
);
2737 printf(" all streams");
2744 // stream in wait is reported further in OFFLOAD_REPORT for waits
2745 if (m_stream
!= no_stream
&& num_waits
== 0) {
2746 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2750 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2754 if (console_enabled
>= 1 && offload_flags
.flags
!= 0) {
2755 trace_offload_flags(get_timer_data(), offload_flags
);
2758 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2759 c_offload_wait
, "%d\n",
2760 wait_kind
, num_waits
,
2761 (wait_kind
== c_offload_wait_signal
) ?
2763 reinterpret_cast<const void **>(m_stream
));
2765 if (m_status
!= 0) {
2766 m_status
->result
= OFFLOAD_SUCCESS
;
2767 m_status
->device_number
= m_device
.get_logical_index();
2770 m_initial_need_runfunction
= m_need_runfunction
= !is_empty
;
2772 // wait for dependencies to finish
2773 if (!wait_dependencies(waits
, num_waits
, m_stream
)) {
2779 if (!setup_descriptors(vars
, vars2
, vars_total
, entry_id
, stack_addr
)) {
2784 if (offload_flags
.bits
.omp_async
) {
2785 setup_omp_async_info();
2788 // initiate send for pointers. Want to do it as early as possible.
2789 if (!send_pointer_data(signal
!= 0 || offload_flags
.bits
.omp_async
,
2795 // setup misc data for run function
2796 if (!setup_misc_data(name
)) {
2801 // gather copyin data into buffer
2802 if (!gather_copyin_data()) {
2807 // Start the computation
2808 if (!compute(signal
)) {
2813 // initiate receive for pointers
2814 if (!receive_pointer_data(signal
!= 0 || offload_flags
.bits
.omp_async
,
2819 if (offload_flags
.bits
.omp_async
) {
2822 // if there is a signal or stream save descriptor for the later use.
2823 // num_waits == -1 is for offload_wait and there is nothing to save
2824 if (num_waits
!= -1 && (signal
!= 0 || m_stream
!= no_stream
)) {
2826 m_device
.add_signal(*signal
, this);
2829 if (m_stream
!= no_stream
&& m_stream
!= 0) {
2830 Stream
* stream
= Stream::find_stream(m_stream
, false);
2832 stream
->set_last_offload(this);
2835 LIBOFFLOAD_ERROR(c_offload_no_stream
, m_device
.get_logical_index());
2839 // if there is a clause with alloc_if(1) and preallocated need to call
2840 // offload_finish after runfunction
2841 if (!m_preallocated_alloc
) {
2846 // wait for the offload to finish.
2847 if (!offload_finish(is_traceback
)) {
// Top-level driver for a single offload: forwards all arguments to
// offload_wrap() and, on failure, lets the Fortran runtime continue the
// traceback that began on the coprocessor (MIC) side.
// NOTE(review): extraction artifact -- each original source line is split
// across several physical lines and some original lines (2857-2863, 2865,
// 2868-2870, 2876, 2880+) are missing; comments annotate only what is
// visible. The full parameter list is not visible here.
2856 bool OffloadDescriptor::offload(
2864 const void **signal
,
2866 const void *stack_addr
,
2867 OffloadFlags offload_flags
// Delegate descriptor setup, data transfer and compute to offload_wrap();
// 'res' carries its success/failure status.
2871 res
= offload_wrap(name
, is_empty
, vars
, vars2
, vars_total
,
2872 waits
, num_waits
, signal
, entry_id
,
2873 stack_addr
, offload_flags
);
// On failure -- and only if a traceback has not already been produced --
// invoke the Fortran traceback hook when the offload was flagged as
// coming from Fortran code.
2874 if (res
== false && !m_traceback_called
) {
2875 if (offload_flags
.bits
.fortran_traceback
) {
2877 "Calling Fortran library to continue traceback from MIC\n");
2878 FORTRAN_TRACE_BACK(m_status
->result
);
// Latch the flag so the traceback is emitted at most once per descriptor.
2879 m_traceback_called
= true;
// Completes an offload: waits for compute dependencies, scatters copyout
// data, waits for readback dependencies, and destroys buffers queued for
// destruction. Error paths store a translated COI error into m_status
// (when present) or report it fatally via report_coi_error().
// NOTE(review): extraction artifact -- many original lines are missing
// (e.g. 2886-2890, 2894, 2896-2897, 2899, 2901-2902, 2904-2905, ...),
// including braces and parts of conditions; comments annotate only the
// visible statements.
2885 bool OffloadDescriptor::offload_finish(
2891 // wait for compute dependencies to become signaled
2892 if (m_in_deps_total
> 0) {
2893 OffloadTimer
timer(get_timer_data(), c_offload_host_wait_compute
);
// Active wait: poll with a zero timeout in a loop while COI keeps
// returning COI_TIME_OUT_REACHED (busy-wait keeps the host thread hot).
2895 if (__offload_active_wait
) {
2898 res
= COI::EventWait(m_in_deps_total
, m_in_deps
, 0, 1, 0, 0);
2900 while (res
== COI_TIME_OUT_REACHED
);
// Passive wait: -1 timeout blocks until all events are signaled.
2903 res
= COI::EventWait(m_in_deps_total
, m_in_deps
, -1, 1, 0, 0);
// Compute-wait failure: record the translated error, emit the Fortran
// traceback once, or report fatally when there is no status object.
2906 if (res
!= COI_SUCCESS
) {
2907 if (m_status
!= 0 && !m_traceback_called
) {
2908 m_status
->result
= translate_coi_error(res
);
2911 "Calling Fortran library to continue traceback from MIC\n");
2912 FORTRAN_TRACE_BACK(m_status
->result
);
2913 m_traceback_called
= true;
// Caller-requested traceback path (is_traceback), still at most once.
2918 if (is_traceback
&& !m_traceback_called
) {
2920 "Calling Fortran library to continue traceback from MIC\n");
2921 FORTRAN_TRACE_BACK(OFFLOAD_ERROR
);
2922 m_traceback_called
= true;
2925 report_coi_error(c_event_wait
, res
);
2929 // scatter copyout data received from target
2930 if (!scatter_copyout_data()) {
// Second receive pass for preallocated OUT pointers (first_run == false);
// only needed when m_out_with_preallocated was set during the first pass.
2934 if (m_out_with_preallocated
&&
2935 !receive_pointer_data(m_out_deps_total
> 0, false, NULL
)) {
2940 // wait for receive dependencies to become signaled
2941 if (m_out_deps_total
> 0) {
2942 OffloadTimer
timer(get_timer_data(), c_offload_host_wait_buffers_reads
);
// Same active/passive wait pattern as above, now for readback events.
2944 if (__offload_active_wait
) {
2947 res
= COI::EventWait(m_out_deps_total
, m_out_deps
, 0, 1, 0, 0);
2949 while (res
== COI_TIME_OUT_REACHED
);
2952 res
= COI::EventWait(m_out_deps_total
, m_out_deps
, -1, 1, 0, 0);
2955 if (res
!= COI_SUCCESS
) {
2956 if (m_status
!= 0) {
2957 m_status
->result
= translate_coi_error(res
);
2960 report_coi_error(c_event_wait
, res
);
// Destroy every COI buffer queued in m_destroy_buffers now that all
// transfers that referenced them have completed.
2966 OffloadTimer
timer(get_timer_data(), c_offload_host_destroy_buffers
);
2968 for (BufferList::const_iterator it
= m_destroy_buffers
.begin();
2969 it
!= m_destroy_buffers
.end(); it
++) {
2970 res
= COI::BufferDestroy(*it
);
2971 if (res
!= COI_SUCCESS
) {
2972 if (m_status
!= 0) {
2973 m_status
->result
= translate_coi_error(res
);
2976 report_coi_error(c_buf_destroy
, res
);
// Final teardown for the offload descriptor: returns the device slot to
// the ORSL resource manager, stops the total-offload timer, and emits the
// offload report epilog for this timer data.
// NOTE(review): extraction artifact -- braces and some lines (2985, 2988,
// 2990-2991) are missing from this view.
2984 void OffloadDescriptor::cleanup()
2986 // release device in orsl
2987 ORSL::release(m_device
.get_logical_index());
2989 OFFLOAD_TIMER_STOP(get_timer_data(), c_offload_host_total_offload
);
2992 Offload_Report_Epilog(get_timer_data());
// Non-blocking poll: returns true only when every compute (m_in_deps) and
// readback (m_out_deps) event has already been signaled. A zero timeout in
// COI::EventWait makes each check a poll rather than a wait.
// NOTE(review): extraction artifact -- the declaration of 'res'
// (presumably COIRESULT, original line ~2998) and the trailing
// 'return signaled;' are missing from this view.
2995 bool OffloadDescriptor::is_signaled()
2997 bool signaled
= true;
3000 // check compute and receive dependencies
3001 if (m_in_deps_total
> 0) {
3002 res
= COI::EventWait(m_in_deps_total
, m_in_deps
, 0, 1, 0, 0);
// Accumulate: stays true only while every polled set reports success.
3003 signaled
= signaled
&& (res
== COI_SUCCESS
);
3005 if (m_out_deps_total
> 0) {
3006 res
= COI::EventWait(m_out_deps_total
, m_out_deps
, 0, 1, 0, 0);
3007 signaled
= signaled
&& (res
== COI_SUCCESS
);
// Allocates and fills a rank-1 Arr_Desc describing a contiguous slice:
// elements [extent_start_val, extent_start_val + extent_elements_val - 1]
// with stride 1 and element size 'size', based at 'ptr_val'.
// NOTE(review): extraction artifact -- the 'ptr_val' and 'size' parameter
// lines (original 3014, 3017), the local 'res' declaration, the malloc
// NULL-check condition (original 3022, guarding the LIBOFFLOAD_ERROR call
// below), the rank assignment, and the 'return res;' are all missing from
// this view; annotate only what is visible.
3013 static Arr_Desc
* make_arr_desc(
3015 int64_t extent_start_val
,
3016 int64_t extent_elements_val
,
// Heap-allocate the descriptor; on allocation failure the (not visible)
// NULL check reports c_malloc via LIBOFFLOAD_ERROR.
3021 res
= (Arr_Desc
*)malloc(sizeof(Arr_Desc
));
3023 LIBOFFLOAD_ERROR(c_malloc
);
// Base address is stored as an integer inside the descriptor.
3024 res
->base
= reinterpret_cast<int64_t>(ptr_val
);
// Single dimension: element size, zero logical index, inclusive
// lower/upper bounds, unit stride.
3026 res
->dim
[0].size
= size
;
3027 res
->dim
[0].lindex
= 0;
3028 res
->dim
[0].lower
= extent_start_val
;
3029 res
->dim
[0].upper
= extent_elements_val
+ extent_start_val
- 1;
3030 res
->dim
[0].stride
= 1;
3034 // Send pointer data if source or destination or both of them are
3035 // noncontiguous. There is guarantee that length of destination enough for
3036 // transferred data.
3037 bool OffloadDescriptor::send_noncontiguous_pointer_data(
3042 uint64_t &data_sent
,
3043 uint32_t in_deps_amount
,
3047 int64_t offset_src
, offset_dst
;
3048 int64_t length_src
, length_dst
;
3049 int64_t length_src_cur
, length_dst_cur
;
3052 bool dst_is_empty
= true;
3053 bool src_is_empty
= true;
3057 // Set length_src and length_dst
3058 length_src
= (m_vars_extra
[i
].read_rng_src
) ?
3059 m_vars_extra
[i
].read_rng_src
->range_size
: m_vars
[i
].size
;
3060 length_dst
= !m_vars
[i
].into
? length_src
:
3061 (m_vars_extra
[i
].read_rng_dst
) ?
3062 m_vars_extra
[i
].read_rng_dst
->range_size
: m_vars
[i
].size
;
3063 send_size
= (length_src
< length_dst
) ? length_src
: length_dst
;
3065 // If BufferWriteMultiD is defined we can set values of required arguments
3066 // and transfer noncontiguous data via call to the COI routine.
3067 if (__offload_use_coi_noncontiguous_transfer
&& COI::BufferWriteMultiD
) {
3068 struct Arr_Desc
* arr_desc_dst
;
3069 struct Arr_Desc
* arr_desc_src
;
3070 int64_t size_src
, size_dst
;
3071 char *base
= offload_get_src_base(static_cast<char*>(m_vars
[i
].ptr
),
3072 m_vars
[i
].type
.src
);
3073 COIBUFFER dst_buf
= m_vars
[i
].into
?
3074 m_vars_extra
[i
].dst_data
->mic_buf
:
3075 m_vars_extra
[i
].src_data
->mic_buf
;
3077 offset_src
= (m_vars_extra
[i
].read_rng_src
)?
3078 m_vars_extra
[i
].read_rng_src
->init_offset
: m_vars_extra
[i
].cpu_disp
;
3079 size_src
= m_vars_extra
[i
].read_rng_src
?
3080 cean_get_transf_size(m_vars_extra
[i
].read_rng_src
) :
3083 offset_dst
= (m_vars_extra
[i
].read_rng_dst
)?
3084 m_vars_extra
[i
].read_rng_dst
->init_offset
: m_vars
[i
].disp
;
3085 size_dst
= m_vars_extra
[i
].read_rng_dst
?
3086 cean_get_transf_size(m_vars_extra
[i
].read_rng_dst
) : m_vars
[i
].size
;
3088 int64_t el_size
= (!m_vars
[i
].into
||
3089 (m_vars_extra
[i
].read_rng_src
&& m_vars_extra
[i
].read_rng_dst
)) ?
3091 m_vars_extra
[i
].read_rng_src
?
3092 m_vars_extra
[i
].read_rng_src
->arr_desc
->dim
[
3093 m_vars_extra
[i
].read_rng_src
->arr_desc
->rank
- 1].size
:
3094 m_vars_extra
[i
].read_rng_dst
->arr_desc
->dim
[
3095 m_vars_extra
[i
].read_rng_dst
->arr_desc
->rank
- 1].size
;
3097 arr_desc_src
= (m_vars_extra
[i
].read_rng_src
) ?
3098 m_vars_extra
[i
].read_rng_src
->arr_desc
:
3099 make_arr_desc(NULL
, // don't required for source
3100 offset_src
/el_size
, size_src
/el_size
, el_size
);
3102 arr_desc_dst
= !m_vars
[i
].into
?
3104 (m_vars_extra
[i
].read_rng_dst
) ?
3105 m_vars_extra
[i
].read_rng_dst
->arr_desc
:
3107 offset_dst
/el_size
, size_src
/el_size
, el_size
);
3109 int64_t alloc_disp
= m_vars
[i
].into
?
3110 m_vars_extra
[i
].dst_data
->alloc_disp
:
3111 m_vars_extra
[i
].src_data
->alloc_disp
;
3113 arr_desc_src
->base
= reinterpret_cast<int64_t>(base
);
3114 arr_desc_dst
->base
= 0;
3116 res
= COI::BufferWriteMultiD(
3117 dst_buf
, // in_DestBuffer,
3118 m_device
.get_process(), // DestProcess,
3119 m_vars
[i
].offset
+ m_vars
[i
].mic_offset
-
3120 alloc_disp
, // Offset
3121 (void*)arr_desc_dst
, // descriptor of DestArray
3122 (void*)arr_desc_src
, // descriptor of SrcArray
3123 COI_COPY_UNSPECIFIED
, // Type
3124 in_deps_amount
, // Number of in Dependencies
3125 in_deps
, // array of in Dependencies
3126 event
); // out Dependency
3127 if (res
!= COI_SUCCESS
) {
3128 if (m_status
!= 0) {
3129 m_status
->result
= translate_coi_error(res
);
3132 report_coi_error(c_buf_copy
, res
);
3137 // if event is defined we must multiplate it for all contiguous intervals
3138 // that will be Copied/Write.
3139 // Take in account that we already have 1 event.
3141 m_in_deps_allocated
+= (length_src
/ send_size
) *
3142 ((m_vars_extra
[i
].read_rng_src
) ?
3143 m_vars_extra
[i
].read_rng_src
->range_max_number
: 1) ;
3145 (COIEVENT
*)realloc(m_in_deps
, sizeof(COIEVENT
) * m_in_deps_allocated
);
3149 // consequently get contiguous ranges,
3150 // define corresponded destination offset and send data
3153 if (m_vars_extra
[i
].read_rng_src
) {
3154 if (!get_next_range(m_vars_extra
[i
].read_rng_src
,
3156 // source ranges are over - nothing to send
3160 else if (data_sent
== 0) {
3161 offset_src
= m_vars_extra
[i
].cpu_disp
;
3166 length_src_cur
= length_src
;
3169 // if source is contiguous or its contiguous range is greater
3170 // than destination one
3171 offset_src
+= send_size
;
3173 length_src_cur
-= send_size
;
3174 src_is_empty
= length_src_cur
== 0;
3177 if (m_vars
[i
].into
) {
3178 if (m_vars_extra
[i
].read_rng_dst
) {
3179 if (!get_next_range(m_vars_extra
[i
].read_rng_dst
,
3181 // destination ranges are over
3182 LIBOFFLOAD_ERROR(c_destination_is_over
);
3186 // into is contiguous.
3188 offset_dst
= m_vars
[i
].disp
;
3190 length_dst_cur
= length_dst
;
3194 offset_dst
= offset_src
;
3195 length_dst_cur
= length_src
;
3199 // if destination is contiguous or its contiguous range is greater
3201 offset_dst
+= send_size
;
3203 length_dst_cur
-= send_size
;
3204 dst_is_empty
= length_dst_cur
== 0;
3207 event
= &m_in_deps
[m_in_deps_total
++];
3209 if (src_data
!= 0 && src_data
->cpu_buf
!= 0) {
3210 res
= COI::BufferCopy(
3213 m_vars
[i
].mic_offset
+
3214 m_vars
[i
].offset
+ offset_dst
,
3215 m_vars_extra
[i
].cpu_offset
+ offset_src
,
3217 COI_COPY_UNSPECIFIED
,
3218 in_deps_amount
, in_deps
,
3220 if (res
!= COI_SUCCESS
) {
3221 if (m_status
!= 0) {
3222 m_status
->result
= translate_coi_error(res
);
3225 report_coi_error(c_buf_copy
, res
);
3229 char *base
= offload_get_src_base(m_vars
[i
].ptr
,
3230 m_vars
[i
].type
.src
);
3232 res
= COI::BufferWrite(
3234 m_vars
[i
].mic_offset
+
3235 m_vars
[i
].offset
+ offset_dst
,
3238 COI_COPY_UNSPECIFIED
,
3239 in_deps_amount
, in_deps
,
3241 if (res
!= COI_SUCCESS
) {
3242 if (m_status
!= 0) {
3243 m_status
->result
= translate_coi_error(res
);
3246 report_coi_error(c_buf_write
, res
);
3249 data_sent
+= send_size
;
3255 bool OffloadDescriptor::send_pointer_data(bool is_async
, void* info
)
3257 OffloadTimer
timer(get_timer_data(), c_offload_host_send_pointers
);
3259 bool should_use_async_buffer_write
= m_initial_need_runfunction
;
3260 uint64_t ptr_sent
= 0;
3262 uint32_t in_deps_amount
= 0;
3263 COIEVENT
*in_deps
= NULL
;
3265 // For offload_transfer and offload with empty body without signal:
3266 // - if there is only one buffer copy - send data synchronously
3267 // - if there are multiple buffer copy and
3268 // __offload_parallel_copy is false - send data synchronously
3269 // - if there are multiple buffer copy and
3270 // __offload_parallel_copy is true - send data asynchronously
3271 // It concerns only big size data - greater than __offload_use_async_buffer_write.
3272 // Data of size less than __offload_use_async_buffer_write are sent synchronously.
3273 // Synchronous transfer results in better performance in COI.
3274 // __offload_parallel_copy is false by default but can be changed
3275 // via environment variable OFFLOAD_PARALLEL_COPY
3276 if (!m_initial_need_runfunction
&& __offload_parallel_copy
) {
3277 int big_size_count
= 0;
3278 for (int i
= 0; i
< m_vars_total
; i
++) {
3279 if (m_vars
[i
].direction
.in
&&
3280 m_vars
[i
].size
>= __offload_use_async_buffer_write
) {
3281 switch (m_vars
[i
].type
.dst
) {
3285 if (m_vars
[i
].flags
.is_static_dstn
) {
3291 case c_cean_var_ptr
:
3295 case c_dv_data_slice
:
3296 case c_dv_ptr_data_slice
:
3304 if (big_size_count
> 1) {
3305 should_use_async_buffer_write
= true;
3309 if (m_stream
!= no_stream
&& m_vars_total
!= 0) {
3310 get_stream_in_dependencies(in_deps_amount
, in_deps
);
3313 // Initiate send for pointer data
3314 for (int i
= 0; i
< m_vars_total
; i
++) {
3315 uint64_t sent_data
= m_vars
[i
].size
;
3316 uint32_t in_deps_amount_save
;
3317 COIEVENT
*in_deps_save
;
3319 if (m_vars_extra
[i
].omp_last_event_type
== c_last_write
) {
3320 in_deps_amount_save
= in_deps_amount
;
3321 in_deps_save
= in_deps
;
3322 in_deps_amount
= m_in_deps_total
;
3323 if (in_deps_amount
> 0) {
3324 in_deps
= (COIEVENT
*) malloc(sizeof(COIEVENT
) * in_deps_amount
);
3325 if (in_deps
== NULL
)
3326 LIBOFFLOAD_ERROR(c_malloc
);
3327 memcpy(in_deps
, m_in_deps
,in_deps_amount
* sizeof(COIEVENT
));
3330 switch (m_vars
[i
].type
.dst
) {
3331 case c_data_ptr_array
:
3336 if (m_vars
[i
].direction
.in
&&
3337 m_vars
[i
].flags
.is_static_dstn
) {
3340 (should_use_async_buffer_write
&&
3341 m_vars
[i
].size
>= __offload_use_async_buffer_write
)) ?
3342 &m_in_deps
[m_in_deps_total
++] : 0;
3343 PtrData
* dst_data
= m_vars
[i
].into
?
3344 m_vars_extra
[i
].dst_data
:
3345 m_vars_extra
[i
].src_data
;
3347 VAR_TYPE_IS_PTR(m_vars
[i
].type
.src
) ||
3348 VAR_TYPE_IS_SCALAR(m_vars
[i
].type
.src
) &&
3349 m_vars
[i
].flags
.is_static
?
3350 m_vars_extra
[i
].src_data
: 0;
3352 if (m_vars
[i
].flags
.is_noncont_src
||
3353 m_vars
[i
].flags
.is_noncont_dst
) {
3354 if (!send_noncontiguous_pointer_data(
3355 i
, src_data
, dst_data
, event
, sent_data
,
3356 in_deps_amount
, in_deps
)) {
3360 else if (src_data
!= 0 && src_data
->cpu_buf
!= 0) {
3361 res
= COI::BufferCopy(
3364 m_vars
[i
].mic_offset
+
3365 m_vars
[i
].offset
+ m_vars
[i
].disp
,
3366 m_vars_extra
[i
].cpu_offset
+
3367 m_vars_extra
[i
].cpu_disp
,
3369 COI_COPY_UNSPECIFIED
,
3370 in_deps_amount
, in_deps
,
3372 if (res
!= COI_SUCCESS
) {
3373 if (m_status
!= 0) {
3374 m_status
->result
= translate_coi_error(res
);
3377 report_coi_error(c_buf_copy
, res
);
3381 char *base
= offload_get_src_base(m_vars
[i
].ptr
,
3382 m_vars
[i
].type
.src
);
3383 res
= COI::BufferWrite(
3385 m_vars
[i
].mic_offset
+
3386 m_vars
[i
].offset
+ m_vars
[i
].disp
,
3387 base
+ m_vars_extra
[i
].cpu_disp
,
3389 COI_COPY_UNSPECIFIED
,
3390 in_deps_amount
, in_deps
,
3392 if (res
!= COI_SUCCESS
) {
3393 if (m_status
!= 0) {
3394 m_status
->result
= translate_coi_error(res
);
3397 report_coi_error(c_buf_write
, res
);
3400 ptr_sent
+= sent_data
;
3406 case c_cean_var_ptr
:
3408 if (m_vars
[i
].direction
.in
&& m_vars
[i
].size
> 0) {
3411 (should_use_async_buffer_write
&&
3412 m_vars
[i
].size
>= __offload_use_async_buffer_write
)) ?
3413 &m_in_deps
[m_in_deps_total
++] : 0;
3414 PtrData
* dst_data
= m_vars
[i
].into
?
3415 m_vars_extra
[i
].dst_data
:
3416 m_vars_extra
[i
].src_data
;
3418 VAR_TYPE_IS_PTR(m_vars
[i
].type
.src
) ||
3419 VAR_TYPE_IS_SCALAR(m_vars
[i
].type
.src
) &&
3420 m_vars
[i
].flags
.is_static
?
3421 m_vars_extra
[i
].src_data
: 0;
3423 if (m_vars
[i
].flags
.is_noncont_src
||
3424 m_vars
[i
].flags
.is_noncont_dst
) {
3425 send_noncontiguous_pointer_data(
3426 i
, src_data
, dst_data
, event
, sent_data
,
3427 in_deps_amount
, in_deps
);
3429 else if (src_data
!= 0 && src_data
->cpu_buf
!= 0) {
3430 res
= COI::BufferCopy(
3433 m_vars
[i
].mic_offset
+
3434 m_vars
[i
].offset
+ m_vars
[i
].disp
,
3435 m_vars_extra
[i
].cpu_offset
+
3436 m_vars_extra
[i
].cpu_disp
,
3438 COI_COPY_UNSPECIFIED
,
3439 in_deps_amount
, in_deps
,
3441 if (res
!= COI_SUCCESS
) {
3442 if (m_status
!= 0) {
3443 m_status
->result
= translate_coi_error(res
);
3446 report_coi_error(c_buf_copy
, res
);
3450 char *base
= offload_get_src_base(m_vars
[i
].ptr
,
3451 m_vars
[i
].type
.src
);
3452 res
= COI::BufferWrite(
3454 m_vars
[i
].mic_offset
+
3455 m_vars
[i
].offset
+ m_vars
[i
].disp
,
3456 base
+ m_vars_extra
[i
].cpu_disp
,
3458 COI_COPY_UNSPECIFIED
,
3459 in_deps_amount
, in_deps
,
3461 if (res
!= COI_SUCCESS
) {
3462 if (m_status
!= 0) {
3463 m_status
->result
= translate_coi_error(res
);
3466 report_coi_error(c_buf_write
, res
);
3470 ptr_sent
+= sent_data
;
3476 if (m_vars
[i
].direction
.in
&&
3477 m_vars
[i
].size
> 0) {
3478 PtrData
*ptr_data
= m_vars
[i
].into
?
3479 m_vars_extra
[i
].dst_data
:
3480 m_vars_extra
[i
].src_data
;
3481 PtrData
* src_data
= m_vars_extra
[i
].src_data
;
3485 (should_use_async_buffer_write
&&
3486 m_vars
[i
].size
>= __offload_use_async_buffer_write
)) ?
3487 &m_in_deps
[m_in_deps_total
++] : 0;
3489 if (m_vars
[i
].flags
.is_noncont_src
||
3490 m_vars
[i
].flags
.is_noncont_dst
) {
3491 send_noncontiguous_pointer_data(
3492 i
, src_data
, ptr_data
, event
, sent_data
,
3493 in_deps_amount
, in_deps
);
3495 else if (src_data
&& src_data
->cpu_buf
!= 0) {
3496 res
= COI::BufferCopy(
3499 m_vars
[i
].offset
+ ptr_data
->mic_offset
+
3501 m_vars_extra
[i
].cpu_offset
+
3502 m_vars_extra
[i
].cpu_disp
,
3504 COI_COPY_UNSPECIFIED
,
3505 in_deps_amount
, in_deps
,
3507 if (res
!= COI_SUCCESS
) {
3508 if (m_status
!= 0) {
3509 m_status
->result
= translate_coi_error(res
);
3512 report_coi_error(c_buf_copy
, res
);
3516 char *base
= offload_get_src_base(m_vars
[i
].ptr
,
3517 m_vars
[i
].type
.src
);
3518 res
= COI::BufferWrite(
3520 ptr_data
->mic_offset
+
3521 m_vars
[i
].offset
+ m_vars
[i
].disp
,
3522 base
+ m_vars_extra
[i
].cpu_disp
,
3524 COI_COPY_UNSPECIFIED
,
3525 in_deps_amount
, in_deps
,
3527 if (res
!= COI_SUCCESS
) {
3528 if (m_status
!= 0) {
3529 m_status
->result
= translate_coi_error(res
);
3532 report_coi_error(c_buf_write
, res
);
3535 ptr_sent
+= sent_data
;
3539 case c_dv_data_slice
:
3540 case c_dv_ptr_data_slice
:
3541 if (m_vars
[i
].direction
.in
&&
3542 m_vars
[i
].size
> 0) {
3543 PtrData
*dst_data
= m_vars
[i
].into
?
3544 m_vars_extra
[i
].dst_data
:
3545 m_vars_extra
[i
].src_data
;
3547 (VAR_TYPE_IS_PTR(m_vars
[i
].type
.src
) ||
3548 VAR_TYPE_IS_DV_DATA(m_vars
[i
].type
.src
) ||
3549 VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.src
) ||
3550 VAR_TYPE_IS_SCALAR(m_vars
[i
].type
.src
) &&
3551 m_vars
[i
].flags
.is_static
) ?
3552 m_vars_extra
[i
].src_data
: 0;
3555 (should_use_async_buffer_write
&&
3556 m_vars
[i
].size
>= __offload_use_async_buffer_write
)) ?
3557 &m_in_deps
[m_in_deps_total
++] : 0;
3558 if (m_vars
[i
].flags
.is_noncont_src
||
3559 m_vars
[i
].flags
.is_noncont_dst
) {
3560 send_noncontiguous_pointer_data(
3561 i
, src_data
, dst_data
, event
, sent_data
,
3562 in_deps_amount
, in_deps
);
3564 else if (src_data
&& src_data
->cpu_buf
!= 0) {
3565 res
= COI::BufferCopy(
3569 dst_data
->mic_offset
+
3571 m_vars_extra
[i
].cpu_offset
+
3572 m_vars_extra
[i
].cpu_disp
,
3574 COI_COPY_UNSPECIFIED
,
3575 in_deps_amount
, in_deps
,
3577 if (res
!= COI_SUCCESS
) {
3578 if (m_status
!= 0) {
3579 m_status
->result
= translate_coi_error(res
);
3582 report_coi_error(c_buf_copy
, res
);
3586 char *base
= offload_get_src_base(m_vars
[i
].ptr
,
3587 m_vars
[i
].type
.src
);
3588 res
= COI::BufferWrite(
3590 dst_data
->mic_offset
+
3591 m_vars
[i
].offset
+ m_vars
[i
].disp
,
3592 base
+ m_vars_extra
[i
].cpu_disp
,
3594 COI_COPY_UNSPECIFIED
,
3595 in_deps_amount
, in_deps
,
3597 if (res
!= COI_SUCCESS
) {
3598 if (m_status
!= 0) {
3599 m_status
->result
= translate_coi_error(res
);
3602 report_coi_error(c_buf_write
, res
);
3606 ptr_sent
+= sent_data
;
3613 if (m_vars_extra
[i
].omp_last_event_type
== c_last_write
) {
3614 in_deps_amount
= in_deps_amount_save
;
3615 in_deps
= in_deps_save
;
3616 register_omp_event_call_back(&m_in_deps
[m_in_deps_total
- 1], info
);
3618 // alloc field isn't used at target.
3619 // We can reuse it for offset of array pointers.
3620 if (m_vars_extra
[i
].is_arr_ptr_el
) {
3621 m_vars
[i
].ptr_arr_offset
= m_vars_extra
[i
].ptr_arr_offset
;
3626 m_status
->data_sent
+= ptr_sent
;
3629 OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), ptr_sent
);
3630 OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
3631 c_offload_sent_pointer_data
,
3632 "Total pointer data sent to target: [%lld] bytes\n",
3638 bool OffloadDescriptor::gather_copyin_data()
3640 OffloadTimer
timer(get_timer_data(), c_offload_host_gather_inputs
);
3642 if (m_need_runfunction
&& m_in_datalen
> 0) {
3643 COIMAPINSTANCE map_inst
;
3647 if (m_inout_buf
!= 0) {
3648 OffloadTimer
timer_map(get_timer_data(),
3649 c_offload_host_map_in_data_buffer
);
3651 COIRESULT res
= COI::BufferMap(m_inout_buf
, 0, m_in_datalen
,
3652 COI_MAP_WRITE_ENTIRE_BUFFER
,
3654 reinterpret_cast<void**>(&data
));
3655 if (res
!= COI_SUCCESS
) {
3656 if (m_status
!= 0) {
3657 m_status
->result
= translate_coi_error(res
);
3660 report_coi_error(c_buf_map
, res
);
3664 data
= (char*) m_func_desc
+ m_func_desc
->data_offset
;
3667 // send variable descriptors
3668 memcpy(data
, m_vars
, m_vars_total
* sizeof(VarDesc
));
3669 data
+= m_vars_total
* sizeof(VarDesc
);
3672 m_in
.init_buffer(data
, m_in_datalen
);
3674 // Gather copy data into buffer
3675 for (int i
= 0; i
< m_vars_total
; i
++) {
3676 bool src_is_for_mic
= (m_vars
[i
].direction
.out
||
3677 m_vars
[i
].into
== NULL
);
3678 PtrData
* ptr_data
= src_is_for_mic
?
3679 m_vars_extra
[i
].src_data
:
3680 m_vars_extra
[i
].dst_data
;
3681 if (m_vars
[i
].flags
.alloc_disp
) {
3682 m_in
.send_data(&ptr_data
->alloc_disp
,
3683 sizeof(ptr_data
->alloc_disp
));
3686 // send sink address to the target
3687 if (m_vars
[i
].flags
.sink_addr
) {
3688 m_in
.send_data(&ptr_data
->mic_addr
,
3689 sizeof(ptr_data
->mic_addr
));
3692 switch (m_vars
[i
].type
.dst
) {
3693 case c_data_ptr_array
:
3698 if (m_vars
[i
].direction
.in
&&
3699 !m_vars
[i
].flags
.is_static_dstn
) {
3701 char *ptr
= offload_get_src_base(m_vars
[i
].ptr
,
3702 m_vars
[i
].type
.src
);
3703 if (m_vars
[i
].type
.dst
== c_cean_var
) {
3704 // offset and length are derived from the array
3706 int64_t size
= m_vars
[i
].size
;
3707 int64_t disp
= m_vars
[i
].disp
;
3708 m_in
.send_data(reinterpret_cast<char*>(&size
),
3710 m_in
.send_data(reinterpret_cast<char*>(&disp
),
3714 m_in
.send_data(ptr
+ m_vars_extra
[i
].cpu_disp
,
3720 if (m_vars
[i
].direction
.bits
||
3721 m_vars
[i
].alloc_if
||
3722 m_vars
[i
].free_if
) {
3723 // send dope vector excluding base
3724 char *ptr
= static_cast<char*>(m_vars
[i
].ptr
);
3725 m_in
.send_data(ptr
+ sizeof(uint64_t),
3726 m_vars
[i
].size
- sizeof(uint64_t));
3731 // send to target addresses of obsolete
3732 // stacks to be released
3733 if (m_vars
[i
].flags
.is_stack_buf
&&
3734 !m_vars
[i
].direction
.bits
&&
3735 m_vars
[i
].alloc_if
&&
3736 m_vars
[i
].size
!= 0) {
3737 for (PtrDataList::iterator it
=
3738 m_destroy_stack
.begin();
3739 it
!= m_destroy_stack
.end(); it
++) {
3740 PtrData
* ptr_data
= *it
;
3741 m_in
.send_data(&(ptr_data
->mic_addr
),
3742 sizeof(ptr_data
->mic_addr
));
3747 if (m_vars
[i
].direction
.in
) {
3748 m_in
.send_func_ptr(*((const void**) m_vars
[i
].ptr
));
3758 m_status
->data_sent
+= m_in
.get_tfr_size();
3761 if (m_func_desc
->data_offset
== 0) {
3762 OffloadTimer
timer_unmap(get_timer_data(),
3763 c_offload_host_unmap_in_data_buffer
);
3764 COIRESULT res
= COI::BufferUnmap(map_inst
, 0, 0, 0);
3765 if (res
!= COI_SUCCESS
) {
3766 if (m_status
!= 0) {
3767 m_status
->result
= translate_coi_error(res
);
3770 report_coi_error(c_buf_unmap
, res
);
3775 OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), m_in
.get_tfr_size());
3776 OFFLOAD_DEBUG_TRACE_1(1,
3777 GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_copyin_data
,
3778 "Total copyin data sent to target: [%lld] bytes\n",
3779 m_in
.get_tfr_size());
// Launches the offloaded run-function on the device when one is needed
// (m_need_runfunction). Packs the function descriptor (plus inlined input
// data when data_offset != 0) as the 'misc' argument, points the return
// area at the copyout region, wires stream dependencies when there are no
// explicit input events, and records the completion event as the single
// input dependency for subsequent transfers.
// NOTE(review): extraction artifact -- several original lines are missing
// (e.g. 3785, 3787, 3791, 3794-3796, 3799, 3803-3808, 3811, 3814-3815,
// 3817-3823, the declarations of 'ret'/'ret_len'/'event', and the
// trailing return); comments annotate only the visible statements.
3784 bool OffloadDescriptor::compute(void *info
)
3786 OffloadTimer
timer(get_timer_data(), c_offload_host_start_compute
);
3788 if (m_need_runfunction
) {
3789 OFFLOAD_DEBUG_TRACE_1(2, GET_OFFLOAD_NUMBER(get_timer_data()),
3790 c_offload_compute
, "Compute task on MIC\n");
// 'misc' is the blob handed to the device-side run function; it starts
// as the function descriptor itself.
3792 void* misc
= m_func_desc
;
3793 int misc_len
= m_func_desc_size
;
// When copyin data was appended after the descriptor (data_offset != 0),
// extend the misc blob to carry it along.
3797 if (m_func_desc
->data_offset
!= 0) {
3798 misc_len
+= m_in_datalen
;
// Point the return area at the copyout region inside the descriptor.
3800 if (m_out_datalen
> 0) {
3801 ret
= (char*) m_func_desc
+ m_func_desc
->data_offset
;
3802 ret_len
= m_out_datalen
;
// Input dependencies for the pipeline run: the accumulated send events,
// or -- when none exist and a stream is in use -- the stream's own
// dependency chain.
3809 uint32_t in_deps_amount
= m_in_deps_total
;
3810 COIEVENT
*in_deps
= m_in_deps_total
> 0 ? m_in_deps
: 0;
3812 if (0 == m_in_deps_total
&& m_stream
!= no_stream
) {
3813 get_stream_in_dependencies(in_deps_amount
, in_deps
);
// Enqueue the run-function on the device pipeline for this stream.
// NOTE(review): the remaining compute() arguments (original 3817-3823)
// are not visible here.
3816 res
= m_device
.compute(m_stream
,
3824 if (res
!= COI_SUCCESS
) {
3825 if (m_status
!= 0) {
3826 m_status
->result
= translate_coi_error(res
);
3829 report_coi_error(c_pipeline_run_func
, res
);
// OpenMP async: if the run-function is the last event of the construct,
// attach the completion callback to its event.
3832 if (m_omp_async_last_event_type
== c_last_runfunc
) {
3833 register_omp_event_call_back(&event
, info
);
// From here on, later transfers depend only on the compute event.
3836 m_in_deps_total
= 1;
3837 m_in_deps
[0] = event
;
3843 // receive pointer data if source or destination or both of them are
3844 // noncontiguous. There is guarantee that length of destination enough for
3845 // transferred data.
3846 bool OffloadDescriptor::receive_noncontiguous_pointer_data(
3850 uint64_t &received_data
,
3851 uint32_t in_deps_amount
,
3855 int64_t offset_src
, offset_dst
;
3856 int64_t length_src
, length_dst
;
3857 int64_t length_src_cur
, length_dst_cur
;
3858 int64_t receive_size
;
3860 bool dst_is_empty
= true;
3861 bool src_is_empty
= true;
3863 char *base
= offload_get_src_base(
3865 static_cast<char*>(m_vars
[i
].into
) :
3866 static_cast<char*>(m_vars
[i
].ptr
),
3867 m_vars
[i
].type
.dst
);
3870 // Set length_src and length_dst
3871 length_src
= (m_vars_extra
[i
].read_rng_src
) ?
3872 m_vars_extra
[i
].read_rng_src
->range_size
: m_vars
[i
].size
;
3873 length_dst
= !m_vars
[i
].into
? length_src
:
3874 (m_vars_extra
[i
].read_rng_dst
) ?
3875 m_vars_extra
[i
].read_rng_dst
->range_size
: m_vars
[i
].size
;
3876 receive_size
= (length_src
< length_dst
) ? length_src
: length_dst
;
3878 // If BufferReadMultiD is defined we can set values of required arguments
3879 // and transfer noncontiguous data via call to the COI routine.
3880 if (__offload_use_coi_noncontiguous_transfer
&& COI::BufferReadMultiD
) {
3881 struct Arr_Desc
* arr_desc_dst
;
3882 struct Arr_Desc
* arr_desc_src
;
3883 int64_t size_src
, size_dst
;
3885 offset_src
= (m_vars_extra
[i
].read_rng_src
)?
3886 m_vars_extra
[i
].read_rng_src
->init_offset
: m_vars
[i
].disp
;
3887 size_src
= m_vars_extra
[i
].read_rng_src
?
3888 cean_get_transf_size(m_vars_extra
[i
].read_rng_src
) :
3891 offset_dst
= (m_vars_extra
[i
].read_rng_dst
)?
3892 m_vars_extra
[i
].read_rng_dst
->init_offset
: m_vars_extra
[i
].cpu_disp
;
3893 size_dst
= m_vars_extra
[i
].read_rng_dst
?
3894 cean_get_transf_size(m_vars_extra
[i
].read_rng_dst
) : m_vars
[i
].size
;
3896 int64_t el_size
= (!m_vars
[i
].into
||
3897 (m_vars_extra
[i
].read_rng_src
&&
3898 m_vars_extra
[i
].read_rng_dst
)) ?
3900 m_vars_extra
[i
].read_rng_src
?
3901 m_vars_extra
[i
].read_rng_src
->arr_desc
->dim
[
3902 m_vars_extra
[i
].read_rng_src
->arr_desc
->rank
- 1].size
:
3903 m_vars_extra
[i
].read_rng_dst
->arr_desc
->dim
[
3904 m_vars_extra
[i
].read_rng_dst
->arr_desc
->rank
- 1].size
;
3905 arr_desc_src
= (m_vars_extra
[i
].read_rng_src
) ?
3906 m_vars_extra
[i
].read_rng_src
->arr_desc
:
3907 make_arr_desc(NULL
, // don't required for source
3908 offset_src
/el_size
, size_src
/el_size
,
3910 arr_desc_dst
= !m_vars
[i
].into
? arr_desc_src
:
3911 (m_vars_extra
[i
].read_rng_dst
) ?
3912 m_vars_extra
[i
].read_rng_dst
->arr_desc
:
3914 offset_dst
/el_size
, size_src
/el_size
, el_size
);
3916 arr_desc_dst
->base
= reinterpret_cast<int64_t>(base
);
3918 res
= COI::BufferReadMultiD(
3919 m_vars_extra
[i
].src_data
->mic_buf
, // SourceBuffer
3920 m_vars
[i
].offset
+ m_vars
[i
].mic_offset
-
3921 m_vars_extra
[i
].src_data
->alloc_disp
, // Offset
3922 (void*)arr_desc_dst
, // descriptor of DestArray
3923 (void*)arr_desc_src
, // descriptor of SrcArray
3924 COI_COPY_UNSPECIFIED
, // Type
3925 in_deps_amount
, // Number of in Dependencies
3926 in_deps
, // array of in Dependencies
3927 event
); // out Dependency
3928 if (res
!= COI_SUCCESS
) {
3929 if (m_status
!= 0) {
3930 m_status
->result
= translate_coi_error(res
);
3933 report_coi_error(c_buf_copy
, res
);
3937 // if event is defined we must multiplate for all contiguous intervals
3938 // that will be Copied/Read.
3939 // Take in account that we already have 1 event.
3941 m_out_deps_allocated
+= (length_src
/ receive_size
) *
3942 ((m_vars_extra
[i
].read_rng_src
) ?
3943 m_vars_extra
[i
].read_rng_src
->range_max_number
: 1) ;
3945 (COIEVENT
*)realloc(m_out_deps
, sizeof(COIEVENT
) * m_out_deps_allocated
);
3949 // consequently get contiguous ranges,
3950 // define corresponded destination offset and receive data
3954 if (m_vars_extra
[i
].read_rng_src
) {
3955 if (!get_next_range(m_vars_extra
[i
].read_rng_src
,
3957 // source ranges are over - nothing to send
3961 else if (received_data
== 0) {
3962 offset_src
= m_vars
[i
].disp
;
3967 length_src_cur
= length_src
;
3970 // if source is contiguous or its contiguous range is greater
3971 // than destination one
3972 offset_src
+= receive_size
;
3974 length_src_cur
-= receive_size
;
3975 src_is_empty
= length_src_cur
== 0;
3977 // get destination offset
3979 if (m_vars
[i
].into
) {
3980 if (m_vars_extra
[i
].read_rng_dst
) {
3981 if (!get_next_range(m_vars_extra
[i
].read_rng_dst
,
3983 // destination ranges are over
3984 LIBOFFLOAD_ERROR(c_destination_is_over
);
3988 // destination is contiguous.
3990 offset_dst
= m_vars_extra
[i
].cpu_disp
;
3992 length_dst_cur
= length_dst
;
3996 offset_dst
= offset_src
;
3997 length_dst_cur
= length_src
;
4001 // if destination is contiguous or its contiguous range is greater
4003 offset_dst
+= receive_size
;
4005 length_dst_cur
-= receive_size
;
4006 dst_is_empty
= length_dst_cur
== 0;
4008 event
= &m_out_deps
[m_out_deps_total
++];
4011 res
= COI::BufferCopy(
4013 m_vars_extra
[i
].src_data
->mic_buf
,
4014 m_vars_extra
[i
].cpu_offset
+ offset_dst
,
4015 m_vars
[i
].offset
+ offset_src
+
4016 m_vars
[i
].mic_offset
,
4018 COI_COPY_UNSPECIFIED
,
4022 if (res
!= COI_SUCCESS
) {
4023 if (m_status
!= 0) {
4024 m_status
->result
= translate_coi_error(res
);
4027 report_coi_error(c_buf_copy
, res
);
4031 res
= COI::BufferRead(
4032 m_vars_extra
[i
].src_data
->mic_buf
,
4033 m_vars
[i
].offset
+ offset_src
+
4034 m_vars
[i
].mic_offset
,
4037 COI_COPY_UNSPECIFIED
,
4041 if (res
!= COI_SUCCESS
) {
4042 if (m_status
!= 0) {
4043 m_status
->result
= translate_coi_error(res
);
4046 report_coi_error(c_buf_read
, res
);
4049 received_data
+= receive_size
;
4055 bool OffloadDescriptor::receive_pointer_data(bool is_async
,
4056 bool first_run
, void *info
)
4058 OffloadTimer
timer(get_timer_data(), c_offload_host_start_buffers_reads
);
4060 bool should_use_async_buffer_read
= m_initial_need_runfunction
;
4061 uint64_t ptr_received
= 0;
4064 // For offload_transfer and offload with empty body without signal:
4065 // - if there is only one buffer copy - get data synchronously
4066 // - if there are multiple buffer copy and
4067 // __offload_parallel_copy is false - get data synchronously
4068 // - if there are multiple buffer copy
4069 // and __offload_parallel_copy is true - get data asynchronously
4070 // It concerns only data with size greater than __offload_use_async_buffer_read.
4071 // Data of size less than __offload_use_async_buffer_read are received synchronously.
4072 // Synchronous transfer results in better performance in COI.
4073 // __offload_parallel_copy is false by default but can be changed
4074 // via environment variable OFFLOAD_PARALLEL_COPY
4075 if (!m_initial_need_runfunction
&& __offload_parallel_copy
) {
4076 int big_size_count
= 0;
4078 for (int i
= 0; i
< m_vars_total
; i
++) {
4079 if (m_vars
[i
].direction
.out
&&
4080 m_vars
[i
].size
>= __offload_use_async_buffer_read
) {
4081 // preallocated OUT only at second run
4082 if (first_run
== m_vars
[i
].flags
.preallocated
) {
4085 switch (m_vars
[i
].type
.src
) {
4089 if (m_vars
[i
].flags
.is_static
) {
4095 case c_cean_var_ptr
:
4098 case c_dv_data_slice
:
4099 case c_dv_ptr_data_slice
:
4108 if (big_size_count
> 1) {
4109 should_use_async_buffer_read
= true;
4112 uint32_t in_deps_amount
= m_in_deps_total
;
4113 COIEVENT
*in_deps
= m_in_deps_total
> 0 ? m_in_deps
: 0;
4115 if (0 == m_in_deps_total
&&
4116 m_stream
!= no_stream
&&
4117 m_vars_total
!= 0) {
4118 get_stream_in_dependencies(in_deps_amount
, in_deps
);
4121 for (int i
= 0; i
< m_vars_total
; i
++) {
4122 uint64_t received_data
= m_vars
[i
].size
;
4123 uint32_t in_deps_amount_save
;
4124 COIEVENT
*in_deps_save
;
4126 if (m_vars_extra
[i
].omp_last_event_type
== c_last_read
) {
4127 in_deps_amount_save
= in_deps_amount
;
4128 in_deps_save
= in_deps
;
4130 in_deps_amount
+= m_out_deps_total
;
4131 if (in_deps_amount
> 0) {
4132 in_deps
= (COIEVENT
*) malloc(sizeof(COIEVENT
) * in_deps_amount
);
4133 if (in_deps
== NULL
)
4134 LIBOFFLOAD_ERROR(c_malloc
);
4135 memcpy(in_deps
, in_deps_save
,
4136 in_deps_amount_save
* sizeof(COIEVENT
));
4137 memcpy(in_deps
+ in_deps_amount_save
* sizeof(COIEVENT
),
4139 m_out_deps_total
* sizeof(COIEVENT
));
4142 // At first run don't receive by preallocated target pointer as the
4143 //pointer value will be ready later after call to scatter_copyout_data
4144 if (first_run
&& m_vars
[i
].alloc_if
&& m_vars
[i
].flags
.preallocated
) {
4145 m_preallocated_alloc
= true;
4146 // need one more call to OffloadDescriptor::receive_pointer_data
4147 if (m_vars
[i
].direction
.out
) {
4148 m_out_with_preallocated
= true;
4152 switch (m_vars
[i
].type
.src
) {
4153 case c_data_ptr_array
:
4158 if (m_vars
[i
].direction
.out
&&
4159 m_vars
[i
].flags
.is_static
) {
4162 m_in_deps_total
> 0 ||
4163 (should_use_async_buffer_read
&&
4164 m_vars
[i
].size
>= __offload_use_async_buffer_read
)) ?
4165 &m_out_deps
[m_out_deps_total
++] : 0;
4166 PtrData
*ptr_data
= NULL
;
4167 COIBUFFER dst_buf
= NULL
; // buffer at host
4170 if (VAR_TYPE_IS_PTR(m_vars
[i
].type
.dst
)) {
4171 ptr_data
= m_vars
[i
].into
?
4172 m_vars_extra
[i
].dst_data
:
4173 m_vars_extra
[i
].src_data
;
4175 else if (VAR_TYPE_IS_SCALAR(m_vars
[i
].type
.dst
)) {
4176 if (m_vars
[i
].flags
.is_static_dstn
) {
4177 ptr_data
= m_vars
[i
].into
?
4178 m_vars_extra
[i
].dst_data
:
4179 m_vars_extra
[i
].src_data
;
4182 dst_buf
= ptr_data
? ptr_data
->cpu_buf
: NULL
;
4183 if (dst_buf
== NULL
) {
4184 base
= offload_get_src_base(
4186 static_cast<char*>(m_vars
[i
].into
) :
4187 static_cast<char*>(m_vars
[i
].ptr
),
4188 m_vars
[i
].type
.dst
);
4191 if (m_vars
[i
].flags
.is_noncont_src
||
4192 m_vars
[i
].flags
.is_noncont_dst
) {
4193 receive_noncontiguous_pointer_data(
4194 i
, dst_buf
, event
, received_data
,
4195 in_deps_amount
, in_deps
);
4197 else if (dst_buf
!= 0) {
4198 res
= COI::BufferCopy(
4200 m_vars_extra
[i
].src_data
->mic_buf
,
4201 m_vars_extra
[i
].cpu_offset
+
4202 m_vars_extra
[i
].cpu_disp
,
4203 m_vars
[i
].offset
+ m_vars
[i
].disp
,
4205 COI_COPY_UNSPECIFIED
,
4209 if (res
!= COI_SUCCESS
) {
4210 if (m_status
!= 0) {
4211 m_status
->result
= translate_coi_error(res
);
4214 report_coi_error(c_buf_copy
, res
);
4218 res
= COI::BufferRead(
4219 m_vars_extra
[i
].src_data
->mic_buf
,
4220 m_vars
[i
].offset
+ m_vars
[i
].disp
,
4221 base
+ m_vars_extra
[i
].cpu_offset
+
4222 m_vars_extra
[i
].cpu_disp
,
4224 COI_COPY_UNSPECIFIED
,
4228 if (res
!= COI_SUCCESS
) {
4229 if (m_status
!= 0) {
4230 m_status
->result
= translate_coi_error(res
);
4233 report_coi_error(c_buf_read
, res
);
4236 ptr_received
+= received_data
;
4242 case c_cean_var_ptr
:
4245 case c_dv_data_slice
:
4246 case c_dv_ptr_data_slice
:
4248 COIBUFFER dst_buf
= NULL
; // buffer on host
4249 if (m_vars
[i
].direction
.out
&& m_vars
[i
].size
> 0) {
4252 m_in_deps_total
> 0 ||
4253 (should_use_async_buffer_read
&&
4254 m_vars
[i
].size
>= __offload_use_async_buffer_read
)) ?
4255 &m_out_deps
[m_out_deps_total
++] : 0;
4257 uint64_t dst_offset
= 0;
4258 char *base
= static_cast<char*>(m_vars
[i
].ptr
);
4260 if (VAR_TYPE_IS_PTR(m_vars
[i
].type
.dst
)) {
4261 PtrData
*ptr_data
= m_vars
[i
].into
?
4262 m_vars_extra
[i
].dst_data
:
4263 m_vars_extra
[i
].src_data
;
4264 dst_buf
= ptr_data
? ptr_data
->cpu_buf
: NULL
;
4265 if (dst_buf
== NULL
) {
4266 base
= m_vars
[i
].into
?
4267 *static_cast<char**>(m_vars
[i
].into
) :
4268 *static_cast<char**>(m_vars
[i
].ptr
);
4270 dst_offset
= m_vars_extra
[i
].cpu_offset
+
4271 m_vars_extra
[i
].cpu_disp
;
4273 else if (VAR_TYPE_IS_SCALAR(m_vars
[i
].type
.dst
)) {
4274 if (m_vars
[i
].flags
.is_static_dstn
) {
4275 dst_buf
= m_vars
[i
].into
?
4276 m_vars_extra
[i
].dst_data
->cpu_buf
:
4277 m_vars_extra
[i
].src_data
->cpu_buf
;
4279 if (dst_buf
== NULL
) {
4280 base
= offload_get_src_base(
4282 static_cast<char*>(m_vars
[i
].into
) :
4283 static_cast<char*>(m_vars
[i
].ptr
),
4284 m_vars
[i
].type
.dst
);
4286 dst_offset
= m_vars_extra
[i
].cpu_offset
+
4287 m_vars_extra
[i
].cpu_disp
;
4289 else if (VAR_TYPE_IS_DV_DATA(m_vars
[i
].type
.dst
) ||
4290 VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.dst
)) {
4291 PtrData
*ptr_data
= m_vars
[i
].into
!= 0 ?
4292 m_vars_extra
[i
].dst_data
:
4293 m_vars_extra
[i
].src_data
;
4294 dst_buf
= ptr_data
!= 0 ? ptr_data
->cpu_buf
: 0;
4295 if (dst_buf
== NULL
) {
4296 base
= offload_get_src_base(
4298 static_cast<char*>(m_vars
[i
].into
) :
4299 static_cast<char*>(m_vars
[i
].ptr
),
4300 m_vars
[i
].type
.dst
);
4303 dst_offset
= m_vars_extra
[i
].cpu_offset
+
4304 m_vars_extra
[i
].cpu_disp
;
4307 if (m_vars
[i
].flags
.is_noncont_src
||
4308 m_vars
[i
].flags
.is_noncont_dst
) {
4309 receive_noncontiguous_pointer_data(
4310 i
, dst_buf
, event
, received_data
,
4314 else if (dst_buf
!= 0) {
4315 res
= COI::BufferCopy(
4317 m_vars_extra
[i
].src_data
->mic_buf
,
4319 m_vars
[i
].offset
+ m_vars
[i
].disp
+
4320 m_vars
[i
].mic_offset
,
4322 COI_COPY_UNSPECIFIED
,
4326 if (res
!= COI_SUCCESS
) {
4327 if (m_status
!= 0) {
4328 m_status
->result
= translate_coi_error(res
);
4331 report_coi_error(c_buf_copy
, res
);
4335 res
= COI::BufferRead(
4336 m_vars_extra
[i
].src_data
->mic_buf
,
4337 m_vars
[i
].offset
+ m_vars
[i
].disp
+
4338 m_vars
[i
].mic_offset
,
4341 COI_COPY_UNSPECIFIED
,
4345 if (res
!= COI_SUCCESS
) {
4346 if (m_status
!= 0) {
4347 m_status
->result
= translate_coi_error(res
);
4350 report_coi_error(c_buf_read
, res
);
4353 ptr_received
+= received_data
;
4362 if (m_vars_extra
[i
].omp_last_event_type
== c_last_read
) {
4363 in_deps_amount
= in_deps_amount_save
;
4364 in_deps
= in_deps_save
;
4365 register_omp_event_call_back(&m_out_deps
[m_out_deps_total
- 1], info
);
4367 // destroy buffers for obsolete stacks
4368 if (m_destroy_stack
.size() != 0) {
4369 for (PtrDataList::iterator it
= m_destroy_stack
.begin();
4370 it
!= m_destroy_stack
.end(); it
++) {
4371 PtrData
*ptr_data
= *it
;
4372 m_destroy_buffers
.push_back(ptr_data
->mic_buf
);
4373 OFFLOAD_TRACE(3, "Removing stack buffer with addr %p\n",
4374 ptr_data
->mic_addr
);
4376 m_destroy_stack
.clear();
4378 if (m_vars
[i
].free_if
) {
4379 // remove association for automatic variables
4380 if (m_is_openmp
&& !m_vars
[i
].flags
.is_static
&&
4381 (m_vars
[i
].type
.src
== c_data
||
4382 m_vars
[i
].type
.src
== c_void_ptr
||
4383 m_vars
[i
].type
.src
== c_cean_var
)) {
4384 AutoData
*auto_data
= m_vars_extra
[i
].auto_data
;
4385 if (auto_data
!= 0) {
4386 if (m_vars
[i
].flags
.always_delete
) {
4387 auto_data
->nullify_reference();
4389 else if(auto_data
->remove_reference() == 0) {
4390 m_device
.remove_auto_data(auto_data
->cpu_addr
.start());
4396 if (m_vars
[i
].direction
.out
|| m_vars
[i
].into
== NULL
) {
4397 if (!VAR_TYPE_IS_PTR(m_vars
[i
].type
.src
) &&
4398 !VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.src
) &&
4399 !VAR_TYPE_IS_DV_DATA(m_vars
[i
].type
.src
)) {
4403 PtrData
*ptr_data
= m_vars_extra
[i
].src_data
;
4404 if (ptr_data
->remove_reference() == 0) {
4406 if (ptr_data
->cpu_buf
!= 0) {
4407 m_destroy_buffers
.push_back(ptr_data
->cpu_buf
);
4409 if (ptr_data
->mic_buf
!= 0) {
4410 m_destroy_buffers
.push_back(ptr_data
->mic_buf
);
4412 OFFLOAD_TRACE(3, "Removing association for addr %p\n",
4413 ptr_data
->cpu_addr
.start());
4415 // remove association from map
4416 if (m_vars
[i
].flags
.targetptr
) {
4417 m_device
.remove_targetptr_data(ptr_data
->cpu_addr
.start());
4420 m_device
.remove_ptr_data(ptr_data
->cpu_addr
.start());
4424 else if (VAR_TYPE_IS_PTR(m_vars
[i
].type
.dst
) ||
4425 VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.dst
) ||
4426 VAR_TYPE_IS_DV_DATA(m_vars
[i
].type
.dst
)) {
4427 PtrData
*ptr_data
= m_vars_extra
[i
].dst_data
;
4428 if (ptr_data
->remove_reference() == 0) {
4430 if (ptr_data
->cpu_buf
!= 0) {
4431 m_destroy_buffers
.push_back(ptr_data
->cpu_buf
);
4433 if (ptr_data
->mic_buf
!= 0) {
4434 m_destroy_buffers
.push_back(ptr_data
->mic_buf
);
4436 OFFLOAD_TRACE(3, "Removing association for addr %p\n",
4437 ptr_data
->cpu_addr
.start());
4439 // remove association from map
4440 if (m_vars
[i
].flags
.targetptr
) {
4441 m_device
.remove_targetptr_data(ptr_data
->cpu_addr
.start());
4444 m_device
.remove_ptr_data(ptr_data
->cpu_addr
.start());
4452 m_status
->data_received
+= ptr_received
;
4455 OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), ptr_received
);
4456 OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
4457 c_offload_received_pointer_data
,
4458 "Total pointer data received from target: [%lld] bytes\n",
4464 bool OffloadDescriptor::scatter_copyout_data()
4466 OffloadTimer
timer(get_timer_data(), c_offload_host_scatter_outputs
);
4468 if (m_need_runfunction
&& m_out_datalen
> 0) {
4470 // total size that need to be transferred from target to host
4471 COIMAPINSTANCE map_inst
;
4475 // output data buffer
4476 if (m_func_desc
->data_offset
== 0) {
4477 OffloadTimer
timer_map(get_timer_data(),
4478 c_offload_host_map_out_data_buffer
);
4480 COIRESULT res
= COI::BufferMap(m_inout_buf
, 0, m_out_datalen
,
4481 COI_MAP_READ_ONLY
, 0, 0, 0,
4483 reinterpret_cast<void**>(&data
));
4484 if (res
!= COI_SUCCESS
) {
4485 if (m_status
!= 0) {
4486 m_status
->result
= translate_coi_error(res
);
4489 report_coi_error(c_buf_map
, res
);
4493 data
= (char*) m_func_desc
+ m_func_desc
->data_offset
;
4497 OFFLOAD_TIMER_TARGET_DATA(get_timer_data(), data
);
4498 data
+= OFFLOAD_TIMER_DATALEN();
4500 // initialize output marshaller
4501 m_out
.init_buffer(data
, m_out_datalen
);
4503 for (int i
= 0; i
< m_vars_total
; i
++) {
4504 bool src_is_for_mic
= (m_vars
[i
].direction
.out
||
4505 m_vars
[i
].into
== NULL
);
4507 if (m_vars
[i
].type
.src
!= c_data_ptr_array
&&
4508 m_vars
[i
].flags
.preallocated
&& m_vars
[i
].alloc_if
) {
4511 void ** cpu_ptr
= src_is_for_mic
?
4512 reinterpret_cast<void**>(m_vars
[i
].ptr
) :
4513 reinterpret_cast<void**>(m_vars
[i
].into
);
4514 void* alloc_base
= NULL
;
4515 int64_t alloc_disp
= 0;
4517 if (m_vars_extra
[i
].alloc
!= NULL
) {
4519 const Arr_Desc
*ap
=
4520 static_cast<const Arr_Desc
*>(m_vars_extra
[i
].alloc
);
4522 __arr_data_offset_and_length(ap
, alloc_disp
, alloc_size
);
4524 alloc_base
= reinterpret_cast<void*>(ap
->base
);
4527 // get pointer to target memory
4528 m_out
.receive_data(&ptr_value
, sizeof(void*));
4531 if (!alloc_ptr_data(
4534 (alloc_base
!= NULL
) ?
4535 alloc_disp
: m_vars
[i
].disp
,
4536 (alloc_base
!= NULL
) ?
4537 alloc_size
: m_vars
[i
].size
,
4540 m_vars
[i
].flags
.targetptr
,
4541 m_vars
[i
].flags
.preallocated
,
4542 m_vars
[i
].flags
.pin
)) {
4546 ptr_data
->add_reference();
4547 *cpu_ptr
= ptr_value
;
4548 if (src_is_for_mic
) {
4549 m_vars_extra
[i
].src_data
= ptr_data
;
4552 m_vars_extra
[i
].dst_data
= ptr_data
;
4554 m_vars
[i
].offset
= (char*) ptr_value
-
4555 (char*) ptr_data
->cpu_addr
.start();
4558 switch (m_vars
[i
].type
.src
) {
4559 case c_data_ptr_array
:
4564 if (m_vars
[i
].direction
.out
&&
4565 !m_vars
[i
].flags
.is_static
) {
4567 if (m_vars
[i
].into
) {
4568 char *ptr
= offload_get_src_base(
4569 static_cast<char*>(m_vars
[i
].into
),
4570 m_vars
[i
].type
.dst
);
4571 m_out
.receive_data(ptr
+ m_vars_extra
[i
].cpu_disp
,
4576 static_cast<char*>(m_vars
[i
].ptr
) +
4577 m_vars_extra
[i
].cpu_disp
,
4584 if (m_vars
[i
].direction
.out
) {
4585 m_out
.receive_func_ptr((const void**) m_vars
[i
].ptr
);
4595 m_status
->data_received
+= m_out
.get_tfr_size();
4598 if (m_func_desc
->data_offset
== 0) {
4599 OffloadTimer
timer_unmap(get_timer_data(),
4600 c_offload_host_unmap_out_data_buffer
);
4602 COIRESULT res
= COI::BufferUnmap(map_inst
, 0, 0, 0);
4603 if (res
!= COI_SUCCESS
) {
4604 if (m_status
!= 0) {
4605 m_status
->result
= translate_coi_error(res
);
4608 report_coi_error(c_buf_unmap
, res
);
4613 OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), m_out
.get_tfr_size());
4614 OFFLOAD_TRACE(1, "Total copyout data received from target: [%lld] bytes\n",
4615 m_out
.get_tfr_size());
4620 static void get_arr_desc_numbers(
4626 CeanReadRanges
* &ptr_ranges
4629 if (is_arr_desc_contiguous(ap
)) {
4631 __arr_data_offset_and_length(ap
, offset
, size
);
4632 el_number
= size
/ el_size
;
4635 ptr_ranges
= init_read_ranges_arr_desc(ap
);
4636 el_number
= (ptr_ranges
->range_size
/ el_size
) *
4637 ptr_ranges
->range_max_number
;
4638 size
= ptr_ranges
->range_size
;
4642 bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i
)
4644 int pointers_number
;
4646 int new_index
= m_vars_total
;
4648 const VarDesc3
*vd3
= static_cast<const VarDesc3
*>(m_vars
[i
].ptr
);
4649 int flags
= vd3
->array_fields
;
4650 bool src_is_for_mic
= (m_vars
[i
].direction
.out
||
4651 m_vars
[i
].into
== NULL
);
4653 ReadArrElements
<void *> ptr
;
4654 ReadArrElements
<void *> into
;
4655 ReadArrElements
<int64_t> ext_start
;
4656 ReadArrElements
<int64_t> ext_elements
;
4657 ReadArrElements
<int64_t> align
;
4658 ReadArrElements
<int64_t> alloc_if
;
4659 ReadArrElements
<int64_t> free_if
;
4660 ReadArrElements
<int64_t> into_start
;
4661 ReadArrElements
<int64_t> into_elem
;
4662 ReadArrElements
<int64_t> alloc_start
;
4663 ReadArrElements
<int64_t> alloc_elem
;
4666 ap
= static_cast<const Arr_Desc
*>(vd3
->ptr_array
);
4668 // "pointers_number" for total number of transferred pointers.
4669 // For each of them we create new var_desc and put it at the bottom
4670 // of the var_desc's array
4671 get_arr_desc_numbers(ap
, sizeof(void *), ptr
.offset
, ptr
.size
,
4672 pointers_number
, ptr
.ranges
);
4673 ptr
.base
= (m_vars
[i
].flags
.is_pointer
) ?
4674 *(reinterpret_cast<char**>(ap
->base
)) :
4675 reinterpret_cast<char*>(ap
->base
);
4677 // 2. prepare memory for new var_descs
4678 m_vars_total
+= pointers_number
;
4679 m_vars
= (VarDesc
*)realloc(m_vars
, m_vars_total
* sizeof(VarDesc
));
4681 LIBOFFLOAD_ERROR(c_malloc
);
4683 (VarExtra
*)realloc(m_vars_extra
, m_vars_total
* sizeof(VarExtra
));
4684 if (m_vars_extra
== NULL
)
4685 LIBOFFLOAD_ERROR(c_malloc
);
4687 (COIEVENT
*)realloc(m_in_deps
, sizeof(COIEVENT
) * (m_vars_total
+ 1));
4688 if (m_in_deps
== NULL
)
4689 LIBOFFLOAD_ERROR(c_malloc
);
4691 (COIEVENT
*)realloc(m_out_deps
, sizeof(COIEVENT
) * m_vars_total
);
4692 if (m_out_deps
== NULL
)
4693 LIBOFFLOAD_ERROR(c_malloc
);
4695 // 3. Prepare for reading new var_desc's fields
4697 if ((flags
& (1<<flag_extent_start_is_array
)) != 0) {
4698 ap
= static_cast<const Arr_Desc
*>(vd3
->extent_start
);
4699 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, ext_start
.offset
,
4700 ext_start
.size
, tmp_val
, ext_start
.ranges
);
4701 ext_start
.base
= reinterpret_cast<char*>(ap
->base
);
4702 ext_start
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4704 if (tmp_val
< pointers_number
) {
4705 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "extent start");
4709 else if ((flags
& (1<<flag_extent_start_is_scalar
)) != 0) {
4710 ext_start
.val
= (int64_t)vd3
->extent_start
;
4716 // EXTENT ELEMENTS NUMBER
4717 if ((flags
& (1<<flag_extent_elements_is_array
)) != 0) {
4718 ap
= static_cast<const Arr_Desc
*>(vd3
->extent_elements
);
4719 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
,
4720 ext_elements
.offset
, ext_elements
.size
,
4721 tmp_val
, ext_elements
.ranges
);
4722 ext_elements
.base
= reinterpret_cast<char*>(ap
->base
);
4723 ext_elements
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4725 if (tmp_val
< pointers_number
) {
4726 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "extent elements");
4730 else if ((flags
& (1<<flag_extent_elements_is_scalar
)) != 0) {
4731 ext_elements
.val
= (int64_t)vd3
->extent_elements
;
4734 ext_elements
.val
= m_vars
[i
].count
;
4738 if ((flags
& (1<<flag_alloc_if_is_array
)) != 0) {
4739 ap
= static_cast<const Arr_Desc
*>(vd3
->alloc_if_array
);
4740 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, alloc_if
.offset
,
4741 alloc_if
.size
, tmp_val
, alloc_if
.ranges
);
4742 alloc_if
.base
= reinterpret_cast<char*>(ap
->base
);
4743 alloc_if
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4745 if (tmp_val
< pointers_number
) {
4746 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "alloc_if");
4751 alloc_if
.val
= m_vars
[i
].alloc_if
;
4755 if ((flags
& (1<<flag_free_if_is_array
)) != 0) {
4756 ap
= static_cast<const Arr_Desc
*>(vd3
->free_if_array
);
4757 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, free_if
.offset
,
4758 free_if
.size
, tmp_val
, free_if
.ranges
);
4759 free_if
.base
= reinterpret_cast<char*>(ap
->base
);
4760 free_if
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4762 if (tmp_val
< pointers_number
) {
4763 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "free_if");
4768 free_if
.val
= m_vars
[i
].free_if
;
4773 if ((flags
& (1<<flag_align_is_array
)) != 0) {
4774 ap
= static_cast<const Arr_Desc
*>(vd3
->align_array
);
4775 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, align
.offset
,
4776 align
.size
, tmp_val
, align
.ranges
);
4777 align
.base
= reinterpret_cast<char*>(ap
->base
);
4778 align
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4780 if (tmp_val
< pointers_number
) {
4781 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "align");
4786 align
.val
= m_vars
[i
].align
;
4791 if (m_vars
[i
].into
) {
4792 ap
= static_cast<const Arr_Desc
*>(m_vars
[i
].into
);
4793 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, into
.offset
,
4794 into
.size
, tmp_val
, into
.ranges
);
4795 into
.base
= reinterpret_cast<char*>(ap
->base
);
4797 if (tmp_val
< pointers_number
) {
4798 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "into");
4805 if ((flags
& (1<<flag_into_start_is_array
)) != 0) {
4806 ap
= static_cast<const Arr_Desc
*>(vd3
->into_start
);
4807 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, into_start
.offset
,
4808 into_start
.size
, tmp_val
, into_start
.ranges
);
4809 into_start
.base
= reinterpret_cast<char*>(ap
->base
);
4810 into_start
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4812 if (tmp_val
< pointers_number
) {
4813 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "into_extent start");
4817 else if ((flags
& (1<<flag_into_start_is_scalar
)) != 0) {
4818 into_start
.val
= (int64_t)vd3
->into_start
;
4824 // 3.3 INTO_ELEMENTS
4826 if ((flags
& (1<<flag_into_elements_is_array
)) != 0) {
4827 ap
= static_cast<const Arr_Desc
*>(vd3
->into_elements
);
4828 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, into_elem
.offset
,
4829 into_elem
.size
, tmp_val
, into_elem
.ranges
);
4830 into_elem
.base
= reinterpret_cast<char*>(ap
->base
);
4831 into_elem
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4833 if (tmp_val
< pointers_number
) {
4834 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "into_extent elements");
4838 else if ((flags
& (1<<flag_into_elements_is_scalar
)) != 0) {
4839 into_elem
.val
= (int64_t)vd3
->into_elements
;
4842 into_elem
.val
= m_vars
[i
].count
;
4847 if ((flags
& (1<<flag_alloc_start_is_array
)) != 0) {
4848 ap
= static_cast<const Arr_Desc
*>(vd3
->alloc_start
);
4849 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
,
4850 alloc_start
.offset
, alloc_start
.size
, tmp_val
,
4851 alloc_start
.ranges
);
4852 alloc_start
.base
= reinterpret_cast<char*>(ap
->base
);
4853 alloc_start
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4855 if (tmp_val
< pointers_number
) {
4856 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "alloc_extent start");
4860 else if ((flags
& (1<<flag_alloc_start_is_scalar
)) != 0) {
4861 alloc_start
.val
= (int64_t)vd3
->alloc_start
;
4864 alloc_start
.val
= 0;
4869 if ((flags
& (1<<flag_alloc_elements_is_array
)) != 0) {
4870 ap
= static_cast<const Arr_Desc
*>(vd3
->alloc_elements
);
4871 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, alloc_elem
.offset
,
4872 alloc_elem
.size
, tmp_val
, alloc_elem
.ranges
);
4873 alloc_elem
.base
= reinterpret_cast<char*>(ap
->base
);
4874 alloc_elem
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4875 if (tmp_val
< pointers_number
) {
4876 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
,
4877 "alloc_extent elements");
4881 else if ((flags
& (1<<flag_alloc_elements_is_scalar
)) != 0) {
4882 alloc_elem
.val
= (int64_t)vd3
->alloc_elements
;
4888 for (int k
= 0; k
< pointers_number
; k
++) {
4889 int type
= flags
& 0x3f;
4890 int type_src
, type_dst
;
4892 // type_src, type_dst
4893 type_src
= type_dst
= (type
== c_data_ptr_array
) ?
4894 c_data_ptr
: (type
== c_func_ptr_array
) ?
4895 c_func_ptr
: (type
== c_void_ptr_array
) ?
4896 c_void_ptr
: (type
== c_string_ptr_array
) ?
4900 if (!ptr
.read_next(true)) {
4904 ptr
.val
= (void*)(ptr
.base
+ ptr
.offset
);
4907 // !!! If we got error at phase of reading - it's an internal
4908 // !!! error, as we must detect mismatch before
4911 if (m_vars
[i
].into
) {
4912 if (!into
.read_next(true)) {
4913 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "into");
4917 into
.val
= (void*)(into
.base
+ into
.offset
);
4921 // Get other components of the clause
4922 if (!ext_start
.read_next(flags
& (1<<flag_extent_start_is_array
))) {
4923 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "extent start");
4926 if (!ext_elements
.read_next(
4927 flags
& (1<<flag_extent_elements_is_array
))) {
4928 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "extent elements");
4931 if (!alloc_if
.read_next(flags
& (1<<flag_alloc_if_is_array
))) {
4932 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "alloc_if");
4935 if (!free_if
.read_next(flags
& (1<<flag_free_if_is_array
))) {
4936 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "free_if");
4939 if (!align
.read_next(flags
& (1<<flag_align_is_array
))) {
4940 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "align");
4943 if (!into_start
.read_next(flags
& (1<<flag_into_start_is_array
))) {
4944 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "into_extent start");
4947 if (!into_elem
.read_next(flags
& (1<<flag_into_elements_is_array
))) {
4948 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "into_extent elements");
4951 if (!alloc_start
.read_next(flags
& (1<<flag_alloc_start_is_array
))) {
4952 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "alloc_extent start");
4955 if (!alloc_elem
.read_next(
4956 flags
& (1<<flag_alloc_elements_is_array
))) {
4957 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "alloc_extent elements");
4961 m_vars
[new_index
+ k
].direction
.bits
= m_vars
[i
].direction
.bits
;
4962 m_vars
[new_index
+ k
].alloc_if
= alloc_if
.val
;
4963 m_vars
[new_index
+ k
].free_if
= free_if
.val
;
4964 m_vars
[new_index
+ k
].align
= align
.val
;
4965 m_vars
[new_index
+ k
].mic_offset
= 0;
4966 m_vars
[new_index
+ k
].flags
.bits
= m_vars
[i
].flags
.bits
;
4967 m_vars
[new_index
+ k
].offset
= 0;
4968 m_vars
[new_index
+ k
].size
= m_vars
[i
].size
;
4969 m_vars
[new_index
+ k
].flags
.targetptr
= m_vars
[i
].flags
.targetptr
;
4970 m_vars
[new_index
+ k
].flags
.preallocated
=
4971 m_vars
[i
].flags
.preallocated
;
4973 if (ext_start
.val
== 0) {
4974 m_vars
[new_index
+ k
].count
= ext_elements
.val
;
4975 m_vars
[new_index
+ k
].ptr
= ptr
.val
;
4976 if (type_src
== c_string_ptr
) {
4977 m_vars
[new_index
+ k
].size
= 0;
4981 m_vars
[new_index
+ k
].count
= 0;
4982 m_vars
[new_index
+ k
].ptr
=
4983 static_cast<void*>(make_arr_desc(
4989 type_src
= type_src
== c_data_ptr
? c_cean_var_ptr
:
4990 c_string_ptr
? c_cean_var_ptr
:
4992 if (!m_vars
[i
].into
) {
4993 type_dst
= type_src
;
4997 if (m_vars
[i
].into
&& into_elem
.val
!= 0) {
4998 m_vars
[new_index
+ k
].into
=
4999 static_cast<void*>(make_arr_desc(
5004 type_dst
= (type
== c_data_ptr_array
) ? c_cean_var_ptr
:
5005 (type
== c_string_ptr_array
) ? c_cean_var_ptr
:
5009 m_vars
[new_index
+ k
].into
= NULL
;
5012 if (alloc_elem
.val
!= 0) {
5013 m_vars
[new_index
+ k
].alloc
=
5014 static_cast<void*>(make_arr_desc(
5021 m_vars
[new_index
+ k
].alloc
= NULL
;
5024 m_vars
[new_index
+ k
].type
.src
= type_src
;
5025 m_vars
[new_index
+ k
].type
.dst
= type_dst
;
5027 m_vars_extra
[new_index
+ k
].alloc
= m_vars
[new_index
+ k
].alloc
;
5028 m_vars_extra
[new_index
+ k
].is_arr_ptr_el
= 1;
5029 m_vars_extra
[new_index
+ k
].ptr_arr_offset
=
5030 src_is_for_mic
? ptr
.offset
: into
.offset
;
5032 // count and alloc fields are useless at target. They can be reused
5033 // for pointer arrays.
5034 m_vars
[i
].count
= pointers_number
;
5035 m_vars
[i
].ptr_arr_offset
= new_index
;
5039 // Gets in dependencies of the previous offload via the stream "m_stream".
5040 // Out argument in_deps_amount - address of amount of the dependencies
5041 // Out argument in_deps - array of dependencies.
5042 // Description of the dependencies scheme for streams :
5043 // ----------------------------------------------------
5044 // Every offload forms DAG consisted of 3 nodes:
5045 // for in-transfers, runfunction and out-transfers.
5046 // Every node has in-dependencies and out-dependencies
5047 // Out-dependencies of previous node forms in-dependencies of current node.
5048 // In-dependencies of 1-st node (of in-transfers) without streams is equal
5049 // to NULL. For streams in-dependencies of 1-st node is equal to list of out
5050 // dependencies of last node of previous offload via this stream.
5051 // So we can say that DAGs of 2 consequent offloads via the same stream are
5052 // connected by the way described above.
5053 void OffloadDescriptor::get_stream_in_dependencies(
5054 uint32_t &in_deps_amount
,
5058 if (m_stream
!= no_stream
&& m_stream
!= 0) {
5059 Stream
* stream
= Stream::find_stream(m_stream
, false);
5061 LIBOFFLOAD_ERROR(c_offload_no_stream
,
5062 m_device
.get_logical_index());
5065 OffloadDescriptor
* offload
= stream
->get_last_offload();
5067 // if it's the first offload in the stream
5071 // if last offload has out-tranfers
5072 if (offload
->m_out_deps_total
) {
5073 in_deps_amount
= offload
->m_out_deps_total
;
5074 in_deps
= offload
->m_out_deps
;
5076 // last offload only sends pointer data or run function or both of them
5077 // and has no out-transfers
5078 else if (offload
->m_in_deps_total
) {
5079 in_deps_amount
= offload
->m_in_deps_total
;
5080 in_deps
= offload
->m_in_deps
;
5085 static void __offload_fini_library(void)
5087 OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ...\n");
5088 if (mic_engines_total
> 0) {
5089 delete[] mic_engines
;
5091 if (mic_proxy_fs_root
!= 0) {
5092 free(mic_proxy_fs_root
);
5093 mic_proxy_fs_root
= 0;
5096 if (mic_library_path
!= 0) {
5097 free(mic_library_path
);
5098 mic_library_path
= 0;
5101 // destroy thread key
5102 thread_key_delete(mic_thread_key
);
5105 // unload COI library
5106 if (COI::is_available
) {
5110 OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ... done\n");
5113 static void __offload_init_library_once(void)
5116 uint32_t num_devices
;
5117 std::bitset
<MIC_ENGINES_MAX
> devices
;
5118 prefix
= report_get_message_str(c_report_host
);
5121 const char *env_var
= getenv(htrace_envname
);
5122 if (env_var
!= 0 && *env_var
!= '\0') {
5124 if (__offload_parse_int_string(env_var
, new_val
)) {
5125 console_enabled
= new_val
& 0x0f;
5129 env_var
= getenv(offload_report_envname
);
5130 if (env_var
!= 0 && *env_var
!= '\0') {
5132 if (__offload_parse_int_string(env_var
, env_val
)) {
5133 if (env_val
== OFFLOAD_REPORT_1
||
5134 env_val
== OFFLOAD_REPORT_2
||
5135 env_val
== OFFLOAD_REPORT_3
) {
5136 offload_report_level
= env_val
;
5139 LIBOFFLOAD_ERROR(c_invalid_env_report_value
,
5140 offload_report_envname
);
5144 LIBOFFLOAD_ERROR(c_invalid_env_var_int_value
,
5145 offload_report_envname
);
5148 else if (!offload_report_level
) {
5149 env_var
= getenv(timer_envname
);
5150 if (env_var
!= 0 && *env_var
!= '\0') {
5151 timer_enabled
= atoi(env_var
);
5160 // get number of devices installed in the system
5161 res
= COI::EngineGetCount(COI_ISA_MIC
, &num_devices
);
5162 if (res
!= COI_SUCCESS
) {
5166 if (num_devices
> MIC_ENGINES_MAX
) {
5167 num_devices
= MIC_ENGINES_MAX
;
5170 // fill in the list of devices that can be used for offloading
5171 env_var
= getenv("OFFLOAD_DEVICES");
5173 if (strcasecmp(env_var
, "none") != 0) {
5174 // value is composed of comma separated physical device indexes
5175 char *buf
= strdup(env_var
);
5177 for (str
= strtok_r(buf
, ",", &ptr
); str
!= 0;
5178 str
= strtok_r(0, ",", &ptr
)) {
5179 // convert string to an int
5181 if (!__offload_parse_int_string(str
, num
)) {
5182 LIBOFFLOAD_ERROR(c_mic_init5
);
5184 // fallback to using all installed devices
5186 for (int i
= 0; i
< num_devices
; i
++) {
5191 if (num
< 0 || num
>= num_devices
) {
5192 LIBOFFLOAD_ERROR(c_mic_init6
, num
);
5201 // use all available devices
5202 for (int i
= 0; i
< num_devices
; i
++) {
5204 res
= COI::EngineGetHandle(COI_ISA_MIC
, i
, &engine
);
5205 if (res
== COI_SUCCESS
) {
5211 mic_engines_total
= devices
.count();
5213 // no need to continue if there are no devices to offload to
5214 if (mic_engines_total
<= 0) {
5218 // initialize indexes for available devices
5219 mic_engines
= new Engine
[mic_engines_total
];
5220 for (int p_idx
= 0, l_idx
= 0; p_idx
< num_devices
; p_idx
++) {
5221 if (devices
[p_idx
]) {
5222 mic_engines
[l_idx
].set_indexes(l_idx
, p_idx
);
5227 // Get DMA channel count to pass it to COI
5228 env_var
= getenv("OFFLOAD_DMA_CHANNEL_COUNT");
5231 if (__offload_parse_int_string(env_var
, new_val
)) {
5232 mic_dma_channel_count
= new_val
;
5235 LIBOFFLOAD_ERROR(c_invalid_env_var_value
,
5236 "OFFLOAD_DMA_CHANNEL_COUNT");
5240 // Set COI_HOST_THREAD_AFFINITY if OFFLOAD_HOST_THREAD_AFFINITY is set.
5241 // Use putenv instead of setenv as Windows has no setenv.
5242 // Note: putenv requires its argument can't be freed or modified.
5243 // So no free after call to putenv or elsewhere.
5244 env_var
= getenv("OFFLOAD_HOST_THREAD_AFFINITY");
5246 char * new_env_var
=
5247 (char*) malloc(sizeof("COI_HOST_THREAD_AFFINITY=") +
5248 sizeof(env_var
) + 1);
5249 sprintf(new_env_var
, "COI_HOST_THREAD_AFFINITY=%s", env_var
);
5250 putenv(new_env_var
);
5253 // library search path for device binaries
5254 env_var
= getenv("MIC_LD_LIBRARY_PATH");
5256 mic_library_path
= strdup(env_var
);
5260 // find target executable to be used if main application is not an
5261 // offload build application.
5262 const char *base_name
= "offload_main";
5263 if (mic_library_path
!= 0) {
5264 char *buf
= strdup(mic_library_path
);
5265 char *try_name
= (char*) alloca(strlen(mic_library_path
) +
5266 strlen(base_name
) + 2);
5269 for (dir
= strtok_r(buf
, PATH_SEPARATOR
, &ptr
); dir
!= 0;
5270 dir
= strtok_r(0, PATH_SEPARATOR
, &ptr
)) {
5271 // compose a full path
5272 sprintf(try_name
, "%s/%s", dir
, base_name
);
5274 // check if such file exists
5276 if (stat(try_name
, &st
) == 0 && S_ISREG(st
.st_mode
)) {
5277 mic_device_main
= strdup(try_name
);
5285 // memory size reserved for COI buffers
5286 env_var
= getenv("MIC_BUFFERSIZE");
5289 if (__offload_parse_size_string(env_var
, new_size
)) {
5290 mic_buffer_size
= new_size
;
5293 LIBOFFLOAD_ERROR(c_invalid_env_var_value
, "MIC_BUFFERSIZE");
5297 // memory size reserved for 4K pages for COI buffers
5298 env_var
= getenv("MIC_4K_BUFFER_RESERVE_SIZE");
5301 if (__offload_parse_size_string(env_var
, new_size
)) {
5302 mic_4k_buffer_size
= new_size
;
5305 LIBOFFLOAD_ERROR(c_invalid_env_var_value
, "MIC_4K_BUFFER_RESERVE_SIZE");
5309 // memory size reserved for 2M pages for COI buffers
5310 env_var
= getenv("MIC_2M_BUFFER_RESERVE_SIZE");
5313 if (__offload_parse_size_string(env_var
, new_size
)) {
5314 mic_2m_buffer_size
= new_size
;
5317 LIBOFFLOAD_ERROR(c_invalid_env_var_value
, "MIC_2M_BUFFER_RESERVE_SIZE");
5321 // determine stacksize for the pipeline on the device
5322 env_var
= getenv("MIC_STACKSIZE");
5323 if (env_var
!= 0 && *env_var
!= '\0') {
5325 if (__offload_parse_size_string(env_var
, new_size
) &&
5326 (new_size
>= 16384) && ((new_size
& 4095) == 0)) {
5327 mic_stack_size
= new_size
;
5330 LIBOFFLOAD_ERROR(c_mic_init3
);
5335 env_var
= getenv("MIC_PROXY_IO");
5336 if (env_var
!= 0 && *env_var
!= '\0') {
5338 if (__offload_parse_int_string(env_var
, new_val
)) {
5339 mic_proxy_io
= new_val
;
5342 LIBOFFLOAD_ERROR(c_invalid_env_var_int_value
, "MIC_PROXY_IO");
5345 env_var
= getenv("MIC_PROXY_FS_ROOT");
5346 if (env_var
!= 0 && *env_var
!= '\0') {
5347 mic_proxy_fs_root
= strdup(env_var
);
5350 // Prepare environment for the target process using the following
5352 // - If MIC_ENV_PREFIX is set then any environment variable on the
5353 // host which has that prefix are copied to the device without
5355 // All other host environment variables are ignored.
5356 // - If MIC_ENV_PREFIX is not set or if MIC_ENV_PREFIX="" then host
5357 // environment is duplicated.
5358 env_var
= getenv("MIC_ENV_PREFIX");
5359 if (env_var
!= 0 && *env_var
!= '\0') {
5360 mic_env_vars
.set_prefix(env_var
);
5362 int len
= strlen(env_var
);
5363 for (int i
= 0; environ
[i
] != 0; i
++) {
5364 if (strncmp(environ
[i
], env_var
, len
) == 0 &&
5365 strncmp(environ
[i
], "MIC_LD_LIBRARY_PATH", 19) != 0 &&
5366 environ
[i
][len
] != '=') {
5367 mic_env_vars
.analyze_env_var(environ
[i
]);
5372 // create key for thread data
5373 if (thread_key_create(&mic_thread_key
, Engine::destroy_thread_data
)) {
5374 LIBOFFLOAD_ERROR(c_mic_init4
, errno
);
5379 cpu_frequency
= COI::PerfGetCycleFrequency();
5381 env_var
= getenv(mic_use_2mb_buffers_envname
);
5382 if (env_var
!= 0 && *env_var
!= '\0') {
5384 if (__offload_parse_size_string(env_var
, new_size
)) {
5385 __offload_use_2mb_buffers
= new_size
;
5388 LIBOFFLOAD_ERROR(c_invalid_env_var_value
,
5389 mic_use_2mb_buffers_envname
);
5393 env_var
= getenv(mic_use_async_buffer_write_envname
);
5394 if (env_var
!= 0 && *env_var
!= '\0') {
5396 if (__offload_parse_size_string(env_var
, new_size
)) {
5397 __offload_use_async_buffer_write
= new_size
;
5401 env_var
= getenv(mic_use_async_buffer_read_envname
);
5402 if (env_var
!= 0 && *env_var
!= '\0') {
5404 if (__offload_parse_size_string(env_var
, new_size
)) {
5405 __offload_use_async_buffer_read
= new_size
;
5409 // mic initialization type
5410 env_var
= getenv(offload_init_envname
);
5411 if (env_var
!= 0 && *env_var
!= '\0') {
5412 if (strcmp(env_var
, "on_offload") == 0) {
5413 __offload_init_type
= c_init_on_offload
;
5415 else if (strcmp(env_var
, "on_offload_all") == 0) {
5416 __offload_init_type
= c_init_on_offload_all
;
5418 else if (strcmp(env_var
, "on_start") == 0) {
5419 __offload_init_type
= c_init_on_start
;
5422 LIBOFFLOAD_ERROR(c_invalid_env_var_value
, offload_init_envname
);
5427 env_var
= getenv(offload_active_wait_envname
);
5428 if (env_var
!= 0 && *env_var
!= '\0') {
5430 if (__offload_parse_int_string(env_var
, new_val
)) {
5431 __offload_active_wait
= new_val
;
5434 LIBOFFLOAD_ERROR(c_invalid_env_var_int_value
,
5435 offload_active_wait_envname
);
5440 env_var
= getenv(omp_device_num_envname
);
5441 if (env_var
!= 0 && *env_var
!= '\0') {
5443 if (__offload_parse_int_string(env_var
, new_val
) && new_val
>= 0) {
5444 __omp_device_num
= new_val
;
5447 LIBOFFLOAD_ERROR(c_omp_invalid_device_num_env
,
5448 omp_device_num_envname
);
5452 // parallel copy of offload_transfer
5453 env_var
= getenv(parallel_copy_envname
);
5454 if (env_var
!= 0 && *env_var
!= '\0') {
5456 if (__offload_parse_int_string(env_var
, new_val
) && new_val
>= 0) {
5457 __offload_parallel_copy
= new_val
;
5460 LIBOFFLOAD_ERROR(c_invalid_env_var_value
,
5461 parallel_copy_envname
);
5465 // use COI interface for noncontiguous arrays transfer
5466 env_var
= getenv(use_coi_noncontiguous_transfer_envname
);
5467 if (env_var
!= 0 && *env_var
!= '\0') {
5469 if (__offload_parse_size_string(env_var
, new_size
)) {
5470 __offload_use_coi_noncontiguous_transfer
= new_size
;
5473 LIBOFFLOAD_ERROR(c_invalid_env_var_value
,
5474 use_coi_noncontiguous_transfer_envname
);
// Public entry point for offload-runtime initialization.
// Runs the heavy one-time setup (__offload_init_library_once) exactly once,
// then flushes any target libraries that were queued before the runtime
// was ready.  Returns non-zero iff offload is usable: COI is available
// and at least one MIC engine was detected.
5482 extern int __offload_init_library(void)
5484 // do one time initialization
// Once-control guard; __offload_run_once serializes concurrent callers.
5485 static OffloadOnceControl ctrl
= OFFLOAD_ONCE_CONTROL_INIT
;
5486 __offload_run_once(&ctrl
, __offload_init_library_once
);
5488 // offload is available if COI is available and the number of devices > 0
5489 bool is_available
= COI::is_available
&& (mic_engines_total
> 0);
5491 // register pending libraries if there are any
// __target_libs is set by __offload_register_image for shared libraries
// registered before initialization completed.
5492 if (is_available
&& __target_libs
) {
// RAII lock: held for the whole drain of the pending-library list.
5493 mutex_locker_t
locker(__target_libs_lock
);
5495 for (TargetImageList::iterator it
= __target_libs_list
.begin();
5496 it
!= __target_libs_list
.end(); it
++) {
5497 // Register library in COI
5498 COI::ProcessRegisterLibraries(1, &it
->data
, &it
->size
,
5499 &it
->origin
, &it
->offset
);
5501 // add lib to all engines
5502 for (int i
= 0; i
< mic_engines_total
; i
++) {
5503 mic_engines
[i
].add_lib(*it
);
// Pending list fully processed; reset the flag and free the entries.
5507 __target_libs
= false;
5508 __target_libs_list
.clear();
5511 return is_available
;
// Returns true when the packed target image contains an ELF executable
// (e_type == ET_EXEC) as opposed to a shared library.
// The image layout is: a null-terminated name string followed immediately
// by the binary contents (an ELF file whose header we inspect).
5514 extern "C" bool __offload_target_image_is_executable(const void *target_image
)
5516 const struct Image
*image
= static_cast<const struct Image
*>(target_image
);
// 'name' is the leading null-terminated string of the packed image.
// NOTE(review): 'name' is not used in the visible code — possibly used by
// lines elided here, or dead; confirm against the full file.
5519 const char *name
= image
->data
;
// Binary contents start right after the name's terminating NUL.
5520 const void *data
= image
->data
+ strlen(image
->data
) + 1;
5522 // determine image type
5523 const Elf64_Ehdr
*hdr
= static_cast<const Elf64_Ehdr
*>(data
);
5524 return (hdr
->e_type
== ET_EXEC
);
// Registers one packed target image (executable or shared library) with
// the offload runtime.  For an executable this also triggers full runtime
// initialization; for a shared library registration may be deferred until
// initialization happens.  Returns true when offload devices are usable.
5527 extern "C" bool __offload_register_image(const void *target_image
)
5529 const struct Image
*image
= static_cast<const struct Image
*>(target_image
);
// Packed layout: null-terminated name, then <size> bytes of binary data.
5532 const char *name
= image
->data
;
5533 const void *data
= image
->data
+ strlen(image
->data
) + 1;
5534 uint64_t size
= image
->size
;
// Buffer for the host-side file name extracted below; ownership passes
// to the TargetImage created from it (freed elsewhere, not here).
5535 char *origin
= (char *) malloc(strlen(image
->data
) + 1);
5536 uint64_t offset
= 0;
5537 const char *host_name
= image
->data
;
// malloc failure path (the guarding condition is on a line elided here).
5541 LIBOFFLOAD_ERROR(c_malloc
);
5543 // The origin name is the name of the file on the host
5544 // this is used by Vtune, since it is a fat binary we
5545 // use the host file name of the fat binary.
5546 // Driver prepends the host file name ending with "?"
5547 // to the image->data name so need to extract the string
// Copy characters up to the '?' separator into 'origin'.
5549 while (*host_name
!= '\0' && *host_name
!= '?') {
5550 origin
[i
] = *host_name
;
5555 // Implies the host name does not exist which really should
5556 // not occur. Allow this since only consumer is Vtune.
5557 if ((i
== 0) || (*host_name
!= '?')) {
5562 // our actions depend on the image type
5563 const Elf64_Ehdr
*hdr
= static_cast<const Elf64_Ehdr
*>(data
);
// Dispatch on ELF type; case labels (ET_EXEC / ET_DYN) are on lines
// elided in this excerpt.  The first arm below handles executables.
5564 switch (hdr
->e_type
) {
5566 // Each offload application is supposed to have only one target
5567 // image representing target executable.
5568 // No thread synchronization is required here as the initialization
5569 // code is always executed in a single thread.
5570 if (__target_exe
!= 0) {
5571 LIBOFFLOAD_ERROR(c_multiple_target_exes
);
5574 __target_exe
= new TargetImage(name
, data
, size
, origin
, offset
);
5576 // Registration code for execs is always called from the context
5577 // of main and thus we can safely call any function here,
5578 // including LoadLibrary API on windows. This is the place where
5579 // we do the offload library initialization.
5580 if (__offload_init_library()) {
5581 // initialize engine if init_type is on_start
5582 if (__offload_init_type
== c_init_on_start
) {
5583 for (int i
= 0; i
< mic_engines_total
; i
++) {
5584 mic_engines
[i
].init();
5588 return mic_engines_total
> 0;
// Shared-library arm: queue the library for registration once the
// runtime is initialized (or register immediately if main is running).
5592 char *fullname
= origin
;
5593 // We add the library to a list of pending libraries
5594 __target_libs_lock
.lock();
5595 __target_libs
= true;
5596 __target_libs_list
.push_back(
5597 TargetImage(name
, data
, size
, fullname
, offset
));
5598 __target_libs_lock
.unlock();
5599 // If __target_exe is set, then main has started running
5600 // If not main, then we can't do anything useful here
5601 // because this registration code is called from DllMain
5602 // context (on windows).
5603 if (__target_exe
!= 0) {
5604 // There is no need to delay loading the library
5605 if (!__offload_init_library()) {
5606 // Couldn't validate library as a fat offload library
5607 LIBOFFLOAD_ERROR(c_unknown_binary_type
);
5615 // something is definitely wrong, issue an error and exit
5616 LIBOFFLOAD_ERROR(c_unknown_binary_type
);
// Unregisters a packed target image.  For the executable image this is
// the application-teardown hook: it prints timing/report data if enabled,
// shuts down MYO support, and finalizes the offload library.  For a
// shared library it unloads the library from every MIC engine.
5621 extern "C" void __offload_unregister_image(const void *target_image
)
5623 // Target image is packed as follows:
5624 // 8 bytes - size of the target binary
5625 // null-terminated string - binary name
5626 // <size> bytes - binary contents
5627 const struct Image
{
5630 } *image
= static_cast<const struct Image
*>(target_image
);
5633 const char *name
= image
->data
;
5634 const void *data
= image
->data
+ strlen(image
->data
) + 1;
5636 // our actions depend on the image type
5637 const Elf64_Ehdr
*hdr
= static_cast<const Elf64_Ehdr
*>(data
);
5638 if (hdr
->e_type
== ET_EXEC
) {
5639 // We are executing exec's destructors.
5640 // It is time to do a library cleanup.
5641 if (timer_enabled
) {
5642 Offload_Timer_Print();
// MYO shutdown is compiled in only under MYO_SUPPORT (the matching
// #ifdef is on a line elided in this excerpt).
5646 __offload_myoFini();
5647 #endif // MYO_SUPPORT
5649 __offload_fini_library();
5651 else if (hdr
->e_type
== ET_DYN
) {
// Shared library: remove it from every initialized engine.
5652 for (int i
= 0; i
< mic_engines_total
; i
++) {
5653 mic_engines
[i
].unload_library(data
, name
);
5659 // Runtime trace interface for user programs
// User-callable trace control: sets the global console trace verbosity
// level for the offload runtime.
5661 void __offload_console_trace(int level
)
5663 console_enabled
= level
;
5666 // User-visible offload API
// User-visible API: returns the number of available MIC devices,
// initializing the offload runtime on first use.
5668 int _Offload_number_of_devices(void)
5670 __offload_init_library();
5671 return mic_engines_total
;
// User-visible API: logical device number query.
// NOTE(review): body not visible in this excerpt — presumably returns the
// current logical device index (or -1 on the host); confirm in full source.
5674 int _Offload_get_device_number(void)
// User-visible API: physical device number query.
// NOTE(review): body not visible in this excerpt — presumably returns the
// physical index for the current device; confirm in full source.
5679 int _Offload_get_physical_device_number(void)
// User-visible API: polls whether the async offload associated with
// 'signal' on logical device 'index' has completed.
// 'index' is reduced modulo the engine count, so any non-negative value
// maps to a real device.
5684 int _Offload_signaled(int index
, void *signal
)
5686 __offload_init_library();
5688 // check index value
// Error path for an invalid index (the guarding condition is on a line
// elided in this excerpt).
5690 LIBOFFLOAD_ERROR(c_offload_signaled1
, index
);
5694 index
%= mic_engines_total
;
5696 // find associated async task
5697 OffloadDescriptor
*task
=
5698 mic_engines
[index
].find_signal(signal
, false);
// Signal never posted on this engine.
5700 LIBOFFLOAD_ERROR(c_offload_signaled2
, signal
);
5703 // if signal is removed by wait completing
5704 else if (task
== SIGNAL_IS_REMOVED
) {
5707 return task
->is_signaled();
// User-visible API: enables or disables offload reporting.
// Only the two recognized values (OFFLOAD_REPORT_ON / OFFLOAD_REPORT_OFF)
// change state; anything else is silently ignored.
5710 void _Offload_report(int val
)
5712 if (val
== OFFLOAD_REPORT_ON
||
5713 val
== OFFLOAD_REPORT_OFF
) {
5714 offload_report_enabled
= val
;
// User-visible API: looks up the host->MIC buffer association for
// 'cpu_addr' on a target device and reports the association's host base
// address, length, MIC-side address, buffer start offset and whether it
// is static.  Returns the association's reference count (or 1 for static
// data); the not-found return value is on a line elided in this excerpt.
// NOTE(review): parameters 'target', 'mic_addr' and 'is_static' are used
// in the body but their declaration lines are not visible here — confirm
// the full signature against the full source.
5718 int _Offload_find_associated_mic_memory(
5720 const void* cpu_addr
,
5721 void** cpu_base_addr
,
5722 uint64_t* buf_length
,
5724 uint64_t* mic_buf_start_offset
,
5728 __offload_init_library();
5730 // check target value
// Error path for an invalid target device (condition elided here).
5732 LIBOFFLOAD_ERROR(c_offload_signaled1
, target
);
5735 target
%= mic_engines_total
;
5737 // find existing association in pointer table
5738 PtrData
* ptr_data
= mic_engines
[target
].find_ptr_data(cpu_addr
);
5739 if (ptr_data
== 0) {
5740 OFFLOAD_TRACE(3, "Association does not exist\n");
5744 OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
5745 ptr_data
->cpu_addr
.start(), ptr_data
->cpu_addr
.length(),
5746 ptr_data
->is_static
);
// Lazily resolve the MIC-side (sink) address of the COI buffer the
// first time it is needed.
5748 if (ptr_data
->mic_buf
!= 0 && ptr_data
->mic_addr
== 0) {
5749 COIRESULT res
= COI::BufferGetSinkAddress(ptr_data
->mic_buf
,
5750 &ptr_data
->mic_addr
);
5751 if (res
!= COI_SUCCESS
) {
// Fill all output parameters from the found association.
5755 *cpu_base_addr
= const_cast<void *>(ptr_data
->cpu_addr
.start());
5756 *buf_length
= ptr_data
->cpu_addr
.length() - ptr_data
->alloc_disp
;
5757 *mic_addr
= (void *)(ptr_data
->mic_addr
+ ptr_data
->mic_offset
);
5758 *mic_buf_start_offset
= ptr_data
->alloc_disp
;
5759 *is_static
= ptr_data
->is_static
;
5760 return ptr_data
->is_static
? 1 : ptr_data
->get_reference();
// User-visible API: creates an offload stream on the given MIC device
// with 'number_of_cpus' cores allocated to it, and sets up the COI
// pipeline backing the stream.  Returns the new stream handle (the
// failure return is on a line elided in this excerpt).
5763 _Offload_stream
_Offload_stream_create(
5764 int device
, // MIC device number
5765 int number_of_cpus
// Cores allocated to the stream
5768 __offload_init_library();
5770 // check target value
// Error path for an invalid device number (condition elided here).
5772 LIBOFFLOAD_ERROR(c_offload_signaled1
, device
);
5775 device
%= mic_engines_total
;
5777 // Create new stream and get its handle
5778 _Offload_stream handle
= Stream::add_stream(device
, number_of_cpus
);
5780 OFFLOAD_TRACE(3, "Can't create stream\n");
5784 // create pipeline associated with the new stream
5785 mic_engines
[device
].get_pipeline(handle
);
// User-visible API: destroys an offload stream on the given MIC device.
// The device number is validated and reduced modulo the engine count
// before the engine's stream_destroy is invoked.
5790 int _Offload_stream_destroy(
5791 int device
, // MIC device number
5792 _Offload_stream handle
// stream to destroy
5795 __offload_init_library();
5797 // check target value
// Error path for an invalid device number (condition elided here).
5799 LIBOFFLOAD_ERROR(c_offload_signaled1
, device
);
5802 device
%= mic_engines_total
;
5804 mic_engines
[device
].stream_destroy(handle
);
// User-visible API: polls completion of a stream's last offload on the
// given device.  A non-zero 'handler' queries that single stream; a zero
// handler checks every known stream and reports completed only if all of
// them are.
5809 int _Offload_stream_completed(int device
, _Offload_stream handler
)
5811 __offload_init_library();
5813 // check index value
// Error path for an invalid device number (condition elided here).
5815 LIBOFFLOAD_ERROR(c_offload_signaled1
, device
);
5819 device
%= mic_engines_total
;
// Single-stream case: look the stream up without removing it.
5825 stream
= Stream::find_stream(handler
, false);
5827 // the stream was not created or was destroyed
5829 LIBOFFLOAD_ERROR(c_offload_no_stream
, device
);
5833 // find associated async task
5834 OffloadDescriptor
*task
= stream
->get_last_offload();
5836 // offload was completed by offload_wait pragma or wait clause
5840 return task
->is_signaled();
5842 // zero handler is for all streams at the device
// Iterate over a copy of the global stream map.
5844 StreamMap stream_map
= Stream::all_streams
;
5845 for (StreamMap::iterator it
= stream_map
.begin();
5846 it
!= stream_map
.end(); it
++) {
5847 Stream
* stream
= it
->second
;
5848 // find associated async task
5849 OffloadDescriptor
*task
= stream
->get_last_offload();
5851 // offload was completed by offload_wait pragma or wait clause
5855 // if even one stream is not completed result is false
5856 if (!task
->is_signaled()) {
5860 // no uncompleted streams
// Debugger-support interface: globals a debugger reads/writes to
// coordinate with the offload runtime.
// Non-zero when a debugger is attached.
5866 int __dbg_is_attached
= 0;
// NOTE(review): presumably the device index being debugged — confirm.
5867 int __dbg_target_id
= -1;
// Process id of the target-side process; -1 when unknown.
5868 pid_t __dbg_target_so_pid
= -1;
// Target executable name buffer, zero-initialized.
5869 char __dbg_target_exe_name
[MAX_TARGET_NAME
] = {0};
// Version of this debugger API, checked by the attaching debugger.
5870 const int __dbg_api_major_version
= 1;
5871 const int __dbg_api_minor_version
= 0;
5873 void __dbg_target_so_loaded()
5876 void __dbg_target_so_unloaded()