liboffloadmic/runtime/offload_engine.h
/*
    Copyright (c) 2014-2016 Intel Corporation.  All Rights Reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

      * Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.
      * Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in the
        documentation and/or other materials provided with the distribution.
      * Neither the name of Intel Corporation nor the names of its
        contributors may be used to endorse or promote products derived
        from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef OFFLOAD_ENGINE_H_INCLUDED
#define OFFLOAD_ENGINE_H_INCLUDED

#include <limits.h>
#include <bitset>
#include <list>
#include <set>
#include <map>
#include "offload_common.h"
#include "coi/coi_client.h"

#define SIGNAL_HAS_COMPLETED ((OffloadDescriptor *)-1)
const int64_t no_stream = -1;

// Address range
class MemRange {
public:
    MemRange() : m_start(0), m_length(0) {}
    MemRange(const void *addr, uint64_t len) : m_start(addr), m_length(len) {}

    const void* start() const {
        return m_start;
    }

    const void* end() const {
        return static_cast<const char*>(m_start) + m_length;
    }

    uint64_t length() const {
        return m_length;
    }

    // Returns true if this range overlaps with the given one.
    bool overlaps(const MemRange &o) const {
        // Two address ranges A[start, end) and B[start, end) overlap
        // if A.start < B.end and A.end > B.start.
        return start() < o.end() && end() > o.start();
    }

    // Returns true if this range contains the given range.
    bool contains(const MemRange &o) const {
        return start() <= o.start() && o.end() <= end();
    }

private:
    const void* m_start;
    uint64_t    m_length;
};
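
// Illustrative sketch (not part of the original header): MemRange models a
// half-open interval [start, start + length), so end() itself is excluded.
// Assuming two hypothetical ranges built over a local buffer:
//
//     char buf[256];
//     MemRange whole(buf, 256);         // [buf, buf + 256)
//     MemRange tail(buf + 192, 64);     // [buf + 192, buf + 256)
//     MemRange past(buf + 256, 16);     // starts exactly at whole.end()
//
//     whole.overlaps(tail);   // true  - the intervals share bytes
//     whole.contains(tail);   // true  - tail lies entirely inside whole
//     whole.overlaps(past);   // false - [start, end) excludes end()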

// Data associated with a pointer variable
class PtrData {
public:
    PtrData(const void *addr, uint64_t len) :
        cpu_addr(addr, len), cpu_buf(0),
        mic_addr(0), alloc_disp(0), mic_buf(0), mic_offset(0),
        ref_count(0), is_static(false), is_omp_associate(false),
        var_alloc_type(false)
    {}

    //
    // Copy constructor
    //
    PtrData(const PtrData& ptr):
        cpu_addr(ptr.cpu_addr), cpu_buf(ptr.cpu_buf),
        mic_addr(ptr.mic_addr), alloc_disp(ptr.alloc_disp),
        mic_buf(ptr.mic_buf), mic_offset(ptr.mic_offset),
        ref_count(ptr.ref_count), is_static(ptr.is_static),
        is_omp_associate(ptr.is_omp_associate),
        var_alloc_type(0)
    {}

    bool operator<(const PtrData &o) const {
        // Variables are sorted by the CPU start address.
        // Overlapping memory ranges are considered equal.
        return (cpu_addr.start() < o.cpu_addr.start()) &&
               !cpu_addr.overlaps(o.cpu_addr);
    }

    long add_reference() {
        if (is_omp_associate || (is_static && !var_alloc_type)) {
            return LONG_MAX;
        }
#ifndef TARGET_WINNT
        return __sync_fetch_and_add(&ref_count, 1);
#else // TARGET_WINNT
        return _InterlockedIncrement(&ref_count) - 1;
#endif // TARGET_WINNT
    }

    long remove_reference() {
        if (is_omp_associate || (is_static && !var_alloc_type)) {
            return LONG_MAX;
        }
#ifndef TARGET_WINNT
        return __sync_sub_and_fetch(&ref_count, 1);
#else // TARGET_WINNT
        return _InterlockedDecrement(&ref_count);
#endif // TARGET_WINNT
    }

    long get_reference() const {
        if (is_omp_associate || (is_static && !var_alloc_type)) {
            return LONG_MAX;
        }
        return ref_count;
    }

public:
    // CPU address range
    const MemRange cpu_addr;

    // CPU and MIC buffers
    COIBUFFER cpu_buf;
    COIBUFFER mic_buf;

    // placeholder for buffer address on MIC
    uint64_t mic_addr;

    uint64_t alloc_disp;

    // additional offset to pointer data on MIC for improving bandwidth for
    // data which is not 4K aligned
    uint32_t mic_offset;

    // if true, buffers are created from static memory
    bool is_static;

    // true if the MIC buffer was created by omp_target_associate
    bool is_omp_associate;

    bool var_alloc_type;
    mutex_t alloc_ptr_data_lock;

private:
    // reference count for the entry
    long ref_count;
};

typedef std::list<PtrData*> PtrDataList;

class PtrDataTable {
public:
    typedef std::set<PtrData> PtrSet;

    PtrData* find_ptr_data(const void *ptr) {
        m_ptr_lock.lock();
        PtrSet::iterator res = list.find(PtrData(ptr, 0));

        m_ptr_lock.unlock();
        if (res == list.end()) {
            return 0;
        }
        return const_cast<PtrData*>(res.operator->());
    }

    PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
        m_ptr_lock.lock();
        std::pair<PtrSet::iterator, bool> res =
            list.insert(PtrData(ptr, len));

        PtrData* ptr_data = const_cast<PtrData*>(res.first.operator->());
        m_ptr_lock.unlock();

        is_new = res.second;
        if (is_new) {
            // Take the lock as soon as possible.  The caller of
            // insert_ptr_data must unlock it in its is_new branch.
            ptr_data->alloc_ptr_data_lock.lock();
        }
        return ptr_data;
    }

    void remove_ptr_data(const void *ptr) {
        m_ptr_lock.lock();
        list.erase(PtrData(ptr, 0));
        m_ptr_lock.unlock();
    }

private:
    PtrSet  list;
    mutex_t m_ptr_lock;
};
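
// Illustrative note (not part of the original header): because
// PtrData::operator< treats overlapping CPU ranges as equivalent, probing the
// set with a zero-length PtrData locates the entry whose range contains that
// address.  A hypothetical lookup could read:
//
//     PtrDataTable table;
//     bool is_new;
//     char buf[4096];
//     PtrData *entry = table.insert_ptr_data(buf, sizeof(buf), is_new);
//     if (is_new) {
//         // ... create the COI buffers for the new entry ...
//         entry->alloc_ptr_data_lock.unlock();   // caller releases the lock
//     }
//     // Any address inside the registered range maps back to the same entry.
//     assert(table.find_ptr_data(buf + 100) == entry);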

// Data associated with automatic variable
class AutoData {
public:
    AutoData(const void *addr, uint64_t len) :
        cpu_addr(addr, len), ref_count(0)
    {}

    bool operator<(const AutoData &o) const {
        // Variables are sorted by the CPU start address.
        // Overlapping memory ranges are considered equal.
        return (cpu_addr.start() < o.cpu_addr.start()) &&
               !cpu_addr.overlaps(o.cpu_addr);
    }

    long add_reference() {
#ifndef TARGET_WINNT
        return __sync_fetch_and_add(&ref_count, 1);
#else // TARGET_WINNT
        return _InterlockedIncrement(&ref_count) - 1;
#endif // TARGET_WINNT
    }

    long remove_reference() {
#ifndef TARGET_WINNT
        return __sync_sub_and_fetch(&ref_count, 1);
#else // TARGET_WINNT
        return _InterlockedDecrement(&ref_count);
#endif // TARGET_WINNT
    }

    long nullify_reference() {
#ifndef TARGET_WINNT
        return __sync_lock_test_and_set(&ref_count, 0);
#else // TARGET_WINNT
        return _InterlockedExchange(&ref_count, 0);
#endif // TARGET_WINNT
    }

    long get_reference() const {
        return ref_count;
    }

public:
    // CPU address range
    const MemRange cpu_addr;

private:
    // reference count for the entry
    long ref_count;
};

// Set of automatic variables
typedef std::set<AutoData> AutoSet;

// Target image data
struct TargetImage
{
    TargetImage(const char *_name, const void *_data, uint64_t _size,
                const char *_origin, uint64_t _offset) :
        name(_name), data(_data), size(_size),
        origin(_origin), offset(_offset)
    {}

    // library name
    const char* name;

    // contents and size
    const void* data;
    uint64_t    size;

    // file of origin and offset within that file
    const char* origin;
    uint64_t    offset;
};

typedef std::list<TargetImage> TargetImageList;

// Dynamic library and the image associated with it
struct DynLib
{
    DynLib(const char *_name, const void *_data,
           COILIBRARY _lib) :
        name(_name), data(_data), lib(_lib)
    {}

    // library name
    const char* name;

    // contents
    const void* data;

    COILIBRARY lib;
};

typedef std::list<DynLib> DynLibList;

// Data associated with persistent auto objects
struct PersistData
{
    PersistData(const void *addr, uint64_t routine_num,
                uint64_t size, uint64_t thread) :
        stack_cpu_addr(addr), routine_id(routine_num), thread_id(thread)
    {
        stack_ptr_data = new PtrData(0, size);
    }

    // 1st key value - beginning of the stack at CPU
    const void *stack_cpu_addr;
    // 2nd key value - identifier of routine invocation at CPU
    uint64_t routine_id;
    // 3rd key value - thread identifier
    uint64_t thread_id;

    // corresponding PtrData; only stack_ptr_data->mic_buf is used
    PtrData *stack_ptr_data;
    // used to get the offset of a variable in the stack buffer
    char *cpu_stack_addr;
};

typedef std::list<PersistData> PersistDataList;

// Data associated with a stream
struct Stream
{
    Stream(int device, int num_of_cpus) :
        m_number_of_cpus(num_of_cpus), m_pipeline(0), m_last_offload(0),
        m_device(device)
    {}

    ~Stream() {
        if (m_pipeline) {
            COI::PipelineDestroy(m_pipeline);
        }
    }

    COIPIPELINE get_pipeline(void) {
        return m_pipeline;
    }

    int get_device(void) {
        return m_device;
    }

    int get_cpu_number(void) {
        return m_number_of_cpus;
    }

    void set_pipeline(COIPIPELINE pipeline) {
        m_pipeline = pipeline;
    }

    OffloadDescriptor* get_last_offload(void) {
        return m_last_offload;
    }

    void set_last_offload(OffloadDescriptor* last_offload) {
        m_last_offload = last_offload;
    }

    static Stream* find_stream(uint64_t handle, bool remove);

    static _Offload_stream add_stream(int device, int number_of_cpus) {
        _Offload_stream result;
        m_stream_lock.lock();
        result = ++m_streams_count;
        all_streams[m_streams_count] = new Stream(device, number_of_cpus);
        m_stream_lock.unlock();
        return result;
    }

    static uint64_t get_streams_count() {
        return m_streams_count;
    }

    typedef std::map<uint64_t, Stream*> StreamMap;

    static uint64_t  m_streams_count;
    static StreamMap all_streams;
    static mutex_t   m_stream_lock;

    int m_device;

    // number of cpus
    int m_number_of_cpus;

    // The pipeline associated with the stream
    COIPIPELINE m_pipeline;

    // The last offload that occurred via the stream
    OffloadDescriptor* m_last_offload;

    // Cpus used by the stream
    std::bitset<COI_MAX_HW_THREADS> m_stream_cpus;
};
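
// Illustrative note (not part of the original header): a stream handle is
// simply the counter value handed out by add_stream, which also serves as the
// key into the static all_streams map.  A hypothetical caller could do:
//
//     _Offload_stream handle = Stream::add_stream(/*device=*/0,
//                                                 /*number_of_cpus=*/4);
//     Stream *stream = Stream::find_stream(handle, /*remove=*/false);
//     if (stream != 0 && stream->get_pipeline() == 0) {
//         // m_pipeline starts out as 0; it is filled in later via
//         // set_pipeline() once a COI pipeline exists for the stream.
//     }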

typedef std::map<uint64_t, Stream*> StreamMap;
typedef std::bitset<COI_MAX_HW_THREADS> micLcpuMask;

// Doubly linked list of cpus used by streams, ordered by use count
typedef struct CpuEl {
    uint64_t count;      // number of streams using the cpu
    struct CpuEl* prev;  // cpu with the same or lesser count
    struct CpuEl* next;  // cpu with the same or greater count
} CpuEl;

// class representing a single engine
struct Engine {
    friend void __offload_init_library_once(void);
    friend void __offload_fini_library(void);

#define CPU_INDEX(x) (x - m_cpus)
#define check_result(res, tag, ...) \
    { \
        if (res == COI_PROCESS_DIED) { \
            fini_process(true); \
            exit(1); \
        } \
        if (res != COI_SUCCESS) { \
            __liboffload_error_support(tag, __VA_ARGS__); \
            exit(1); \
        } \
    }

    int get_logical_index() const {
        return m_index;
    }

    int get_physical_index() const {
        return m_physical_index;
    }

    const COIPROCESS& get_process() const {
        return m_process;
    }

    bool get_ready() {
        return m_ready;
    }

    uint64_t get_thread_id(void);

    // initialize device
    void init(void);

    // unload library
    void unload_library(const void *data, const char *name);

    // add new library
    void add_lib(const TargetImage &lib)
    {
        m_lock.lock();
        m_ready = false;
        m_images.push_back(lib);
        m_lock.unlock();
    }

    COIRESULT compute(
        _Offload_stream stream,
        const std::list<COIBUFFER> &buffers,
        const void* data,
        uint16_t data_size,
        void* ret,
        uint16_t ret_size,
        uint32_t num_deps,
        const COIEVENT* deps,
        COIEVENT* event
    );

#ifdef MYO_SUPPORT
    // temporary workaround for blocking behavior of myoiLibInit/Fini calls
    void init_myo(COIEVENT *event) {
        COIRESULT res;
        res = COI::PipelineRunFunction(get_pipeline(),
                                       m_funcs[c_func_myo_init],
                                       0, 0, 0, 0, 0, 0, 0, 0, 0,
                                       event);
        check_result(res, c_pipeline_run_func, m_index, res);
    }

    void fini_myo(COIEVENT *event) {
        COIRESULT res;
        res = COI::PipelineRunFunction(get_pipeline(),
                                       m_funcs[c_func_myo_fini],
                                       0, 0, 0, 0, 0, 0, 0, 0, 0,
                                       event);
        check_result(res, c_pipeline_run_func, m_index, res);
    }
#endif // MYO_SUPPORT

    //
    // Memory association table
    //
    PtrData* find_ptr_data(const void *ptr) {
        return m_ptr_set.find_ptr_data(ptr);
    }

    PtrData* find_targetptr_data(const void *ptr) {
        return m_targetptr_set.find_ptr_data(ptr);
    }

    PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
        return m_ptr_set.insert_ptr_data(ptr, len, is_new);
    }

    PtrData* insert_targetptr_data(const void *ptr, uint64_t len,
                                   bool &is_new) {
        return m_targetptr_set.insert_ptr_data(ptr, len, is_new);
    }

    void remove_ptr_data(const void *ptr) {
        m_ptr_set.remove_ptr_data(ptr);
    }

    void remove_targetptr_data(const void *ptr) {
        m_targetptr_set.remove_ptr_data(ptr);
    }

    //
    // Automatic variables
    //
    AutoData* find_auto_data(const void *ptr) {
        AutoSet &auto_vars = get_auto_vars();
        AutoSet::iterator res = auto_vars.find(AutoData(ptr, 0));
        if (res == auto_vars.end()) {
            return 0;
        }
        return const_cast<AutoData*>(res.operator->());
    }

    AutoData* insert_auto_data(const void *ptr, uint64_t len) {
        AutoSet &auto_vars = get_auto_vars();
        std::pair<AutoSet::iterator, bool> res =
            auto_vars.insert(AutoData(ptr, len));
        return const_cast<AutoData*>(res.first.operator->());
    }

    void remove_auto_data(const void *ptr) {
        get_auto_vars().erase(AutoData(ptr, 0));
    }

    //
    // Signals
    //
    void add_signal(const void *signal, OffloadDescriptor *desc) {
        m_signal_lock.lock();
        m_signal_map[signal] = desc;
        m_signal_lock.unlock();
    }

    OffloadDescriptor* find_signal(const void *signal, bool remove) {
        OffloadDescriptor *desc = 0;

        m_signal_lock.lock();
        {
            SignalMap::iterator it = m_signal_map.find(signal);
            if (it != m_signal_map.end()) {
                desc = it->second;
                if (remove) {
                    it->second = SIGNAL_HAS_COMPLETED;
                }
            }
        }
        m_signal_lock.unlock();

        return desc;
    }

    void complete_signaled_ofld(const void *signal) {
        m_signal_lock.lock();
        {
            SignalMap::iterator it = m_signal_map.find(signal);
            if (it != m_signal_map.end()) {
                it->second = SIGNAL_HAS_COMPLETED;
            }
        }
        m_signal_lock.unlock();
    }
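
    // Illustrative note (not part of the original header): find_signal
    // distinguishes three states for a signal address - not registered
    // (returns 0), still in flight (returns the owning OffloadDescriptor),
    // and already finished (returns the SIGNAL_HAS_COMPLETED sentinel,
    // i.e. (OffloadDescriptor *)-1).  Given a reference to an engine, a
    // hypothetical wait path might check:
    //
    //     OffloadDescriptor *desc = engine.find_signal(signal, false);
    //     if (desc == 0) {
    //         // unknown signal
    //     }
    //     else if (desc == SIGNAL_HAS_COMPLETED) {
    //         // nothing left to wait for
    //     }
    //     else {
    //         // wait for the offload described by desc
    //     }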

    void stream_destroy(_Offload_stream handle);

    void move_cpu_el_after(CpuEl* cpu_what, CpuEl* cpu_after);
    void print_stream_cpu_list(const char *);

    COIPIPELINE get_pipeline(_Offload_stream stream);

    StreamMap get_stream_map() {
        return m_stream_map;
    }

    // stop device process
    void fini_process(bool verbose);

    // list of stacks active at the engine
    PersistDataList m_persist_list;

private:
    Engine() : m_index(-1), m_physical_index(-1), m_process(0), m_ready(false),
               m_proc_number(0), m_assigned_cpus(0), m_cpus(0), m_cpu_head(0)
    {}

    ~Engine() {
        m_ready = false;
        for (StreamMap::iterator it = m_stream_map.begin();
             it != m_stream_map.end(); it++) {
            Stream *stream = it->second;
            delete stream;
        }
        if (m_process != 0) {
            fini_process(false);
        }
        if (m_assigned_cpus) {
            delete m_assigned_cpus;
        }
    }

    // set indexes
    void set_indexes(int logical_index, int physical_index) {
        m_index = logical_index;
        m_physical_index = physical_index;
    }

    // set CPU mask
    void set_cpu_mask(micLcpuMask *cpu_mask)
    {
        m_assigned_cpus = cpu_mask;
    }

    // start process on device
    void init_process();

    void load_libraries(void);
    void init_ptr_data(void);

    // performs library initialization on the device side
    pid_t init_device(void);

private:
    // get pipeline associated with the calling thread
    COIPIPELINE get_pipeline(void);

    // get automatic vars set associated with the calling thread
    AutoSet& get_auto_vars(void);

    // destructor for thread data
    static void destroy_thread_data(void *data);

private:
    typedef std::set<PtrData> PtrSet;
    typedef std::map<const void*, OffloadDescriptor*> SignalMap;

    // device indexes
    int m_index;
    int m_physical_index;

    // cpu mask
    micLcpuMask *m_assigned_cpus;

    // number of COI pipes created for the engine
    long m_proc_number;

    // process handle
    COIPROCESS m_process;

    // If false, device either has not been initialized or new libraries
    // have been added.
    bool    m_ready;
    mutex_t m_lock;

    // List of libraries to be loaded
    TargetImageList m_images;

    // var tables
    PtrDataTable m_ptr_set;
    PtrDataTable m_targetptr_set;

    // signals
    SignalMap m_signal_map;
    mutex_t   m_signal_lock;

    // streams
    StreamMap m_stream_map;
    mutex_t   m_stream_lock;
    int       m_num_cores;
    int       m_num_threads;
    CpuEl*    m_cpus;
    CpuEl*    m_cpu_head;

    // List of dynamic libraries to be registered
    DynLibList m_dyn_libs;

    // constants for accessing device function handles
    enum {
        c_func_compute = 0,
#ifdef MYO_SUPPORT
        c_func_myo_init,
        c_func_myo_fini,
#endif // MYO_SUPPORT
        c_func_init,
        c_func_var_table_size,
        c_func_var_table_copy,
        c_func_set_stream_affinity,
        c_funcs_total
    };
    static const char* m_func_names[c_funcs_total];

    // device function handles
    COIFUNCTION m_funcs[c_funcs_total];

    // int -> name mapping for device signals
    static const int   c_signal_max = 32;
    static const char* c_signal_names[c_signal_max];
};

#endif // OFFLOAD_ENGINE_H_INCLUDED