gpu-compute, arch-gcn3: refactor barriers
[gem5.git] / src / gpu-compute / kernel_code.hh
1 /*
2 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Authors: Anthony Gutierrez
34 */
35
36 #ifndef __GPU_COMPUTE_KERNEL_CODE_HH__
37 #define __GPU_COMPUTE_KERNEL_CODE_HH__
38
39 #include <bitset>
40 #include <cstdint>
41
42 /**
43 * these enums represent the indices into the
44 * initialRegState bitfields in HsaKernelInfo.
45 * each bit specifies whether or not the
46 * particular piece of state that the bit
47 * corresponds to should be initialized into
48 * the VGPRs/SGPRs. the order in which the
49 * fields are placed matters, as all enabled
50 * pieces of state will be initialized into
51 * contiguous registers in the same order
52 * as their position in the bitfield - which
53 * is specified in the HSA ABI.
54 */
// Bit indices for the scalar-register part of the initial register state.
// The order here is the register-initialization order, not the position
// of the matching enable bit inside AMDKernelCode: PrivSegWaveByteOffset
// is last in this enum, while enable_sgpr_private_segment_wave_byte_offset
// is the first bit-field of COMPUTE_PGM_RSRC2.
55 enum ScalarRegInitFields : int
56 {
// 0-9: names parallel, in the same order, the first ten enable_sgpr_*
// bit-fields of KERNEL_CODE_PROPERTIES in AMDKernelCode.
57 PrivateSegBuf = 0,
58 DispatchPtr = 1,
59 QueuePtr = 2,
60 KernargSegPtr = 3,
61 DispatchId = 4,
62 FlatScratchInit = 5,
63 PrivateSegSize = 6,
64 GridWorkgroupCountX = 7,
65 GridWorkgroupCountY = 8,
66 GridWorkgroupCountZ = 9,
// 10-13: names parallel enable_sgpr_workgroup_id_{x,y,z} and
// enable_sgpr_workgroup_info in COMPUTE_PGM_RSRC2.
67 WorkgroupIdX = 10,
68 WorkgroupIdY = 11,
69 WorkgroupIdZ = 12,
70 WorkgroupInfo = 13,
// 14: name parallels enable_sgpr_private_segment_wave_byte_offset.
71 PrivSegWaveByteOffset = 14,
// Equals the number of fields listed above (indices 0-14); it is a
// count, not a field index.
// NOTE(review): presumably used to size the scalar initialRegState
// bitfield in HsaKernelInfo - confirm against that definition.
72 NumScalarInitFields = 15
73 };
74
// Bit indices for the vector-register part of the initial register state,
// one per work-item id dimension.
// AMDKernelCode declares enable_vgpr_workitem_id_y and
// enable_vgpr_workitem_id_z but no corresponding bit for X.
// NOTE(review): presumably the X id is always initialized - confirm
// against the code that fills in the initial register state.
75 enum VectorRegInitFields : int
76 {
77 WorkitemIdX = 0,
78 WorkitemIdY = 1,
79 WorkitemIdZ = 2,
// Equals the number of fields listed above (indices 0-2); it is a count,
// not a field index.
80 NumVectorInitFields = 3
81 };
82
// Field-by-field description of the AMD kernel code object header.
// The struct declares only data members; their order and widths define
// the layout, so members must not be reordered, resized or removed.
// The three bit-field groups below are each declared on uint32_t and
// their widths each add up to exactly 32 bits.
83 struct AMDKernelCode
84 {
// Code-object version, target machine identification, and the
// entry/prefetch/scratch offsets and sizes (all in bytes, per the names).
85 uint32_t amd_kernel_code_version_major;
86 uint32_t amd_kernel_code_version_minor;
87 uint16_t amd_machine_kind;
88 uint16_t amd_machine_version_major;
89 uint16_t amd_machine_version_minor;
90 uint16_t amd_machine_version_stepping;
91 int64_t kernel_code_entry_byte_offset;
92 int64_t kernel_code_prefetch_byte_offset;
93 uint64_t kernel_code_prefetch_byte_size;
94 uint64_t max_scratch_backing_memory_byte_size;
95
96 /**
97 * The fields below are used to set program settings for
98 * compute shaders. Here they are primarily used to setup
99 * initial register state. See the following for full details
100 * about kernel launch, state initialization, and the AMD kernel
101 * code object: https://github.com/RadeonOpenCompute/ROCm_Documentation/
102 * blob/master/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst
103 * #initial-kernel-register-state
104 */
105
106 // the 32b below here represent the fields of
107 // the COMPUTE_PGM_RSRC1 register
// width check: 6+4+2+2+2+2+2+1+1+1+1+1+1+6 = 32 bits
108 uint32_t granulated_workitem_vgpr_count : 6;
109 uint32_t granulated_wavefront_sgpr_count : 4;
110 uint32_t priority : 2;
111 uint32_t float_mode_round_32 : 2;
112 uint32_t float_mode_round_16_64 : 2;
113 uint32_t float_mode_denorm_32 : 2;
114 uint32_t float_mode_denorm_16_64 : 2;
115 uint32_t priv : 1;
116 uint32_t enable_dx10_clamp : 1;
117 uint32_t debug_mode : 1;
118 uint32_t enable_ieee_mode : 1;
119 uint32_t bulky : 1;
120 uint32_t cdbg_user : 1;
121 uint32_t compute_pgm_rsrc1_reserved : 6;
122 // end COMPUTE_PGM_RSRC1 register
123
124 // the 32b below here represent the fields of
125 // the COMPUTE_PGM_RSRC2 register
// width check: 1+5+(9 x 1)+9+(7 x 1)+1 = 32 bits
// The enable_sgpr_* / enable_vgpr_* bits here share their names with
// entries of ScalarRegInitFields / VectorRegInitFields, but their bit
// positions in this word differ from those enum values.
126 uint32_t enable_sgpr_private_segment_wave_byte_offset : 1;
127 uint32_t user_sgpr_count : 5;
128 uint32_t enable_trap_handler : 1;
129 uint32_t enable_sgpr_workgroup_id_x : 1;
130 uint32_t enable_sgpr_workgroup_id_y : 1;
131 uint32_t enable_sgpr_workgroup_id_z : 1;
132 uint32_t enable_sgpr_workgroup_info : 1;
133 uint32_t enable_vgpr_workitem_id_y : 1;
134 uint32_t enable_vgpr_workitem_id_z : 1;
135 uint32_t enable_exception_address_watch : 1;
136 uint32_t enable_exception_memory_violation : 1;
137 uint32_t granulated_lds_size : 9;
138 uint32_t enable_exception_ieee_754_fp_invalid_operation : 1;
139 uint32_t enable_exception_fp_denormal_source : 1;
140 uint32_t enable_exception_ieee_754_fp_division_by_zero : 1;
141 uint32_t enable_exception_ieee_754_fp_overflow : 1;
142 uint32_t enable_exception_ieee_754_fp_underflow : 1;
143 uint32_t enable_exception_ieee_754_fp_inexact : 1;
144 uint32_t enable_exception_int_divide_by_zero : 1;
145 uint32_t compute_pgm_rsrc2_reserved : 1;
146 // end COMPUTE_PGM_RSRC2
147
148 // the 32b below here represent the fields of
149 // KERNEL_CODE_PROPERTIES
// width check: (10 x 1)+6+1+2+1+1+1+1+9 = 32 bits
// The ten leading enable_sgpr_* bits are declared in the same order as
// entries 0-9 of ScalarRegInitFields.
150 uint32_t enable_sgpr_private_segment_buffer : 1;
151 uint32_t enable_sgpr_dispatch_ptr : 1;
152 uint32_t enable_sgpr_queue_ptr : 1;
153 uint32_t enable_sgpr_kernarg_segment_ptr : 1;
154 uint32_t enable_sgpr_dispatch_id : 1;
155 uint32_t enable_sgpr_flat_scratch_init : 1;
156 uint32_t enable_sgpr_private_segment_size : 1;
157 uint32_t enable_sgpr_grid_workgroup_count_x : 1;
158 uint32_t enable_sgpr_grid_workgroup_count_y : 1;
159 uint32_t enable_sgpr_grid_workgroup_count_z : 1;
160 uint32_t kernel_code_properties_reserved1 : 6;
161 uint32_t enable_ordered_append_gds : 1;
162 uint32_t private_element_size : 2;
163 uint32_t is_ptr64 : 1;
164 uint32_t is_dynamic_callstack : 1;
165 uint32_t is_debug_enabled : 1;
166 uint32_t is_xnack_enabled : 1;
167 uint32_t kernel_code_properties_reserved2 : 9;
168 // end KERNEL_CODE_PROPERTIES
169
// Remaining plain (non-bit-field) members: segment sizes, register
// counts, reserved register ranges, alignments and trailing reserved /
// loader / control-directive storage, as their names indicate.
170 uint32_t workitem_private_segment_byte_size;
171 uint32_t workgroup_group_segment_byte_size;
172 uint32_t gds_segment_byte_size;
173 uint64_t kernarg_segment_byte_size;
174 uint32_t workgroup_fbarrier_count;
175 uint16_t wavefront_sgpr_count;
176 uint16_t workitem_vgpr_count;
177 uint16_t reserved_vgpr_first;
178 uint16_t reserved_vgpr_count;
179 uint16_t reserved_sgpr_first;
180 uint16_t reserved_sgpr_count;
181 uint16_t debug_wavefront_private_segment_offset_sgpr;
182 uint16_t debug_private_segment_buffer_sgpr;
183 uint8_t kernarg_segment_alignment;
184 uint8_t group_segment_alignment;
185 uint8_t private_segment_alignment;
186 uint8_t wavefront_size;
187 int32_t call_convention;
188 uint8_t reserved[12];
189 uint64_t runtime_loader_kernel_symbol;
190 uint64_t control_directives[16];
// NOTE(review): with natural alignment, and assuming each bit-field group
// packs into a single 32-bit unit (bit-field packing is
// implementation-defined), the members above add up to 256 bytes with no
// padding. Consider verifying this with a static_assert on
// sizeof(AMDKernelCode).
191 };
192
193 #endif // __GPU_COMPUTE_KERNEL_CODE_HH__