misc: Delete the now unnecessary create methods.
src/gpu-compute/gpu_command_processor.cc
/*
 * Copyright (c) 2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

#include "gpu-compute/gpu_command_processor.hh"

#include "debug/GPUCommandProc.hh"
#include "debug/GPUKernelInfo.hh"
#include "gpu-compute/dispatcher.hh"
#include "params/GPUCommandProcessor.hh"

GPUCommandProcessor::GPUCommandProcessor(const Params &p)
    : HSADevice(p), dispatcher(*p.dispatcher)
{
    dispatcher.setCommandProcessor(this);
}

/**
 * submitDispatchPkt() is the entry point into the CP from the HSAPP
 * and is only meant to be used with AQL kernel dispatch packets.
 * After the HSAPP receives and extracts an AQL packet, it sends
 * it to the CP, which is responsible for gathering all relevant
 * information about a task, initializing CU state, and sending
 * it to the dispatcher for WG creation and dispatch.
 *
 * First we need to capture all information from the AQL pkt and
 * the code object, then store it in an HSAQueueEntry. Once the
 * packet and code are extracted, we extract information from the
 * queue descriptor that the CP needs to perform state initialization
 * on the CU. Finally we call dispatch() to send the task to the
 * dispatcher. When the task completely finishes, we call finishPkt()
 * on the HSA packet processor in order to remove the packet from the
 * queue, and notify the runtime that the task has completed.
 */
void
GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                       Addr host_pkt_addr)
{
    static int dynamic_task_id = 0;
    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;

    /**
     * We need to read a pointer in the application's address
     * space to pull out the kernel code descriptor.
     */
    auto *tc = sys->threads[0];
    auto &virt_proxy = tc->getVirtProxy();

    /**
     * The kernel_object is a pointer to the machine code, whose entry
     * point is an 'amd_kernel_code_t' type, which is included in the
     * kernel binary, and describes various aspects of the kernel. The
     * desired entry is the 'kernel_code_entry_byte_offset' field,
     * which provides the byte offset (positive or negative) from the
     * address of the amd_kernel_code_t to the start of the machine
     * instructions.
     */
    AMDKernelCode akc;
    virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
                        sizeof(AMDKernelCode));

    DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
            "kernel object\n", akc.kernel_code_entry_byte_offset);

    Addr machine_code_addr = (Addr)disp_pkt->kernel_object
        + akc.kernel_code_entry_byte_offset;

    DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
            machine_code_addr);
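
    /**
     * Worked example with made-up values: if disp_pkt->kernel_object is
     * 0x20000 and akc.kernel_code_entry_byte_offset is 0x100, the first
     * machine instruction sits at 0x20100. A negative offset would place
     * the code before the amd_kernel_code_t header instead.
     */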

    Addr kern_name_addr(0);
    std::string kernel_name;

    /**
     * BLIT kernels don't have symbol names. BLIT kernels are built-in
     * compute kernels issued by ROCm to handle DMAs for dGPUs when the
     * SDMA hardware engines are unavailable or explicitly disabled. They
     * can also be used to do copies that ROCm thinks would be better
     * performed by the shader than by the SDMA engines. They are also
     * sometimes used on APUs to implement asynchronous memcopy operations
     * between two pointers in host memory. The name comes from the classic
     * 'block transfer' (blit) operation.
     */
    if (akc.runtime_loader_kernel_symbol) {
        virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10,
                            (uint8_t*)&kern_name_addr, 0x8);

        virt_proxy.readString(kernel_name, kern_name_addr);
    } else {
        kernel_name = "Blit kernel";
    }
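
    /**
     * Note on the magic numbers above (an assumption documented here, not
     * a spec reference): the read treats the loader's kernel symbol record
     * as holding an 8-byte pointer to the NUL-terminated symbol name at
     * byte offset 0x10 of runtime_loader_kernel_symbol; readString() then
     * follows that pointer to recover the name.
     */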

    DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());

    HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
        dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr);

    DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
            "grid size (%dx%dx%d) kernarg addr: %#x, completion "
            "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
            disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
            disp_pkt->grid_size_x, disp_pkt->grid_size_y,
            disp_pkt->grid_size_z, disp_pkt->kernarg_address,
            disp_pkt->completion_signal);

    DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
            "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
            "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
            task->numScalarRegs(), task->codeAddr(), 0, 0);

    initABI(task);
    ++dynamic_task_id;
}
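
/**
 * For illustration only, a minimal sketch of the AQL fields this method
 * consumes, as a host-side runtime might populate them. Field names are
 * taken from the _hsa_dispatch_packet_t accesses above; the VA variables
 * are hypothetical:
 *
 *   _hsa_dispatch_packet_t pkt = {};
 *   pkt.workgroup_size_x = 256;        // work-items per WG, X dim
 *   pkt.workgroup_size_y = 1;
 *   pkt.workgroup_size_z = 1;
 *   pkt.grid_size_x = 1024;            // total work-items, X dim
 *   pkt.grid_size_y = 1;
 *   pkt.grid_size_z = 1;
 *   pkt.kernel_object = kernel_va;     // VA of the amd_kernel_code_t
 *   pkt.kernarg_address = kernarg_va;  // VA of the kernel argument buffer
 *   pkt.completion_signal = signal_va; // signaled when the task finishes
 */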

/**
 * submitVendorPkt() is for accepting vendor-specific packets from
 * the HSAPP. Vendor-specific packets may be used by the runtime to
 * send commands to the HSA device that are specific to a particular
 * vendor. The vendor-specific packets should be defined by the vendor
 * in the runtime.
 *
 * TODO: For now we simply tell the HSAPP to finish the packet;
 * a future patch will update this method to provide proper handling
 * of any required vendor-specific packets. In the version of ROCm
 * that is currently supported (1.6), the runtime sends packets that
 * direct the CP to invalidate the GPU's caches. We do this
 * automatically on each kernel launch in the CU, so this is safe
 * for now.
 */
void
GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
                                     Addr host_pkt_addr)
{
    hsaPP->finishPkt(raw_pkt, queue_id);
}
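
/**
 * A hypothetical sketch of what the proper handling promised by the TODO
 * above might look like; none of these vendor-packet types, fields, or
 * opcodes exist in gem5 today:
 *
 *   auto *vendor_pkt = (vendor_packet_t*)raw_pkt;  // assumed type
 *   switch (vendor_pkt->op) {                      // assumed opcode field
 *     case VENDOR_OP_INV_CACHES:
 *       // direct the CUs to invalidate their caches
 *       break;
 *     default:
 *       break;
 *   }
 *   hsaPP->finishPkt(raw_pkt, queue_id);
 */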

/**
 * Once the CP has finished extracting all relevant information about
 * a task and has initialized the ABI state, we send a description of
 * the task to the dispatcher. The dispatcher will create and dispatch
 * WGs to the CUs.
 */
void
GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
{
    dispatcher.dispatch(task);
}

/**
 * The CP is responsible for traversing all HSA-ABI-related data
 * structures from memory and initializing the ABI state.
 * Information provided by the MQD, AQL packet, and code object
 * metadata will be used to initialize register file state.
 */
void
GPUCommandProcessor::initABI(HSAQueueEntry *task)
{
    auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task);

    Addr hostReadIdxPtr
        = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;

    dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
        sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent,
        &readDispIdOffEvent->readDispIdOffset);
}
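
/**
 * A note on the DMA above: the offset math assumes the dispatch-ID offset
 * lives one pointer-width past the location hostReadIndexPtr points at.
 * When the read completes, ReadDispIdOffsetDmaEvent's callback runs with
 * readDispIdOffset filled in and carries on with the ABI setup before the
 * task ultimately reaches dispatchPkt() (the continuation itself is
 * defined in the header, not shown here).
 */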

System*
GPUCommandProcessor::system()
{
    return sys;
}

AddrRangeList
GPUCommandProcessor::getAddrRanges() const
{
    AddrRangeList ranges;
    return ranges;
}

void
GPUCommandProcessor::setShader(Shader *shader)
{
    _shader = shader;
}

Shader*
GPUCommandProcessor::shader()
{
    return _shader;
}