dev-hsa: enable interruptible hsa signal support
src/gpu-compute/gpu_command_processor.cc
/*
 * Copyright (c) 2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

#include "gpu-compute/gpu_command_processor.hh"

#include "debug/GPUCommandProc.hh"
#include "debug/GPUKernelInfo.hh"
#include "gpu-compute/dispatcher.hh"
#include "params/GPUCommandProcessor.hh"
#include "sim/process.hh"
#include "sim/proxy_ptr.hh"
#include "sim/syscall_emul_buf.hh"

GPUCommandProcessor::GPUCommandProcessor(const Params &p)
    : HSADevice(p), dispatcher(*p.dispatcher)
{
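    // Give the dispatcher a back-pointer to this command processor so it
    // can call back into the CP (e.g., to update completion signals and
    // retire AQL packets) once the kernels it dispatches have finished.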
    dispatcher.setCommandProcessor(this);
}

/**
 * submitDispatchPkt() is the entry point into the CP from the HSAPP
 * and is only meant to be used with AQL kernel dispatch packets.
 * After the HSAPP receives and extracts an AQL packet, it sends
 * it to the CP, which is responsible for gathering all relevant
 * information about a task, initializing CU state, and sending
 * it to the dispatcher for WG creation and dispatch.
 *
 * First we need to capture all information from the AQL pkt and
 * the code object, then store it in an HSAQueueEntry. Once the
 * packet and code are extracted, we extract information from the
 * queue descriptor that the CP needs to perform state initialization
 * on the CU. Finally we call dispatch() to send the task to the
 * dispatcher. When the task completely finishes, we call finishPkt()
 * on the HSA packet processor in order to remove the packet from the
 * queue, and notify the runtime that the task has completed.
 */
void
GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                       Addr host_pkt_addr)
{
    static int dynamic_task_id = 0;
    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;

    /**
     * We need to read a pointer in the application's address
     * space to pull out the kernel code descriptor.
     */
    auto *tc = sys->threads[0];
    auto &virt_proxy = tc->getVirtProxy();

    /**
     * The kernel_object is a pointer to the machine code, whose entry
     * point is an 'amd_kernel_code_t' type, which is included in the
     * kernel binary, and describes various aspects of the kernel. The
     * desired entry is the 'kernel_code_entry_byte_offset' field,
     * which provides the byte offset (positive or negative) from the
     * address of the amd_kernel_code_t to the start of the machine
     * instructions.
     */
    AMDKernelCode akc;
    virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
        sizeof(AMDKernelCode));

    DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
        "kernel object\n", akc.kernel_code_entry_byte_offset);

    DPRINTF(GPUCommandProc, "GPUCommandProc: Sending dispatch pkt to %lu\n",
        (uint64_t)tc->cpuId());

    Addr machine_code_addr = (Addr)disp_pkt->kernel_object
        + akc.kernel_code_entry_byte_offset;

    DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
        machine_code_addr);

    Addr kern_name_addr(0);
    std::string kernel_name;

    /**
     * BLIT kernels don't have symbol names. BLIT kernels are built-in compute
     * kernels issued by ROCm to handle DMAs for dGPUs when the SDMA
     * hardware engines are unavailable or explicitly disabled. They can also
     * be used to do copies that ROCm thinks would be better performed
     * by the shader than the SDMA engines. They are also sometimes used on
     * APUs to implement asynchronous memcopy operations between two pointers
     * in host memory. I have no idea what BLIT stands for.
     */
    if (akc.runtime_loader_kernel_symbol) {
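        /**
         * The runtime_loader_kernel_symbol points at the ROCm loader's
         * internal symbol object for this kernel. The fixed offsets used
         * below (an 8-byte name pointer read at offset 0x10 into that
         * object) reflect the loader's internal layout as assumed by this
         * model; they are not part of a documented ABI.
         */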
        virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10,
            (uint8_t*)&kern_name_addr, 0x8);

        virt_proxy.readString(kernel_name, kern_name_addr);
    } else {
        kernel_name = "Blit kernel";
    }

    DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());

    HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
        dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr);

    DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
        "grid size (%dx%dx%d) kernarg addr: %#x, completion "
        "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
        disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
        disp_pkt->grid_size_x, disp_pkt->grid_size_y,
        disp_pkt->grid_size_z, disp_pkt->kernarg_address,
        disp_pkt->completion_signal);

    DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
        "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
        "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
        task->numScalarRegs(), task->codeAddr(), 0, 0);

    initABI(task);
    ++dynamic_task_id;
}

uint64_t
GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
{
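    // Functional (non-timing) read of the 64-bit signal value through the
    // thread context's virtual proxy; used when the CP needs the current
    // value without modeling the memory access.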
    Addr value_addr = getHsaSignalValueAddr(signal_handle);
    auto tc = system()->threads[0];
    ConstVPtr<Addr> prev_value(value_addr, tc);
    return *prev_value;
}

void
GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value)
{
    // The signal value is stored 8 bytes from the start of
    // the signal handle in the runtime
    Addr value_addr = getHsaSignalValueAddr(signal_handle);
    Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
    Addr event_addr = getHsaSignalEventAddr(signal_handle);
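    /**
     * The three addresses above are derived from the signal handle at fixed
     * offsets by the getHsaSignal*Addr() helpers. Roughly, they follow the
     * runtime's amd_signal_t layout assumed by this model:
     *
     *   signal_handle + 0x00 : kind
     *   signal_handle + 0x08 : value             (value_addr)
     *   signal_handle + 0x10 : event_mailbox_ptr (mailbox_addr)
     *   signal_handle + 0x18 : event_id          (event_addr)
     */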
    DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);

    Addr *new_signal = new Addr;
    *new_signal = signal_value;

    dmaWriteVirt(value_addr, sizeof(Addr), nullptr, new_signal, 0);
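    // Note: dmaWriteVirt() completes asynchronously, so the heap-allocated
    // copy of the signal value above must outlive this call; it is not
    // freed here.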

    auto tc = system()->threads[0];
    ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc);

    // Notifying an event with its mailbox pointer is
    // not supported in the current implementation. Just use
    // the mailbox pointer to distinguish between interruptible
    // and default signals. An interruptible signal will have
    // a valid mailbox pointer.
    if (*mailbox_ptr != 0) {
        // This is an interruptible signal. Now, read the
        // event ID and directly communicate with the driver
        // about that event notification.
        ConstVPtr<uint32_t> event_val(event_addr, tc);

        DPRINTF(GPUCommandProc, "Calling signal wakeup event on "
            "signal event value %d\n", *event_val);
        signalWakeupEvent(*event_val);
    }
}

void
GPUCommandProcessor::attachDriver(HSADriver *hsa_driver)
{
    fatal_if(driver, "Should not overwrite driver.");
    driver = hsa_driver;
}

/**
 * submitVendorPkt() is for accepting vendor-specific packets from
 * the HSAPP. Vendor-specific packets may be used by the runtime to
 * send commands to the HSA device that are specific to a particular
 * vendor. The vendor-specific packets should be defined by the vendor
 * in the runtime.
 */

/**
 * TODO: For now we simply tell the HSAPP to finish the packet,
 * however a future patch will update this method to provide
 * the proper handling of any required vendor-specific packets.
 * In the version of ROCm that is currently supported (1.6)
 * the runtime will send packets that direct the CP to
 * invalidate the GPU's caches. We do this automatically on
 * each kernel launch in the CU, so this is safe for now.
 */
void
GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
                                     Addr host_pkt_addr)
{
    hsaPP->finishPkt(raw_pkt, queue_id);
}

/**
 * submitAgentDispatchPkt() is for accepting agent dispatch packets.
 * These packets will control the dispatch of WGs on the device, and inform
 * the host when a specified number of WGs have been executed on the device.
 *
 * For now it simply finishes the pkt.
 */
void
GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                            Addr host_pkt_addr)
{
    // Parse the packet to see what it wants us to do
    _hsa_agent_dispatch_packet_t *agent_pkt =
        (_hsa_agent_dispatch_packet_t *)raw_pkt;

    if (agent_pkt->type == AgentCmd::Nop) {
        DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n");
    } else if (agent_pkt->type == AgentCmd::Steal) {
        // This is where we steal the HSA task's completion signal
        int kid = agent_pkt->arg[0];
        DPRINTF(GPUCommandProc,
            "Agent Dispatch Packet Stealing signal handle for kernel %d\n",
            kid);

        HSAQueueEntry *task = dispatcher.hsaTask(kid);
        uint64_t signal_addr = task->completionSignal();

        uint64_t return_address = agent_pkt->return_address;
        DPRINTF(GPUCommandProc, "Return Addr: %p\n", return_address);

        Addr *new_signal_addr = new Addr;
        *new_signal_addr = (Addr)signal_addr;
        dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0);

        DPRINTF(GPUCommandProc,
            "Agent Dispatch Packet Stealing signal handle from kid %d: "
            "(%x:%x) writing into %x\n",
            kid, signal_addr, new_signal_addr, return_address);
    } else {
        panic("The agent dispatch packet provided an unknown argument in "
            "arg[0]; currently only 0 (nop) or 1 (return kernel signal) is "
            "accepted");
    }

    hsaPP->finishPkt(raw_pkt, queue_id);
}

/**
 * Once the CP has finished extracting all relevant information about
 * a task and has initialized the ABI state, we send a description of
 * the task to the dispatcher. The dispatcher will create and dispatch
 * WGs to the CUs.
 */
void
GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
{
    dispatcher.dispatch(task);
}

void
GPUCommandProcessor::signalWakeupEvent(uint32_t event_id)
{
    driver->signalWakeupEvent(event_id);
}

/**
 * The CP is responsible for traversing all HSA-ABI-related data
 * structures in memory and initializing the ABI state.
 * Information provided by the MQD, AQL packet, and code object
 * metadata will be used to initialize register file state.
 */
void
GPUCommandProcessor::initABI(HSAQueueEntry *task)
{
    auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task);

    Addr hostReadIdxPtr
        = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;

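    /**
     * The field being fetched below appears to be the dispatch-id offset
     * stored immediately after the read dispatch index in the queue's
     * descriptor (hence the sizeof(hostReadIdxPtr) displacement). Once the
     * DMA read completes, the ReadDispIdOffsetDmaEvent callback continues
     * ABI initialization using the fetched value.
     */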
    dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
        sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent,
        &readDispIdOffEvent->readDispIdOffset);
}

System*
GPUCommandProcessor::system()
{
    return sys;
}

AddrRangeList
GPUCommandProcessor::getAddrRanges() const
{
    AddrRangeList ranges;
    return ranges;
}

void
GPUCommandProcessor::setShader(Shader *shader)
{
    _shader = shader;
}

Shader*
GPUCommandProcessor::shader()
{
    return _shader;
}