/*
 * Copyright (c) 2018 Advanced Micro Devices, Inc.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */
36 #include "gpu-compute/gpu_command_processor.hh"
38 #include "debug/GPUCommandProc.hh"
39 #include "debug/GPUKernelInfo.hh"
40 #include "gpu-compute/dispatcher.hh"
41 #include "params/GPUCommandProcessor.hh"
43 GPUCommandProcessor::GPUCommandProcessor(const Params
&p
)
44 : HSADevice(p
), dispatcher(*p
.dispatcher
)
46 dispatcher
.setCommandProcessor(this);
/**
 * submitDispatchPkt() is the entry point into the CP from the HSAPP
 * and is only meant to be used with AQL kernel dispatch packets.
 * After the HSAPP receives and extracts an AQL packet, it sends
 * it to the CP, which is responsible for gathering all relevant
 * information about a task, initializing CU state, and sending
 * it to the dispatcher for WG creation and dispatch.
 *
 * First we need to capture all information from the AQL pkt and
 * the code object, then store it in an HSAQueueEntry. Once the
 * packet and code are extracted, we extract information from the
 * queue descriptor that the CP needs to perform state initialization
 * on the CU. Finally we call dispatch() to send the task to the
 * dispatcher. When the task completely finishes, we call finishPkt()
 * on the HSA packet processor in order to remove the packet from the
 * queue, and notify the runtime that the task has completed.
 */
67 GPUCommandProcessor::submitDispatchPkt(void *raw_pkt
, uint32_t queue_id
,
70 static int dynamic_task_id
= 0;
71 _hsa_dispatch_packet_t
*disp_pkt
= (_hsa_dispatch_packet_t
*)raw_pkt
;
74 * we need to read a pointer in the application's address
75 * space to pull out the kernel code descriptor.
77 auto *tc
= sys
->threads
[0];
78 auto &virt_proxy
= tc
->getVirtProxy();
81 * The kernel_object is a pointer to the machine code, whose entry
82 * point is an 'amd_kernel_code_t' type, which is included in the
83 * kernel binary, and describes various aspects of the kernel. The
84 * desired entry is the 'kernel_code_entry_byte_offset' field,
85 * which provides the byte offset (positive or negative) from the
86 * address of the amd_kernel_code_t to the start of the machine
90 virt_proxy
.readBlob(disp_pkt
->kernel_object
, (uint8_t*)&akc
,
91 sizeof(AMDKernelCode
));
93 DPRINTF(GPUCommandProc
, "GPU machine code is %lli bytes from start of the "
94 "kernel object\n", akc
.kernel_code_entry_byte_offset
);
96 DPRINTF(GPUCommandProc
,"GPUCommandProc: Sending dispatch pkt to %lu\n",
97 (uint64_t)tc
->cpuId());
100 Addr machine_code_addr
= (Addr
)disp_pkt
->kernel_object
101 + akc
.kernel_code_entry_byte_offset
;
103 DPRINTF(GPUCommandProc
, "Machine code starts at addr: %#x\n",
106 Addr
kern_name_addr(0);
107 std::string kernel_name
;
110 * BLIT kernels don't have symbol names. BLIT kernels are built-in compute
111 * kernels issued by ROCm to handle DMAs for dGPUs when the SDMA
112 * hardware engines are unavailable or explicitly disabled. They can also
113 * be used to do copies that ROCm things would be better performed
114 * by the shader than the SDMA engines. They are also sometimes used on
115 * APUs to implement asynchronous memcopy operations from 2 pointers in
116 * host memory. I have no idea what BLIT stands for.
118 if (akc
.runtime_loader_kernel_symbol
) {
119 virt_proxy
.readBlob(akc
.runtime_loader_kernel_symbol
+ 0x10,
120 (uint8_t*)&kern_name_addr
, 0x8);
122 virt_proxy
.readString(kernel_name
, kern_name_addr
);
124 kernel_name
= "Blit kernel";
127 DPRINTF(GPUKernelInfo
, "Kernel name: %s\n", kernel_name
.c_str());
129 HSAQueueEntry
*task
= new HSAQueueEntry(kernel_name
, queue_id
,
130 dynamic_task_id
, raw_pkt
, &akc
, host_pkt_addr
, machine_code_addr
);
132 DPRINTF(GPUCommandProc
, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
133 "grid size (%dx%dx%d) kernarg addr: %#x, completion "
134 "signal addr:%#x\n", dynamic_task_id
, disp_pkt
->workgroup_size_x
,
135 disp_pkt
->workgroup_size_y
, disp_pkt
->workgroup_size_z
,
136 disp_pkt
->grid_size_x
, disp_pkt
->grid_size_y
,
137 disp_pkt
->grid_size_z
, disp_pkt
->kernarg_address
,
138 disp_pkt
->completion_signal
);
140 DPRINTF(GPUCommandProc
, "Extracted code object: %s (num vector regs: %d, "
141 "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
142 "LDS size: %d)\n", kernel_name
, task
->numVectorRegs(),
143 task
->numScalarRegs(), task
->codeAddr(), 0, 0);
/**
 * submitVendorPkt() is for accepting vendor-specific packets from
 * the HSAPP. Vendor-specific packets may be used by the runtime to
 * send commands to the HSA device that are specific to a particular
 * vendor. The vendor-specific packets should be defined by the vendor.
 *
 * TODO: For now we simply tell the HSAPP to finish the packet,
 * however a future patch will update this method to provide
 * the proper handling of any required vendor-specific packets.
 * In the version of ROCm that is currently supported (1.6)
 * the runtime will send packets that direct the CP to
 * invalidate the GPUs caches. We do this automatically on
 * each kernel launch in the CU, so this is safe for now.
 */
167 GPUCommandProcessor::submitVendorPkt(void *raw_pkt
, uint32_t queue_id
,
170 hsaPP
->finishPkt(raw_pkt
, queue_id
);
/**
 * submitAgentDispatchPkt() is for accepting agent dispatch packets.
 * These packets will control the dispatch of Wg on the device, and inform
 * the host when a specified number of Wg have been executed on the device.
 *
 * For now it simply finishes the pkt.
 */
181 GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt
, uint32_t queue_id
,
184 //Parse the Packet, see what it wants us to do
185 _hsa_agent_dispatch_packet_t
* agent_pkt
=
186 (_hsa_agent_dispatch_packet_t
*)raw_pkt
;
188 if (agent_pkt
->type
== AgentCmd::Nop
) {
189 DPRINTF(GPUCommandProc
, "Agent Dispatch Packet NOP\n");
190 } else if (agent_pkt
->type
== AgentCmd::Steal
) {
191 //This is where we steal the HSA Task's completion signal
192 int kid
= agent_pkt
->arg
[0];
193 DPRINTF(GPUCommandProc
,
194 "Agent Dispatch Packet Stealing signal handle for kernel %d\n",
197 HSAQueueEntry
*task
= dispatcher
.hsaTask(kid
);
198 uint64_t signal_addr
= task
->completionSignal();// + sizeof(uint64_t);
200 uint64_t return_address
= agent_pkt
->return_address
;
201 DPRINTF(GPUCommandProc
, "Return Addr: %p\n",return_address
);
202 //*return_address = signal_addr;
203 Addr
*new_signal_addr
= new Addr
;
204 *new_signal_addr
= (Addr
)signal_addr
;
205 dmaWriteVirt(return_address
, sizeof(Addr
), nullptr, new_signal_addr
, 0);
207 DPRINTF(GPUCommandProc
,
208 "Agent Dispatch Packet Stealing signal handle from kid %d :" \
209 "(%x:%x) writing into %x\n",
210 kid
,signal_addr
,new_signal_addr
,return_address
);
214 panic("The agent dispatch packet provided an unknown argument in" \
215 "arg[0],currently only 0(nop) or 1(return kernel signal) is accepted");
218 hsaPP
->finishPkt(raw_pkt
, queue_id
);
/**
 * Once the CP has finished extracting all relevant information about
 * a task and has initialized the ABI state, we send a description of
 * the task to the dispatcher. The dispatcher will create and dispatch
 * WGs for the task.
 */
228 GPUCommandProcessor::dispatchPkt(HSAQueueEntry
*task
)
230 dispatcher
.dispatch(task
);
/**
 * The CP is responsible for traversing all HSA-ABI-related data
 * structures from memory and initializing the ABI state.
 * Information provided by the MQD, AQL packet, and code object
 * metadata will be used to initialize register file state.
 */
240 GPUCommandProcessor::initABI(HSAQueueEntry
*task
)
242 auto *readDispIdOffEvent
= new ReadDispIdOffsetDmaEvent(*this, task
);
245 = hsaPP
->getQueueDesc(task
->queueId())->hostReadIndexPtr
;
247 dmaReadVirt(hostReadIdxPtr
+ sizeof(hostReadIdxPtr
),
248 sizeof(readDispIdOffEvent
->readDispIdOffset
), readDispIdOffEvent
,
249 &readDispIdOffEvent
->readDispIdOffset
);
253 GPUCommandProcessor::system()
259 GPUCommandProcessor::getAddrRanges() const
261 AddrRangeList ranges
;
266 GPUCommandProcessor::setShader(Shader
*shader
)
272 GPUCommandProcessor::shader()