src/gpu-compute/gpu_command_processor.cc

   1 /*
   2  * Copyright (c) 2018 Advanced Micro Devices, Inc.
   3  * All rights reserved.
   4  *
   5  * For use for simulation and test purposes only
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright notice,
  11  * this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright notice,
  14  * this list of conditions and the following disclaimer in the documentation
  15  * and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the copyright holder nor the names of its
  18  * contributors may be used to endorse or promote products derived from this
  19  * software without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  * Authors: Anthony Gutierrez
  34  */
  35
  36 #include "gpu-compute/gpu_command_processor.hh"
  37
  38 #include "debug/GPUCommandProc.hh"
  39 #include "debug/GPUKernelInfo.hh"
  40 #include "gpu-compute/dispatcher.hh"
  41 #include "params/GPUCommandProcessor.hh"
  42
  43 GPUCommandProcessor::GPUCommandProcessor(const Params &p)
  44     : HSADevice(p), dispatcher(*p.dispatcher)
  45 {
  46     dispatcher.setCommandProcessor(this);
  47 }
  48
  49 /**
  50  * submitDispatchPkt() is the entry point into the CP from the HSAPP
  51  * and is only meant to be used with AQL kernel dispatch packets.
  52  * After the HSAPP receives and extracts an AQL packet, it sends
  53  * it to the CP, which is responsible for gathering all relevant
  54  * information about a task, initializing CU state, and sending
  55  * it to the dispatcher for WG creation and dispatch.
  56  *
  57  * First we need capture all information from the the AQL pkt and
  58  * the code object, then store it in an HSAQueueEntry. Once the
  59  * packet and code are extracted, we extract information from the
  60  * queue descriptor that the CP needs to perform state initialization
  61  * on the CU. Finally we call dispatch() to send the task to the
  62  * dispatcher. When the task completely finishes, we call finishPkt()
  63  * on the HSA packet processor in order to remove the packet from the
  64  * queue, and notify the runtime that the task has completed.
  65  */
  66 void
  67 GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
  68                                        Addr host_pkt_addr)
  69 {
  70     static int dynamic_task_id = 0;
  71     _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
  72
  73     /**
  74      * we need to read a pointer in the application's address
  75      * space to pull out the kernel code descriptor.
  76      */
  77     auto *tc = sys->threads[0];
  78     auto &virt_proxy = tc->getVirtProxy();
  79
  80     /**
  81      * The kernel_object is a pointer to the machine code, whose entry
  82      * point is an 'amd_kernel_code_t' type, which is included in the
  83      * kernel binary, and describes various aspects of the kernel. The
  84      * desired entry is the 'kernel_code_entry_byte_offset' field,
  85      * which provides the byte offset (positive or negative) from the
  86      * address of the amd_kernel_code_t to the start of the machine
  87      * instructions.
  88      */
  89     AMDKernelCode akc;
  90     virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
  91         sizeof(AMDKernelCode));
  92
  93     DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
  94         "kernel object\n", akc.kernel_code_entry_byte_offset);
  95
  96     DPRINTF(GPUCommandProc,"GPUCommandProc: Sending dispatch pkt to %lu\n",
  97         (uint64_t)tc->cpuId());
  98
  99
 100     Addr machine_code_addr = (Addr)disp_pkt->kernel_object
 101         + akc.kernel_code_entry_byte_offset;
 102
 103     DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
 104         machine_code_addr);
 105
 106     Addr kern_name_addr(0);
 107     std::string kernel_name;
 108
 109     /**
 110      * BLIT kernels don't have symbol names.  BLIT kernels are built-in compute
 111      * kernels issued by ROCm to handle DMAs for dGPUs when the SDMA
 112      * hardware engines are unavailable or explicitly disabled.  They can also
 113      * be used to do copies that ROCm things would be better performed
 114      * by the shader than the SDMA engines.  They are also sometimes used on
 115      * APUs to implement asynchronous memcopy operations from 2 pointers in
 116      * host memory.  I have no idea what BLIT stands for.
 117      * */
 118     if (akc.runtime_loader_kernel_symbol) {
 119         virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10,
 120             (uint8_t*)&kern_name_addr, 0x8);
 121
 122         virt_proxy.readString(kernel_name, kern_name_addr);
 123     } else {
 124         kernel_name = "Blit kernel";
 125     }
 126
 127     DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
 128
 129     HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
 130         dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr);
 131
 132     DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
 133         "grid size (%dx%dx%d) kernarg addr: %#x, completion "
 134         "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
 135         disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
 136         disp_pkt->grid_size_x, disp_pkt->grid_size_y,
 137         disp_pkt->grid_size_z, disp_pkt->kernarg_address,
 138         disp_pkt->completion_signal);
 139
 140     DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
 141         "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
 142         "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
 143         task->numScalarRegs(), task->codeAddr(), 0, 0);
 144
 145     initABI(task);
 146     ++dynamic_task_id;
 147 }
 148
 149 /**
 150  * submitVendorPkt() is for accepting vendor-specific packets from
 151  * the HSAPP. Vendor-specific packets may be used by the runtime to
 152  * send commands to the HSA device that are specific to a particular
 153  * vendor. The vendor-specific packets should be defined by the vendor
 154  * in the runtime.
 155  */
 156
 157 /**
 158  * TODO: For now we simply tell the HSAPP to finish the packet,
 159  *       however a future patch will update this method to provide
 160  *       the proper handling of any required vendor-specific packets.
 161  *       In the version of ROCm that is currently supported (1.6)
 162  *       the runtime will send packets that direct the CP to
 163  *       invalidate the GPUs caches. We do this automatically on
 164  *       each kernel launch in the CU, so this is safe for now.
 165  */
 166 void
 167 GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
 168     Addr host_pkt_addr)
 169 {
 170     hsaPP->finishPkt(raw_pkt, queue_id);
 171 }
 172
 173 /**
 174  * submitAgentDispatchPkt() is for accepting agent dispatch packets.
 175  * These packets will control the dispatch of Wg on the device, and inform
 176  * the host when a specified number of Wg have been executed on the device.
 177  *
 178  * For now it simply finishes the pkt.
 179  */
 180 void
 181 GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
 182     Addr host_pkt_addr)
 183 {
 184     //Parse the Packet, see what it wants us to do
 185     _hsa_agent_dispatch_packet_t * agent_pkt =
 186         (_hsa_agent_dispatch_packet_t *)raw_pkt;
 187
 188     if (agent_pkt->type == AgentCmd::Nop) {
 189         DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n");
 190     } else if (agent_pkt->type == AgentCmd::Steal) {
 191         //This is where we steal the HSA Task's completion signal
 192         int kid = agent_pkt->arg[0];
 193         DPRINTF(GPUCommandProc,
 194             "Agent Dispatch Packet Stealing signal handle for kernel %d\n",
 195             kid);
 196
 197         HSAQueueEntry *task = dispatcher.hsaTask(kid);
 198         uint64_t signal_addr = task->completionSignal();// + sizeof(uint64_t);
 199
 200         uint64_t return_address = agent_pkt->return_address;
 201         DPRINTF(GPUCommandProc, "Return Addr: %p\n",return_address);
 202         //*return_address = signal_addr;
 203         Addr *new_signal_addr = new Addr;
 204         *new_signal_addr  = (Addr)signal_addr;
 205         dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0);
 206
 207         DPRINTF(GPUCommandProc,
 208             "Agent Dispatch Packet Stealing signal handle from kid %d :" \
 209             "(%x:%x) writing into %x\n",
 210             kid,signal_addr,new_signal_addr,return_address);
 211
 212     } else
 213     {
 214         panic("The agent dispatch packet provided an unknown argument in" \
 215         "arg[0],currently only 0(nop) or 1(return kernel signal) is accepted");
 216     }
 217
 218     hsaPP->finishPkt(raw_pkt, queue_id);
 219 }
 220
 221 /**
 222  * Once the CP has finished extracting all relevant information about
 223  * a task and has initialized the ABI state, we send a description of
 224  * the task to the dispatcher. The dispatcher will create and dispatch
 225  * WGs to the CUs.
 226  */
 227 void
 228 GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
 229 {
 230     dispatcher.dispatch(task);
 231 }
 232
 233 /**
 234  * The CP is responsible for traversing all HSA-ABI-related data
 235  * structures from memory and initializing the ABI state.
 236  * Information provided by the MQD, AQL packet, and code object
 237  * metadata will be used to initialze register file state.
 238  */
 239 void
 240 GPUCommandProcessor::initABI(HSAQueueEntry *task)
 241 {
 242     auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task);
 243
 244     Addr hostReadIdxPtr
 245         = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;
 246
 247     dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
 248         sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent,
 249             &readDispIdOffEvent->readDispIdOffset);
 250 }
 251
 252 System*
 253 GPUCommandProcessor::system()
 254 {
 255     return sys;
 256 }
 257
 258 AddrRangeList
 259 GPUCommandProcessor::getAddrRanges() const
 260 {
 261     AddrRangeList ranges;
 262     return ranges;
 263 }
 264
 265 void
 266 GPUCommandProcessor::setShader(Shader *shader)
 267 {
 268     _shader = shader;
 269 }
 270
 271 Shader*
 272 GPUCommandProcessor::shader()
 273 {
 274     return _shader;
 275 }