src/gpu-compute/gpu_command_processor.cc

   1 /*
   2  * Copyright (c) 2018 Advanced Micro Devices, Inc.
   3  * All rights reserved.
   4  *
   5  * For use for simulation and test purposes only
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright notice,
  11  * this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright notice,
  14  * this list of conditions and the following disclaimer in the documentation
  15  * and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the copyright holder nor the names of its
  18  * contributors may be used to endorse or promote products derived from this
  19  * software without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  * Authors: Anthony Gutierrez
  34  */
  35
  36 #include "gpu-compute/gpu_command_processor.hh"
  37
  38 #include "debug/GPUCommandProc.hh"
  39 #include "debug/GPUKernelInfo.hh"
  40 #include "gpu-compute/dispatcher.hh"
  41 #include "params/GPUCommandProcessor.hh"
  42 #include "sim/process.hh"
  43 #include "sim/proxy_ptr.hh"
  44 #include "sim/syscall_emul_buf.hh"
  45
  46 GPUCommandProcessor::GPUCommandProcessor(const Params &p)
  47     : HSADevice(p), dispatcher(*p.dispatcher)
  48 {
  49     dispatcher.setCommandProcessor(this);
  50 }
  51
  52 /**
  53  * submitDispatchPkt() is the entry point into the CP from the HSAPP
  54  * and is only meant to be used with AQL kernel dispatch packets.
  55  * After the HSAPP receives and extracts an AQL packet, it sends
  56  * it to the CP, which is responsible for gathering all relevant
  57  * information about a task, initializing CU state, and sending
  58  * it to the dispatcher for WG creation and dispatch.
  59  *
  60  * First we need capture all information from the the AQL pkt and
  61  * the code object, then store it in an HSAQueueEntry. Once the
  62  * packet and code are extracted, we extract information from the
  63  * queue descriptor that the CP needs to perform state initialization
  64  * on the CU. Finally we call dispatch() to send the task to the
  65  * dispatcher. When the task completely finishes, we call finishPkt()
  66  * on the HSA packet processor in order to remove the packet from the
  67  * queue, and notify the runtime that the task has completed.
  68  */
  69 void
  70 GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
  71                                        Addr host_pkt_addr)
  72 {
  73     static int dynamic_task_id = 0;
  74     _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
  75
  76     /**
  77      * we need to read a pointer in the application's address
  78      * space to pull out the kernel code descriptor.
  79      */
  80     auto *tc = sys->threads[0];
  81     auto &virt_proxy = tc->getVirtProxy();
  82
  83     /**
  84      * The kernel_object is a pointer to the machine code, whose entry
  85      * point is an 'amd_kernel_code_t' type, which is included in the
  86      * kernel binary, and describes various aspects of the kernel. The
  87      * desired entry is the 'kernel_code_entry_byte_offset' field,
  88      * which provides the byte offset (positive or negative) from the
  89      * address of the amd_kernel_code_t to the start of the machine
  90      * instructions.
  91      */
  92     AMDKernelCode akc;
  93     virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
  94         sizeof(AMDKernelCode));
  95
  96     DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
  97         "kernel object\n", akc.kernel_code_entry_byte_offset);
  98
  99     DPRINTF(GPUCommandProc,"GPUCommandProc: Sending dispatch pkt to %lu\n",
 100         (uint64_t)tc->cpuId());
 101
 102
 103     Addr machine_code_addr = (Addr)disp_pkt->kernel_object
 104         + akc.kernel_code_entry_byte_offset;
 105
 106     DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
 107         machine_code_addr);
 108
 109     Addr kern_name_addr(0);
 110     std::string kernel_name;
 111
 112     /**
 113      * BLIT kernels don't have symbol names.  BLIT kernels are built-in compute
 114      * kernels issued by ROCm to handle DMAs for dGPUs when the SDMA
 115      * hardware engines are unavailable or explicitly disabled.  They can also
 116      * be used to do copies that ROCm things would be better performed
 117      * by the shader than the SDMA engines.  They are also sometimes used on
 118      * APUs to implement asynchronous memcopy operations from 2 pointers in
 119      * host memory.  I have no idea what BLIT stands for.
 120      * */
 121     if (akc.runtime_loader_kernel_symbol) {
 122         virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10,
 123             (uint8_t*)&kern_name_addr, 0x8);
 124
 125         virt_proxy.readString(kernel_name, kern_name_addr);
 126     } else {
 127         kernel_name = "Blit kernel";
 128     }
 129
 130     DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
 131
 132     HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
 133         dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr);
 134
 135     DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
 136         "grid size (%dx%dx%d) kernarg addr: %#x, completion "
 137         "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
 138         disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
 139         disp_pkt->grid_size_x, disp_pkt->grid_size_y,
 140         disp_pkt->grid_size_z, disp_pkt->kernarg_address,
 141         disp_pkt->completion_signal);
 142
 143     DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
 144         "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
 145         "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
 146         task->numScalarRegs(), task->codeAddr(), 0, 0);
 147
 148     initABI(task);
 149     ++dynamic_task_id;
 150 }
 151
 152 uint64_t
 153 GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
 154 {
 155     Addr value_addr = getHsaSignalValueAddr(signal_handle);
 156     auto tc = system()->threads[0];
 157     ConstVPtr<Addr> prev_value(value_addr, tc);
 158     return *prev_value;
 159 }
 160
 161 void
 162 GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value)
 163 {
 164     // The signal value is aligned 8 bytes from
 165     // the actual handle in the runtime
 166     Addr value_addr = getHsaSignalValueAddr(signal_handle);
 167     Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
 168     Addr event_addr = getHsaSignalEventAddr(signal_handle);
 169     DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);
 170
 171     Addr *new_signal = new Addr;
 172     *new_signal = signal_value;
 173
 174     dmaWriteVirt(value_addr, sizeof(Addr), nullptr, new_signal, 0);
 175
 176     auto tc = system()->threads[0];
 177     ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc);
 178
 179     // Notifying an event with its mailbox pointer is
 180     // not supported in the current implementation. Just use
 181     // mailbox pointer to distinguish between interruptible
 182     // and default signal. Interruptible signal will have
 183     // a valid mailbox pointer.
 184     if (*mailbox_ptr != 0) {
 185         // This is an interruptible signal. Now, read the
 186         // event ID and directly communicate with the driver
 187         // about that event notification.
 188         ConstVPtr<uint32_t> event_val(event_addr, tc);
 189
 190         DPRINTF(GPUCommandProc, "Calling signal wakeup event on "
 191                 "signal event value %d\n", *event_val);
 192         signalWakeupEvent(*event_val);
 193     }
 194 }
 195
 196 void
 197 GPUCommandProcessor::attachDriver(HSADriver *hsa_driver)
 198 {
 199     fatal_if(driver, "Should not overwrite driver.");
 200     driver = hsa_driver;
 201 }
 202
 203 /**
 204  * submitVendorPkt() is for accepting vendor-specific packets from
 205  * the HSAPP. Vendor-specific packets may be used by the runtime to
 206  * send commands to the HSA device that are specific to a particular
 207  * vendor. The vendor-specific packets should be defined by the vendor
 208  * in the runtime.
 209  */
 210
 211 /**
 212  * TODO: For now we simply tell the HSAPP to finish the packet,
 213  *       however a future patch will update this method to provide
 214  *       the proper handling of any required vendor-specific packets.
 215  *       In the version of ROCm that is currently supported (1.6)
 216  *       the runtime will send packets that direct the CP to
 217  *       invalidate the GPUs caches. We do this automatically on
 218  *       each kernel launch in the CU, so this is safe for now.
 219  */
 220 void
 221 GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
 222     Addr host_pkt_addr)
 223 {
 224     hsaPP->finishPkt(raw_pkt, queue_id);
 225 }
 226
 227 /**
 228  * submitAgentDispatchPkt() is for accepting agent dispatch packets.
 229  * These packets will control the dispatch of Wg on the device, and inform
 230  * the host when a specified number of Wg have been executed on the device.
 231  *
 232  * For now it simply finishes the pkt.
 233  */
 234 void
 235 GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
 236     Addr host_pkt_addr)
 237 {
 238     //Parse the Packet, see what it wants us to do
 239     _hsa_agent_dispatch_packet_t * agent_pkt =
 240         (_hsa_agent_dispatch_packet_t *)raw_pkt;
 241
 242     if (agent_pkt->type == AgentCmd::Nop) {
 243         DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n");
 244     } else if (agent_pkt->type == AgentCmd::Steal) {
 245         //This is where we steal the HSA Task's completion signal
 246         int kid = agent_pkt->arg[0];
 247         DPRINTF(GPUCommandProc,
 248             "Agent Dispatch Packet Stealing signal handle for kernel %d\n",
 249             kid);
 250
 251         HSAQueueEntry *task = dispatcher.hsaTask(kid);
 252         uint64_t signal_addr = task->completionSignal();// + sizeof(uint64_t);
 253
 254         uint64_t return_address = agent_pkt->return_address;
 255         DPRINTF(GPUCommandProc, "Return Addr: %p\n",return_address);
 256         //*return_address = signal_addr;
 257         Addr *new_signal_addr = new Addr;
 258         *new_signal_addr  = (Addr)signal_addr;
 259         dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0);
 260
 261         DPRINTF(GPUCommandProc,
 262             "Agent Dispatch Packet Stealing signal handle from kid %d :" \
 263             "(%x:%x) writing into %x\n",
 264             kid,signal_addr,new_signal_addr,return_address);
 265
 266     } else
 267     {
 268         panic("The agent dispatch packet provided an unknown argument in" \
 269         "arg[0],currently only 0(nop) or 1(return kernel signal) is accepted");
 270     }
 271
 272     hsaPP->finishPkt(raw_pkt, queue_id);
 273 }
 274
 275 /**
 276  * Once the CP has finished extracting all relevant information about
 277  * a task and has initialized the ABI state, we send a description of
 278  * the task to the dispatcher. The dispatcher will create and dispatch
 279  * WGs to the CUs.
 280  */
 281 void
 282 GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
 283 {
 284     dispatcher.dispatch(task);
 285 }
 286
 287 void
 288 GPUCommandProcessor::signalWakeupEvent(uint32_t event_id)
 289 {
 290     driver->signalWakeupEvent(event_id);
 291 }
 292
 293 /**
 294  * The CP is responsible for traversing all HSA-ABI-related data
 295  * structures from memory and initializing the ABI state.
 296  * Information provided by the MQD, AQL packet, and code object
 297  * metadata will be used to initialze register file state.
 298  */
 299 void
 300 GPUCommandProcessor::initABI(HSAQueueEntry *task)
 301 {
 302     auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task);
 303
 304     Addr hostReadIdxPtr
 305         = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;
 306
 307     dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
 308         sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent,
 309             &readDispIdOffEvent->readDispIdOffset);
 310 }
 311
 312 System*
 313 GPUCommandProcessor::system()
 314 {
 315     return sys;
 316 }
 317
 318 AddrRangeList
 319 GPUCommandProcessor::getAddrRanges() const
 320 {
 321     AddrRangeList ranges;
 322     return ranges;
 323 }
 324
 325 void
 326 GPUCommandProcessor::setShader(Shader *shader)
 327 {
 328     _shader = shader;
 329 }
 330
 331 Shader*
 332 GPUCommandProcessor::shader()
 333 {
 334     return _shader;
 335 }