src/gpu-compute/gpu_command_processor.cc

   1 /*
   2  * Copyright (c) 2018 Advanced Micro Devices, Inc.
   3  * All rights reserved.
   4  *
   5  * For use for simulation and test purposes only
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright notice,
  11  * this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright notice,
  14  * this list of conditions and the following disclaimer in the documentation
  15  * and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the copyright holder nor the names of its
  18  * contributors may be used to endorse or promote products derived from this
  19  * software without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  * Authors: Anthony Gutierrez
  34  */
  35
  36 #include "gpu-compute/gpu_command_processor.hh"
  37
  38 #include "debug/GPUCommandProc.hh"
  39 #include "debug/GPUKernelInfo.hh"
  40 #include "gpu-compute/dispatcher.hh"
  41 #include "params/GPUCommandProcessor.hh"
  42
  43 GPUCommandProcessor::GPUCommandProcessor(const Params &p)
  44     : HSADevice(p), dispatcher(*p.dispatcher)
  45 {
  46     dispatcher.setCommandProcessor(this);
  47 }
  48
  49 /**
  50  * submitDispatchPkt() is the entry point into the CP from the HSAPP
  51  * and is only meant to be used with AQL kernel dispatch packets.
  52  * After the HSAPP receives and extracts an AQL packet, it sends
  53  * it to the CP, which is responsible for gathering all relevant
  54  * information about a task, initializing CU state, and sending
  55  * it to the dispatcher for WG creation and dispatch.
  56  *
  57  * First we need capture all information from the the AQL pkt and
  58  * the code object, then store it in an HSAQueueEntry. Once the
  59  * packet and code are extracted, we extract information from the
  60  * queue descriptor that the CP needs to perform state initialization
  61  * on the CU. Finally we call dispatch() to send the task to the
  62  * dispatcher. When the task completely finishes, we call finishPkt()
  63  * on the HSA packet processor in order to remove the packet from the
  64  * queue, and notify the runtime that the task has completed.
  65  */
  66 void
  67 GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
  68                                        Addr host_pkt_addr)
  69 {
  70     static int dynamic_task_id = 0;
  71     _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
  72
  73     /**
  74      * we need to read a pointer in the application's address
  75      * space to pull out the kernel code descriptor.
  76      */
  77     auto *tc = sys->threads[0];
  78     auto &virt_proxy = tc->getVirtProxy();
  79
  80     /**
  81      * The kernel_object is a pointer to the machine code, whose entry
  82      * point is an 'amd_kernel_code_t' type, which is included in the
  83      * kernel binary, and describes various aspects of the kernel. The
  84      * desired entry is the 'kernel_code_entry_byte_offset' field,
  85      * which provides the byte offset (positive or negative) from the
  86      * address of the amd_kernel_code_t to the start of the machine
  87      * instructions.
  88      */
  89     AMDKernelCode akc;
  90     virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
  91         sizeof(AMDKernelCode));
  92
  93     DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
  94         "kernel object\n", akc.kernel_code_entry_byte_offset);
  95
  96     Addr machine_code_addr = (Addr)disp_pkt->kernel_object
  97         + akc.kernel_code_entry_byte_offset;
  98
  99     DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
 100         machine_code_addr);
 101
 102     Addr kern_name_addr(0);
 103     std::string kernel_name;
 104
 105     /**
 106      * BLIT kernels don't have symbol names.  BLIT kernels are built-in compute
 107      * kernels issued by ROCm to handle DMAs for dGPUs when the SDMA
 108      * hardware engines are unavailable or explicitly disabled.  They can also
 109      * be used to do copies that ROCm things would be better performed
 110      * by the shader than the SDMA engines.  They are also sometimes used on
 111      * APUs to implement asynchronous memcopy operations from 2 pointers in
 112      * host memory.  I have no idea what BLIT stands for.
 113      * */
 114     if (akc.runtime_loader_kernel_symbol) {
 115         virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10,
 116             (uint8_t*)&kern_name_addr, 0x8);
 117
 118         virt_proxy.readString(kernel_name, kern_name_addr);
 119     } else {
 120         kernel_name = "Blit kernel";
 121     }
 122
 123     DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
 124
 125     HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
 126         dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr);
 127
 128     DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
 129         "grid size (%dx%dx%d) kernarg addr: %#x, completion "
 130         "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
 131         disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
 132         disp_pkt->grid_size_x, disp_pkt->grid_size_y,
 133         disp_pkt->grid_size_z, disp_pkt->kernarg_address,
 134         disp_pkt->completion_signal);
 135
 136     DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
 137         "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
 138         "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
 139         task->numScalarRegs(), task->codeAddr(), 0, 0);
 140
 141     initABI(task);
 142     ++dynamic_task_id;
 143 }
 144
 145 /**
 146  * submitVendorPkt() is for accepting vendor-specific packets from
 147  * the HSAPP. Vendor-specific packets may be used by the runtime to
 148  * send commands to the HSA device that are specific to a particular
 149  * vendor. The vendor-specific packets should be defined by the vendor
 150  * in the runtime.
 151  */
 152
 153 /**
 154  * TODO: For now we simply tell the HSAPP to finish the packet,
 155  *       however a future patch will update this method to provide
 156  *       the proper handling of any required vendor-specific packets.
 157  *       In the version of ROCm that is currently supported (1.6)
 158  *       the runtime will send packets that direct the CP to
 159  *       invalidate the GPUs caches. We do this automatically on
 160  *       each kernel launch in the CU, so this is safe for now.
 161  */
 162 void
 163 GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
 164     Addr host_pkt_addr)
 165 {
 166     hsaPP->finishPkt(raw_pkt, queue_id);
 167 }
 168
 169 /**
 170  * Once the CP has finished extracting all relevant information about
 171  * a task and has initialized the ABI state, we send a description of
 172  * the task to the dispatcher. The dispatcher will create and dispatch
 173  * WGs to the CUs.
 174  */
 175 void
 176 GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
 177 {
 178     dispatcher.dispatch(task);
 179 }
 180
 181 /**
 182  * The CP is responsible for traversing all HSA-ABI-related data
 183  * structures from memory and initializing the ABI state.
 184  * Information provided by the MQD, AQL packet, and code object
 185  * metadata will be used to initialze register file state.
 186  */
 187 void
 188 GPUCommandProcessor::initABI(HSAQueueEntry *task)
 189 {
 190     auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task);
 191
 192     Addr hostReadIdxPtr
 193         = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;
 194
 195     dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
 196         sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent,
 197             &readDispIdOffEvent->readDispIdOffset);
 198 }
 199
 200 System*
 201 GPUCommandProcessor::system()
 202 {
 203     return sys;
 204 }
 205
 206 AddrRangeList
 207 GPUCommandProcessor::getAddrRanges() const
 208 {
 209     AddrRangeList ranges;
 210     return ranges;
 211 }
 212
 213 void
 214 GPUCommandProcessor::setShader(Shader *shader)
 215 {
 216     _shader = shader;
 217 }
 218
 219 Shader*
 220 GPUCommandProcessor::shader()
 221 {
 222     return _shader;
 223 }