/*
 * Copyright (c) 2011-2015,2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
35 #include "gpu-compute/dispatcher.hh"
37 #include "debug/GPUDisp.hh"
38 #include "debug/GPUKernelInfo.hh"
39 #include "debug/GPUWgLatency.hh"
40 #include "gpu-compute/gpu_command_processor.hh"
41 #include "gpu-compute/hsa_queue_entry.hh"
42 #include "gpu-compute/shader.hh"
43 #include "gpu-compute/wavefront.hh"
44 #include "sim/syscall_emul_buf.hh"
45 #include "sim/system.hh"
47 GPUDispatcher::GPUDispatcher(const Params
&p
)
48 : SimObject(p
), shader(nullptr), gpuCmdProc(nullptr),
49 tickEvent([this]{ exec(); },
50 "GPU Dispatcher tick", false, Event::CPU_Tick_Pri
),
53 schedule(&tickEvent
, 0);
56 GPUDispatcher::~GPUDispatcher()
61 GPUDispatcher::regStats()
64 .name(name() + ".num_kernel_launched")
65 .desc("number of kernel launched")
68 cyclesWaitingForDispatch
69 .name(name() + ".cycles_wait_dispatch")
70 .desc("number of cycles with outstanding wavefronts "
71 "that are waiting to be dispatched")
76 GPUDispatcher::hsaTask(int disp_id
)
78 assert(hsaQueueEntries
.find(disp_id
) != hsaQueueEntries
.end());
79 return hsaQueueEntries
[disp_id
];
83 GPUDispatcher::setCommandProcessor(GPUCommandProcessor
*gpu_cmd_proc
)
85 gpuCmdProc
= gpu_cmd_proc
;
89 GPUDispatcher::setShader(Shader
*new_shader
)
95 GPUDispatcher::serialize(CheckpointOut
&cp
) const
99 if (tickEvent
.scheduled())
100 event_tick
= tickEvent
.when();
102 SERIALIZE_SCALAR(event_tick
);
106 GPUDispatcher::unserialize(CheckpointIn
&cp
)
110 if (tickEvent
.scheduled())
111 deschedule(&tickEvent
);
113 UNSERIALIZE_SCALAR(event_tick
);
116 schedule(&tickEvent
, event_tick
);
121 * After all relevant HSA data structures have been traversed/extracted
122 * from memory by the CP, dispatch() is called on the dispatcher. This will
123 * schedule a dispatch event that, when triggered, will attempt to dispatch
124 * the WGs associated with the given task to the CUs.
127 GPUDispatcher::dispatch(HSAQueueEntry
*task
)
131 DPRINTF(GPUDisp
, "launching kernel: %s, dispatch ID: %d\n",
132 task
->kernelName(), task
->dispatchId());
134 execIds
.push(task
->dispatchId());
135 dispatchActive
= true;
136 hsaQueueEntries
.emplace(task
->dispatchId(), task
);
138 if (!tickEvent
.scheduled()) {
139 schedule(&tickEvent
, curTick() + shader
->clockPeriod());
144 GPUDispatcher::exec()
149 * There are potentially multiple outstanding kernel launches.
150 * It is possible that the workgroups in a different kernel
151 * can fit on the GPU even if another kernel's workgroups cannot
153 DPRINTF(GPUDisp
, "Launching %d Kernels\n", execIds
.size());
155 if (execIds
.size() > 0) {
156 ++cyclesWaitingForDispatch
;
160 * dispatch work cannot start until the kernel's invalidate is
161 * completely finished; hence, kernel will always initiates
162 * invalidate first and keeps waiting until inv done
164 while (execIds
.size() > fail_count
) {
165 int exec_id
= execIds
.front();
166 auto task
= hsaQueueEntries
[exec_id
];
167 bool launched(false);
169 // acq is needed before starting dispatch
170 if (shader
->impl_kern_launch_acq
) {
171 // try to invalidate cache
172 shader
->prepareInvalidate(task
);
174 // kern launch acquire is not set, skip invalidate
179 * invalidate is still ongoing, put the kernel on the queue to
182 if (!task
->isInvDone()){
183 execIds
.push(exec_id
);
186 DPRINTF(GPUDisp
, "kernel %d failed to launch, due to [%d] pending"
187 " invalidate requests\n", exec_id
, task
->outstandingInvs());
189 // try the next kernel_id
194 // kernel invalidate is done, start workgroup dispatch
195 while (!task
->dispComplete()) {
196 // update the thread context
197 shader
->updateContext(task
->contextId());
199 // attempt to dispatch workgroup
200 DPRINTF(GPUWgLatency
, "Attempt Kernel Launch cycle:%d kernel:%d\n",
203 if (!shader
->dispatchWorkgroups(task
)) {
205 * if we failed try the next kernel,
206 * it may have smaller workgroups.
207 * put it on the queue to rety latter
209 DPRINTF(GPUDisp
, "kernel %d failed to launch\n", exec_id
);
210 execIds
.push(exec_id
);
213 } else if (!launched
) {
215 DPRINTF(GPUKernelInfo
, "Launched kernel %d\n", exec_id
);
219 // try the next kernel_id
223 DPRINTF(GPUDisp
, "Returning %d Kernels\n", doneIds
.size());
225 while (doneIds
.size()) {
226 DPRINTF(GPUDisp
, "Kernel %d completed\n", doneIds
.front());
232 GPUDispatcher::isReachingKernelEnd(Wavefront
*wf
)
234 int kern_id
= wf
->kernId
;
235 assert(hsaQueueEntries
.find(kern_id
) != hsaQueueEntries
.end());
236 auto task
= hsaQueueEntries
[kern_id
];
237 assert(task
->dispatchId() == kern_id
);
240 * whether the next workgroup is the final one in the kernel,
241 * +1 as we check first before taking action
243 return (task
->numWgCompleted() + 1 == task
->numWgTotal());
247 * update the counter of oustanding inv requests for the kernel
249 * val: +1/-1, increment or decrement the counter (default: -1)
252 GPUDispatcher::updateInvCounter(int kern_id
, int val
) {
253 assert(val
== -1 || val
== 1);
255 auto task
= hsaQueueEntries
[kern_id
];
256 task
->updateOutstandingInvs(val
);
258 // kernel invalidate is done, schedule dispatch work
259 if (task
->isInvDone() && !tickEvent
.scheduled()) {
260 schedule(&tickEvent
, curTick() + shader
->clockPeriod());
265 * update the counter of oustanding wb requests for the kernel
267 * val: +1/-1, increment or decrement the counter (default: -1)
269 * return true if all wbs are done for the kernel
272 GPUDispatcher::updateWbCounter(int kern_id
, int val
) {
273 assert(val
== -1 || val
== 1);
275 auto task
= hsaQueueEntries
[kern_id
];
276 task
->updateOutstandingWbs(val
);
278 // true: WB is done, false: WB is still ongoing
279 return (task
->outstandingWbs() == 0);
283 * get kernel's outstanding cache writeback requests
286 GPUDispatcher::getOutstandingWbs(int kernId
) {
287 auto task
= hsaQueueEntries
[kernId
];
289 return task
->outstandingWbs();
293 * When an end program instruction detects that the last WF in
294 * a WG has completed it will call this method on the dispatcher.
295 * If we detect that this is the last WG for the given task, then
296 * we ring the completion signal, which is used by the CPU to
297 * synchronize with the GPU. The HSAPP is also notified that the
298 * task has completed so it can be removed from its task queues.
301 GPUDispatcher::notifyWgCompl(Wavefront
*wf
)
303 int kern_id
= wf
->kernId
;
304 DPRINTF(GPUDisp
, "notify WgCompl %d\n", wf
->wgId
);
305 auto task
= hsaQueueEntries
[kern_id
];
306 assert(task
->dispatchId() == kern_id
);
307 task
->notifyWgCompleted();
309 DPRINTF(GPUWgLatency
, "WG Complete cycle:%d wg:%d kernel:%d cu:%d\n",
310 curTick(), wf
->wgId
, kern_id
, wf
->computeUnit
->cu_id
);
312 if (task
->numWgCompleted() == task
->numWgTotal()) {
313 // Notify the HSA PP that this kernel is complete
314 gpuCmdProc
->hsaPacketProc()
315 .finishPkt(task
->dispPktPtr(), task
->queueId());
316 if (task
->completionSignal()) {
317 // The signal value is aligned 8 bytes from
318 // the actual handle in the runtime
319 Addr signal_addr
= task
->completionSignal() + sizeof(Addr
);
320 DPRINTF(GPUDisp
, "HSA AQL Kernel Complete! Triggering "
321 "completion signal: %x!\n", signal_addr
);
324 * HACK: The semantics of the HSA signal is to decrement
325 * the current signal value. We cheat here and read out
326 * he value from main memory using functional access and
327 * then just DMA the decremented value. This is because
328 * the DMA controller does not currently support GPU
331 auto *tc
= gpuCmdProc
->system()->threads
[0];
332 auto &virt_proxy
= tc
->getVirtProxy();
333 TypedBufferArg
<Addr
> prev_signal(signal_addr
);
334 prev_signal
.copyIn(virt_proxy
);
336 Addr
*new_signal
= new Addr
;
337 *new_signal
= (Addr
)*prev_signal
- 1;
339 gpuCmdProc
->dmaWriteVirt(signal_addr
, sizeof(Addr
), nullptr,
342 DPRINTF(GPUDisp
, "HSA AQL Kernel Complete! No completion "
346 DPRINTF(GPUWgLatency
, "Kernel Complete ticks:%d kernel:%d\n",
348 DPRINTF(GPUKernelInfo
, "Completed kernel %d\n", kern_id
);
351 if (!tickEvent
.scheduled()) {
352 schedule(&tickEvent
, curTick() + shader
->clockPeriod());
357 GPUDispatcher::scheduleDispatch()
359 if (!tickEvent
.scheduled()) {
360 schedule(&tickEvent
, curTick() + shader
->clockPeriod());