/*
 * Copyright (c) 2011-2015,2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
35 #include "gpu-compute/dispatcher.hh"
37 #include "debug/GPUDisp.hh"
38 #include "debug/GPUKernelInfo.hh"
39 #include "debug/GPUWgLatency.hh"
40 #include "gpu-compute/gpu_command_processor.hh"
41 #include "gpu-compute/hsa_queue_entry.hh"
42 #include "gpu-compute/shader.hh"
43 #include "gpu-compute/wavefront.hh"
44 #include "sim/syscall_emul_buf.hh"
45 #include "sim/system.hh"
47 GPUDispatcher::GPUDispatcher(const Params
&p
)
48 : SimObject(p
), shader(nullptr), gpuCmdProc(nullptr),
49 tickEvent([this]{ exec(); },
50 "GPU Dispatcher tick", false, Event::CPU_Tick_Pri
),
53 schedule(&tickEvent
, 0);
56 GPUDispatcher::~GPUDispatcher()
61 GPUDispatcher::regStats()
64 .name(name() + ".num_kernel_launched")
65 .desc("number of kernel launched")
68 cyclesWaitingForDispatch
69 .name(name() + ".cycles_wait_dispatch")
70 .desc("number of cycles with outstanding wavefronts "
71 "that are waiting to be dispatched")
76 GPUDispatcher::hsaTask(int disp_id
)
78 assert(hsaQueueEntries
.find(disp_id
) != hsaQueueEntries
.end());
79 return hsaQueueEntries
[disp_id
];
83 GPUDispatcher::setCommandProcessor(GPUCommandProcessor
*gpu_cmd_proc
)
85 gpuCmdProc
= gpu_cmd_proc
;
89 GPUDispatcher::setShader(Shader
*new_shader
)
95 GPUDispatcher::serialize(CheckpointOut
&cp
) const
99 if (tickEvent
.scheduled())
100 event_tick
= tickEvent
.when();
102 SERIALIZE_SCALAR(event_tick
);
106 GPUDispatcher::unserialize(CheckpointIn
&cp
)
110 if (tickEvent
.scheduled())
111 deschedule(&tickEvent
);
113 UNSERIALIZE_SCALAR(event_tick
);
116 schedule(&tickEvent
, event_tick
);
121 * After all relevant HSA data structures have been traversed/extracted
122 * from memory by the CP, dispatch() is called on the dispatcher. This will
123 * schedule a dispatch event that, when triggered, will attempt to dispatch
124 * the WGs associated with the given task to the CUs.
127 GPUDispatcher::dispatch(HSAQueueEntry
*task
)
131 DPRINTF(GPUDisp
, "launching kernel: %s, dispatch ID: %d\n",
132 task
->kernelName(), task
->dispatchId());
134 execIds
.push(task
->dispatchId());
135 dispatchActive
= true;
136 hsaQueueEntries
.emplace(task
->dispatchId(), task
);
138 if (!tickEvent
.scheduled()) {
139 schedule(&tickEvent
, curTick() + shader
->clockPeriod());
144 GPUDispatcher::exec()
149 * There are potentially multiple outstanding kernel launches.
150 * It is possible that the workgroups in a different kernel
151 * can fit on the GPU even if another kernel's workgroups cannot
153 DPRINTF(GPUDisp
, "Launching %d Kernels\n", execIds
.size());
155 if (execIds
.size() > 0) {
156 ++cyclesWaitingForDispatch
;
160 * dispatch work cannot start until the kernel's invalidate is
161 * completely finished; hence, kernel will always initiates
162 * invalidate first and keeps waiting until inv done
164 while (execIds
.size() > fail_count
) {
165 int exec_id
= execIds
.front();
166 auto task
= hsaQueueEntries
[exec_id
];
167 bool launched(false);
169 // acq is needed before starting dispatch
170 if (shader
->impl_kern_launch_acq
) {
171 // try to invalidate cache
172 shader
->prepareInvalidate(task
);
174 // kern launch acquire is not set, skip invalidate
179 * invalidate is still ongoing, put the kernel on the queue to
182 if (!task
->isInvDone()){
183 execIds
.push(exec_id
);
186 DPRINTF(GPUDisp
, "kernel %d failed to launch, due to [%d] pending"
187 " invalidate requests\n", exec_id
, task
->outstandingInvs());
189 // try the next kernel_id
194 // kernel invalidate is done, start workgroup dispatch
195 while (!task
->dispComplete()) {
196 // update the thread context
197 shader
->updateContext(task
->contextId());
199 // attempt to dispatch workgroup
200 DPRINTF(GPUWgLatency
, "Attempt Kernel Launch cycle:%d kernel:%d\n",
203 if (!shader
->dispatchWorkgroups(task
)) {
205 * if we failed try the next kernel,
206 * it may have smaller workgroups.
207 * put it on the queue to rety latter
209 DPRINTF(GPUDisp
, "kernel %d failed to launch\n", exec_id
);
210 execIds
.push(exec_id
);
213 } else if (!launched
) {
215 DPRINTF(GPUKernelInfo
, "Launched kernel %d\n", exec_id
);
219 // try the next kernel_id
223 DPRINTF(GPUDisp
, "Returning %d Kernels\n", doneIds
.size());
225 while (doneIds
.size()) {
226 DPRINTF(GPUDisp
, "Kernel %d completed\n", doneIds
.front());
232 GPUDispatcher::isReachingKernelEnd(Wavefront
*wf
)
234 int kern_id
= wf
->kernId
;
235 assert(hsaQueueEntries
.find(kern_id
) != hsaQueueEntries
.end());
236 auto task
= hsaQueueEntries
[kern_id
];
237 assert(task
->dispatchId() == kern_id
);
240 * whether the next workgroup is the final one in the kernel,
241 * +1 as we check first before taking action
243 return (task
->numWgCompleted() + 1 == task
->numWgTotal());
247 * update the counter of oustanding inv requests for the kernel
249 * val: +1/-1, increment or decrement the counter (default: -1)
252 GPUDispatcher::updateInvCounter(int kern_id
, int val
) {
253 assert(val
== -1 || val
== 1);
255 auto task
= hsaQueueEntries
[kern_id
];
256 task
->updateOutstandingInvs(val
);
258 // kernel invalidate is done, schedule dispatch work
259 if (task
->isInvDone() && !tickEvent
.scheduled()) {
260 schedule(&tickEvent
, curTick() + shader
->clockPeriod());
265 * update the counter of oustanding wb requests for the kernel
267 * val: +1/-1, increment or decrement the counter (default: -1)
269 * return true if all wbs are done for the kernel
272 GPUDispatcher::updateWbCounter(int kern_id
, int val
) {
273 assert(val
== -1 || val
== 1);
275 auto task
= hsaQueueEntries
[kern_id
];
276 task
->updateOutstandingWbs(val
);
278 // true: WB is done, false: WB is still ongoing
279 return (task
->outstandingWbs() == 0);
283 * get kernel's outstanding cache writeback requests
286 GPUDispatcher::getOutstandingWbs(int kernId
) {
287 auto task
= hsaQueueEntries
[kernId
];
289 return task
->outstandingWbs();
293 * When an end program instruction detects that the last WF in
294 * a WG has completed it will call this method on the dispatcher.
295 * If we detect that this is the last WG for the given task, then
296 * we ring the completion signal, which is used by the CPU to
297 * synchronize with the GPU. The HSAPP is also notified that the
298 * task has completed so it can be removed from its task queues.
301 GPUDispatcher::notifyWgCompl(Wavefront
*wf
)
303 int kern_id
= wf
->kernId
;
304 DPRINTF(GPUDisp
, "notify WgCompl %d\n", wf
->wgId
);
305 auto task
= hsaQueueEntries
[kern_id
];
306 assert(task
->dispatchId() == kern_id
);
307 task
->notifyWgCompleted();
309 DPRINTF(GPUWgLatency
, "WG Complete cycle:%d wg:%d kernel:%d cu:%d\n",
310 curTick(), wf
->wgId
, kern_id
, wf
->computeUnit
->cu_id
);
312 if (task
->numWgCompleted() == task
->numWgTotal()) {
313 // Notify the HSA PP that this kernel is complete
314 gpuCmdProc
->hsaPacketProc()
315 .finishPkt(task
->dispPktPtr(), task
->queueId());
316 if (task
->completionSignal()) {
317 // The signal value is aligned 8 bytes from
318 // the actual handle in the runtime
319 Addr signal_addr
= task
->completionSignal() + sizeof(Addr
);
320 DPRINTF(GPUDisp
, "HSA AQL Kernel Complete! Triggering "
321 "completion signal: %x!\n", signal_addr
);
324 * HACK: The semantics of the HSA signal is to decrement
325 * the current signal value. We cheat here and read out
326 * he value from main memory using functional access and
327 * then just DMA the decremented value. This is because
328 * the DMA controller does not currently support GPU
331 auto *tc
= gpuCmdProc
->system()->threads
[0];
332 auto &virt_proxy
= tc
->getVirtProxy();
333 TypedBufferArg
<Addr
> prev_signal(signal_addr
);
334 prev_signal
.copyIn(virt_proxy
);
336 Addr
*new_signal
= new Addr
;
337 *new_signal
= (Addr
)*prev_signal
- 1;
339 gpuCmdProc
->dmaWriteVirt(signal_addr
, sizeof(Addr
), nullptr,
342 DPRINTF(GPUDisp
, "HSA AQL Kernel Complete! No completion "
346 DPRINTF(GPUWgLatency
, "Kernel Complete ticks:%d kernel:%d\n",
348 DPRINTF(GPUKernelInfo
, "Completed kernel %d\n", kern_id
);
351 if (!tickEvent
.scheduled()) {
352 schedule(&tickEvent
, curTick() + shader
->clockPeriod());
357 GPUDispatcher::scheduleDispatch()
359 if (!tickEvent
.scheduled()) {
360 schedule(&tickEvent
, curTick() + shader
->clockPeriod());