gpu-compute: Dropping fetches when no entry is reserved in the buffer
[gem5.git] / src / gpu-compute / dispatcher.cc
/*
 * Copyright (c) 2011-2015,2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */


#include "gpu-compute/dispatcher.hh"

#include "debug/GPUDisp.hh"
#include "debug/GPUKernelInfo.hh"
#include "debug/GPUWgLatency.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/shader.hh"
#include "sim/syscall_emul_buf.hh"
#include "sim/system.hh"

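/**
 * The dispatcher schedules its first dispatch attempt at tick 0; after
 * that, tick events are only (re)scheduled when a kernel is launched,
 * a kernel-launch invalidate completes, or a workgroup finishes.
 */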
GPUDispatcher::GPUDispatcher(const Params *p)
    : SimObject(p), shader(nullptr), gpuCmdProc(nullptr),
      tickEvent([this]{ exec(); },
                "GPU Dispatcher tick", false, Event::CPU_Tick_Pri),
      dispatchActive(false)
{
    schedule(&tickEvent, 0);
}

GPUDispatcher::~GPUDispatcher()
{
}

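/**
 * Register the dispatcher's statistics with the simulator's stats
 * framework.
 */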
void
GPUDispatcher::regStats()
{
    numKernelLaunched
        .name(name() + ".num_kernel_launched")
        .desc("number of kernels launched")
        ;

    cyclesWaitingForDispatch
        .name(name() + ".cycles_wait_dispatch")
        .desc("number of cycles with outstanding wavefronts "
              "that are waiting to be dispatched")
        ;
}

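/**
 * Look up the HSAQueueEntry (kernel launch task) associated with a
 * dispatch ID. The entry must already have been registered by dispatch().
 */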
HSAQueueEntry*
GPUDispatcher::hsaTask(int disp_id)
{
    assert(hsaQueueEntries.find(disp_id) != hsaQueueEntries.end());
    return hsaQueueEntries[disp_id];
}

void
GPUDispatcher::setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc)
{
    gpuCmdProc = gpu_cmd_proc;
}

void
GPUDispatcher::setShader(Shader *new_shader)
{
    shader = new_shader;
}

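/**
 * Checkpointing support: record when (if at all) the tick event is next
 * scheduled to fire, and restore that schedule on unserialize.
 */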
void
GPUDispatcher::serialize(CheckpointOut &cp) const
{
    Tick event_tick = 0;

    if (tickEvent.scheduled())
        event_tick = tickEvent.when();

    SERIALIZE_SCALAR(event_tick);
}

void
GPUDispatcher::unserialize(CheckpointIn &cp)
{
    Tick event_tick;

    if (tickEvent.scheduled())
        deschedule(&tickEvent);

    UNSERIALIZE_SCALAR(event_tick);

    if (event_tick) {
        schedule(&tickEvent, event_tick);
    }
}

/**
 * After all relevant HSA data structures have been traversed/extracted
 * from memory by the CP, dispatch() is called on the dispatcher. This will
 * schedule a dispatch event that, when triggered, will attempt to dispatch
 * the WGs associated with the given task to the CUs.
 */
void
GPUDispatcher::dispatch(HSAQueueEntry *task)
{
    ++numKernelLaunched;

    DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
            task->kernelName(), task->dispatchId());

    execIds.push(task->dispatchId());
    dispatchActive = true;
    hsaQueueEntries.emplace(task->dispatchId(), task);

    if (!tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->clockPeriod());
    }
}

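/**
 * Attempt to dispatch the workgroups of every outstanding kernel launch.
 * A kernel may not dispatch until its launch-time cache invalidate has
 * completed; kernels whose invalidate is still pending, or whose
 * workgroups do not currently fit on the CUs, are re-queued and retried
 * on a later tick.
 */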
void
GPUDispatcher::exec()
{
    int fail_count(0);

    /**
     * There are potentially multiple outstanding kernel launches.
     * It is possible that the workgroups in a different kernel
     * can fit on the GPU even if another kernel's workgroups cannot.
     */
    DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());

    if (execIds.size() > 0) {
        ++cyclesWaitingForDispatch;
    }

    /**
     * Dispatch work cannot start until the kernel's invalidate is
     * completely finished; hence, the kernel always initiates the
     * invalidate first and waits until it is done.
     */
    while (execIds.size() > fail_count) {
        int exec_id = execIds.front();
        auto task = hsaQueueEntries[exec_id];
        bool launched(false);

        // acq is needed before starting dispatch
        if (shader->impl_kern_launch_acq) {
            // try to invalidate cache
            shader->prepareInvalidate(task);
        } else {
            // kern launch acquire is not set, skip invalidate
            task->markInvDone();
        }

        /**
         * The invalidate is still ongoing; put the kernel back on the
         * queue to retry later.
         */
        if (!task->isInvDone()) {
            execIds.push(exec_id);
            ++fail_count;

            DPRINTF(GPUDisp, "kernel %d failed to launch, due to [%d] pending"
                " invalidate requests\n", exec_id, task->outstandingInvs());

            // try the next kernel_id
            execIds.pop();
            continue;
        }

        // kernel invalidate is done, start workgroup dispatch
        while (!task->dispComplete()) {
            // update the thread context
            shader->updateContext(task->contextId());

            // attempt to dispatch workgroup
            DPRINTF(GPUWgLatency, "Attempt Kernel Launch cycle:%d kernel:%d\n",
                curTick(), exec_id);

            if (!shader->dispatchWorkgroups(task)) {
                /**
                 * If we failed, try the next kernel; it may have smaller
                 * workgroups. Put this kernel back on the queue to retry
                 * later.
                 */
                DPRINTF(GPUDisp, "kernel %d failed to launch\n", exec_id);
                execIds.push(exec_id);
                ++fail_count;
                break;
            } else if (!launched) {
                launched = true;
                DPRINTF(GPUKernelInfo, "Launched kernel %d\n", exec_id);
            }
        }

        // try the next kernel_id
        execIds.pop();
    }

    DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());

    while (doneIds.size()) {
        DPRINTF(GPUDisp, "Kernel %d completed\n", doneIds.front());
        doneIds.pop();
    }
}

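/**
 * Return true if the workgroup that wf belongs to would be the final
 * uncompleted workgroup of its kernel.
 */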
bool
GPUDispatcher::isReachingKernelEnd(Wavefront *wf)
{
    int kern_id = wf->kernId;
    assert(hsaQueueEntries.find(kern_id) != hsaQueueEntries.end());
    auto task = hsaQueueEntries[kern_id];
    assert(task->dispatchId() == kern_id);

    /**
     * Whether the next workgroup is the final one in the kernel;
     * +1 because we check before the completion count is updated.
     */
    return (task->numWgCompleted() + 1 == task->numWgTotal());
}

/**
 * Update the counter of outstanding inv requests for the kernel.
 * kern_id: kernel id
 * val: +1/-1, increment or decrement the counter (default: -1)
 */
void
GPUDispatcher::updateInvCounter(int kern_id, int val) {
    assert(val == -1 || val == 1);

    auto task = hsaQueueEntries[kern_id];
    task->updateOutstandingInvs(val);

    // kernel invalidate is done, schedule dispatch work
    if (task->isInvDone() && !tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->clockPeriod());
    }
}

/**
 * Update the counter of outstanding wb requests for the kernel.
 * kern_id: kernel id
 * val: +1/-1, increment or decrement the counter (default: -1)
 *
 * Returns true if all wbs are done for the kernel.
 */
bool
GPUDispatcher::updateWbCounter(int kern_id, int val) {
    assert(val == -1 || val == 1);

    auto task = hsaQueueEntries[kern_id];
    task->updateOutstandingWbs(val);

    // true: WB is done, false: WB is still ongoing
    return (task->outstandingWbs() == 0);
}

/**
 * Get the kernel's outstanding cache writeback requests.
 */
int
GPUDispatcher::getOutstandingWbs(int kernId) {
    auto task = hsaQueueEntries[kernId];

    return task->outstandingWbs();
}

/**
 * When an end program instruction detects that the last WF in
 * a WG has completed, it will call this method on the dispatcher.
 * If we detect that this is the last WG for the given task, then
 * we ring the completion signal, which is used by the CPU to
 * synchronize with the GPU. The HSAPP is also notified that the
 * task has completed so it can be removed from its task queues.
 */
void
GPUDispatcher::notifyWgCompl(Wavefront *wf)
{
    int kern_id = wf->kernId;
    DPRINTF(GPUDisp, "notify WgCompl %d\n", wf->wgId);
    auto task = hsaQueueEntries[kern_id];
    assert(task->dispatchId() == kern_id);
    task->notifyWgCompleted();

    DPRINTF(GPUWgLatency, "WG Complete cycle:%d wg:%d kernel:%d cu:%d\n",
            curTick(), wf->wgId, kern_id, wf->computeUnit->cu_id);

    if (task->numWgCompleted() == task->numWgTotal()) {
        // Notify the HSA PP that this kernel is complete
        gpuCmdProc->hsaPacketProc()
            .finishPkt(task->dispPktPtr(), task->queueId());
        if (task->completionSignal()) {
            // The signal value is stored 8 bytes from the actual
            // signal handle in the runtime
            Addr signal_addr = task->completionSignal() + sizeof(Addr);
            DPRINTF(GPUDisp, "HSA AQL Kernel Complete! Triggering "
                    "completion signal: %x!\n", signal_addr);

            /**
             * HACK: The semantics of an HSA signal are to decrement
             * the current signal value. We cheat here and read out
             * the value from main memory using a functional access,
             * then just DMA the decremented value. This is because
             * the DMA controller does not currently support GPU
             * atomics.
             */
            auto *tc = gpuCmdProc->system()->threads[0];
            auto &virt_proxy = tc->getVirtProxy();
            TypedBufferArg<Addr> prev_signal(signal_addr);
            prev_signal.copyIn(virt_proxy);

            Addr *new_signal = new Addr;
            *new_signal = (Addr)*prev_signal - 1;

            gpuCmdProc->dmaWriteVirt(signal_addr, sizeof(Addr), nullptr,
                                     new_signal, 0);
        } else {
            DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
                    "signal\n");
        }

        DPRINTF(GPUWgLatency, "Kernel Complete ticks:%d kernel:%d\n",
                curTick(), kern_id);
        DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id);
    }

    if (!tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->clockPeriod());
    }
}

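/**
 * Schedule a dispatch attempt on the next shader cycle, unless a tick
 * event is already pending.
 */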
void
GPUDispatcher::scheduleDispatch()
{
    if (!tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->clockPeriod());
    }
}

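/**
 * Standard gem5 factory method: the Python-generated params object
 * instantiates the dispatcher SimObject.
 */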
GPUDispatcher *GPUDispatcherParams::create()
{
    return new GPUDispatcher(this);
}