/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Brad Beckmann, Marc Orr
 */
37 #include "gpu-compute/dispatcher.hh"
39 #include "cpu/base.hh"
40 #include "debug/GPUDisp.hh"
41 #include "gpu-compute/cl_driver.hh"
42 #include "gpu-compute/cl_event.hh"
43 #include "gpu-compute/shader.hh"
44 #include "gpu-compute/wavefront.hh"
45 #include "mem/packet_access.hh"
47 GpuDispatcher
*GpuDispatcher::instance
= nullptr;
49 GpuDispatcher::GpuDispatcher(const Params
*p
)
50 : DmaDevice(p
), _masterId(p
->system
->getMasterId(name() + ".disp")),
51 pioAddr(p
->pio_addr
), pioSize(4096), pioDelay(p
->pio_latency
),
52 dispatchCount(0), dispatchActive(false), cpu(p
->cpu
),
53 shader(p
->shader_pointer
), driver(p
->cl_driver
), tickEvent(this)
55 shader
->handshake(this);
56 driver
->handshake(this);
58 ndRange
.wg_disp_rem
= false;
59 ndRange
.globalWgId
= 0;
61 schedule(&tickEvent
, 0);
63 // translation port for the dispatcher
64 tlbPort
= new TLBPort(csprintf("%s-port%d", name()), this);
67 .name(name() + ".num_kernel_launched")
68 .desc("number of kernel launched")
72 GpuDispatcher
*GpuDispatcherParams::create()
74 GpuDispatcher
*dispatcher
= new GpuDispatcher(this);
75 GpuDispatcher::setInstance(dispatcher
);
77 return GpuDispatcher::getInstance();
81 GpuDispatcher::serialize(CheckpointOut
&cp
) const
85 if (ndRange
.wg_disp_rem
)
86 fatal("Checkpointing not supported during active workgroup execution");
88 if (tickEvent
.scheduled())
89 event_tick
= tickEvent
.when();
91 SERIALIZE_SCALAR(event_tick
);
96 GpuDispatcher::unserialize(CheckpointIn
&cp
)
100 if (tickEvent
.scheduled())
101 deschedule(&tickEvent
);
103 UNSERIALIZE_SCALAR(event_tick
);
106 schedule(&tickEvent
, event_tick
);
110 GpuDispatcher::getAddrRanges() const
112 AddrRangeList ranges
;
114 DPRINTF(GPUDisp
, "dispatcher registering addr range at %#x size %#x\n",
117 ranges
.push_back(RangeSize(pioAddr
, pioSize
));
123 GpuDispatcher::read(PacketPtr pkt
)
125 assert(pkt
->getAddr() >= pioAddr
);
126 assert(pkt
->getAddr() < pioAddr
+ pioSize
);
128 int offset
= pkt
->getAddr() - pioAddr
;
131 DPRINTF(GPUDisp
, " read register %#x size=%d\n", offset
, pkt
->getSize());
135 assert(pkt
->getSize() == 8);
137 uint64_t retval
= dispatchActive
;
141 assert(offset
+ pkt
->getSize() < sizeof(HsaQueueEntry
));
142 char *curTaskPtr
= (char*)&curTask
;
144 memcpy(pkt
->getPtr
<const void*>(), curTaskPtr
+ offset
, pkt
->getSize());
147 pkt
->makeAtomicResponse();
153 GpuDispatcher::write(PacketPtr pkt
)
155 assert(pkt
->getAddr() >= pioAddr
);
156 assert(pkt
->getAddr() < pioAddr
+ pioSize
);
158 int offset
= pkt
->getAddr() - pioAddr
;
161 uint64_t data_val
= 0;
163 switch (pkt
->getSize()) {
165 data_val
= pkt
->get
<uint8_t>();
168 data_val
= pkt
->get
<uint16_t>();
171 data_val
= pkt
->get
<uint32_t>();
174 data_val
= pkt
->get
<uint64_t>();
177 DPRINTF(GPUDisp
, "bad size %d\n", pkt
->getSize());
180 DPRINTF(GPUDisp
, "write register %#x value %#x size=%d\n", offset
, data_val
,
184 static int nextId
= 0;
186 // The depends field of the qstruct, which was previously unused, is
187 // used to communicate with simulated application.
188 if (curTask
.depends
) {
190 shader
->ReadMem((uint64_t)(curTask
.depends
), &hs
,
191 sizeof(HostState
), 0);
193 // update event start time (in nano-seconds)
194 uint64_t start
= curTick() / 1000;
196 shader
->WriteMem((uint64_t)(&((_cl_event
*)hs
.event
)->start
),
197 &start
, sizeof(uint64_t), 0);
201 ++num_kernelLaunched
;
203 NDRange
*ndr
= &(ndRangeMap
[nextId
]);
204 // copy dispatch info
207 // update the numDispTask polled by the runtime
208 accessUserVar(cpu
, (uint64_t)(curTask
.numDispLeft
), 0, 1);
212 for (int i
= 0; i
< 3; ++i
) {
214 ndr
->numWg
[i
] = divCeil(curTask
.gdSize
[i
], curTask
.wgSize
[i
]);
215 ndr
->numWgTotal
*= ndr
->numWg
[i
];
218 ndr
->numWgCompleted
= 0;
220 ndr
->wg_disp_rem
= true;
221 ndr
->execDone
= false;
222 ndr
->addrToNotify
= (volatile bool*)curTask
.addrToNotify
;
223 ndr
->numDispLeft
= (volatile uint32_t*)curTask
.numDispLeft
;
224 ndr
->dispatchId
= nextId
;
225 ndr
->curTid
= pkt
->req
->threadId();
226 DPRINTF(GPUDisp
, "launching kernel %d\n",nextId
);
227 execIds
.push(nextId
);
230 dispatchActive
= true;
232 if (!tickEvent
.scheduled()) {
233 schedule(&tickEvent
, curTick() + shader
->ticks(1));
236 // populate current task struct
237 // first 64 bits are launch reg
239 assert(offset
< sizeof(HsaQueueEntry
));
240 char *curTaskPtr
= (char*)&curTask
;
241 memcpy(curTaskPtr
+ offset
, pkt
->getPtr
<const void*>(), pkt
->getSize());
244 pkt
->makeAtomicResponse();
251 GpuDispatcher::getMasterPort(const std::string
&if_name
, PortID idx
)
253 if (if_name
== "translation_port") {
257 return DmaDevice::getMasterPort(if_name
, idx
);
261 GpuDispatcher::exec()
265 // There are potentially multiple outstanding kernel launches.
266 // It is possible that the workgroups in a different kernel
267 // can fit on the GPU even if another kernel's workgroups cannot
268 DPRINTF(GPUDisp
, "Launching %d Kernels\n", execIds
.size());
270 while (execIds
.size() > fail_count
) {
271 int execId
= execIds
.front();
273 while (ndRangeMap
[execId
].wg_disp_rem
) {
274 //update the thread context
275 shader
->updateThreadContext(ndRangeMap
[execId
].curTid
);
277 // attempt to dispatch_workgroup
278 if (!shader
->dispatch_workgroups(&ndRangeMap
[execId
])) {
279 // if we failed try the next kernel,
280 // it may have smaller workgroups.
281 // put it on the queue to rety latter
282 DPRINTF(GPUDisp
, "kernel %d failed to launch\n", execId
);
283 execIds
.push(execId
);
288 // let's try the next kernel_id
292 DPRINTF(GPUDisp
, "Returning %d Kernels\n", doneIds
.size());
294 if (doneIds
.size() && cpu
) {
295 shader
->hostWakeUp(cpu
);
298 while (doneIds
.size()) {
299 // wakeup the CPU if any Kernels completed this cycle
300 DPRINTF(GPUDisp
, "WorkGroup %d completed\n", doneIds
.front());
306 GpuDispatcher::notifyWgCompl(Wavefront
*w
)
308 int kern_id
= w
->kern_id
;
309 DPRINTF(GPUDisp
, "notify WgCompl %d\n",kern_id
);
310 assert(ndRangeMap
[kern_id
].dispatchId
== kern_id
);
311 ndRangeMap
[kern_id
].numWgCompleted
++;
313 if (ndRangeMap
[kern_id
].numWgCompleted
== ndRangeMap
[kern_id
].numWgTotal
) {
314 ndRangeMap
[kern_id
].execDone
= true;
315 doneIds
.push(kern_id
);
317 if (ndRangeMap
[kern_id
].addrToNotify
) {
318 accessUserVar(cpu
, (uint64_t)(ndRangeMap
[kern_id
].addrToNotify
), 1,
322 accessUserVar(cpu
, (uint64_t)(ndRangeMap
[kern_id
].numDispLeft
), 0, -1);
324 // update event end time (in nano-seconds)
325 if (ndRangeMap
[kern_id
].q
.depends
) {
326 HostState
*host_state
= (HostState
*)ndRangeMap
[kern_id
].q
.depends
;
328 shader
->ReadMem((uint64_t)(&host_state
->event
), &event
,
329 sizeof(uint64_t), 0);
331 uint64_t end
= curTick() / 1000;
333 shader
->WriteMem((uint64_t)(&((_cl_event
*)event
)->end
), &end
,
334 sizeof(uint64_t), 0);
338 if (!tickEvent
.scheduled()) {
339 schedule(&tickEvent
, curTick() + shader
->ticks(1));
344 GpuDispatcher::scheduleDispatch()
346 if (!tickEvent
.scheduled())
347 schedule(&tickEvent
, curTick() + shader
->ticks(1));
351 GpuDispatcher::accessUserVar(BaseCPU
*cpu
, uint64_t addr
, int val
, int off
)
355 shader
->AccessMem(addr
, &val
, sizeof(int), 0, MemCmd::ReadReq
,
360 shader
->AccessMem(addr
, &val
, sizeof(int), 0, MemCmd::WriteReq
, true);
362 panic("Cannot find host");
366 GpuDispatcher::TickEvent::TickEvent(GpuDispatcher
*_dispatcher
)
367 : Event(CPU_Tick_Pri
), dispatcher(_dispatcher
)
372 GpuDispatcher::TickEvent::process()
378 GpuDispatcher::TickEvent::description() const
380 return "GPU Dispatcher tick";
383 // helper functions for driver to retrieve GPU attributes
385 GpuDispatcher::getNumCUs()
387 return shader
->cuList
.size();
391 GpuDispatcher::setFuncargsSize(int funcargs_size
)
393 shader
->funcargs_size
= funcargs_size
;