/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Brad Beckmann, Marc Orr
 */
#include "gpu-compute/dispatcher.hh"

#include <cstring>

#include "cpu/base.hh"
#include "debug/GPUDisp.hh"
#include "gpu-compute/cl_driver.hh"
#include "gpu-compute/cl_event.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet_access.hh"
47 GpuDispatcher
*GpuDispatcher::instance
= nullptr;
/**
 * Construct the dispatcher as a DMA-capable PIO device.
 *
 * Registers a master ID with the system, claims a 4KB PIO aperture at
 * p->pio_addr, wires itself up to the shader and the CL driver via their
 * handshake() hooks, and creates the TLB translation port.  The tick event
 * (which runs exec()) is scheduled once at tick 0 to kick things off.
 */
GpuDispatcher::GpuDispatcher(const Params *p)
    : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")),
      pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
      dispatchCount(0), dispatchActive(false), cpu(p->cpu),
      shader(p->shader_pointer), driver(p->cl_driver),
      tickEvent([this]{ exec(); }, "GPU Dispatcher tick",
                false, Event::CPU_Tick_Pri)
{
    // let the shader and the driver know who their dispatcher is
    shader->handshake(this);
    driver->handshake(this);

    // no kernel in flight yet
    ndRange.wg_disp_rem = false;
    ndRange.globalWgId = 0;

    schedule(&tickEvent, 0);

    // translation port for the dispatcher
    // NOTE: the original format string was "%s-port%d" with no argument
    // supplied for %d; there is only one such port, so drop the index.
    tlbPort = new TLBPort(csprintf("%s-port", name()), this);
}
69 .name(name() + ".num_kernel_launched")
70 .desc("number of kernel launched")
74 GpuDispatcher
*GpuDispatcherParams::create()
76 GpuDispatcher
*dispatcher
= new GpuDispatcher(this);
77 GpuDispatcher::setInstance(dispatcher
);
79 return GpuDispatcher::getInstance();
83 GpuDispatcher::serialize(CheckpointOut
&cp
) const
87 if (ndRange
.wg_disp_rem
)
88 fatal("Checkpointing not supported during active workgroup execution");
90 if (tickEvent
.scheduled())
91 event_tick
= tickEvent
.when();
93 SERIALIZE_SCALAR(event_tick
);
98 GpuDispatcher::unserialize(CheckpointIn
&cp
)
102 if (tickEvent
.scheduled())
103 deschedule(&tickEvent
);
105 UNSERIALIZE_SCALAR(event_tick
);
108 schedule(&tickEvent
, event_tick
);
112 GpuDispatcher::getAddrRanges() const
114 AddrRangeList ranges
;
116 DPRINTF(GPUDisp
, "dispatcher registering addr range at %#x size %#x\n",
119 ranges
.push_back(RangeSize(pioAddr
, pioSize
));
125 GpuDispatcher::read(PacketPtr pkt
)
127 assert(pkt
->getAddr() >= pioAddr
);
128 assert(pkt
->getAddr() < pioAddr
+ pioSize
);
130 int offset
= pkt
->getAddr() - pioAddr
;
133 DPRINTF(GPUDisp
, " read register %#x size=%d\n", offset
, pkt
->getSize());
137 assert(pkt
->getSize() == 8);
139 uint64_t retval
= dispatchActive
;
143 assert(offset
+ pkt
->getSize() < sizeof(HsaQueueEntry
));
144 char *curTaskPtr
= (char*)&curTask
;
146 memcpy(pkt
->getPtr
<const void*>(), curTaskPtr
+ offset
, pkt
->getSize());
149 pkt
->makeAtomicResponse();
155 GpuDispatcher::write(PacketPtr pkt
)
157 assert(pkt
->getAddr() >= pioAddr
);
158 assert(pkt
->getAddr() < pioAddr
+ pioSize
);
160 int offset
= pkt
->getAddr() - pioAddr
;
163 uint64_t data_val
= 0;
165 switch (pkt
->getSize()) {
167 data_val
= pkt
->get
<uint8_t>();
170 data_val
= pkt
->get
<uint16_t>();
173 data_val
= pkt
->get
<uint32_t>();
176 data_val
= pkt
->get
<uint64_t>();
179 DPRINTF(GPUDisp
, "bad size %d\n", pkt
->getSize());
182 DPRINTF(GPUDisp
, "write register %#x value %#x size=%d\n", offset
, data_val
,
186 static int nextId
= 0;
188 // The depends field of the qstruct, which was previously unused, is
189 // used to communicate with simulated application.
190 if (curTask
.depends
) {
192 shader
->ReadMem((uint64_t)(curTask
.depends
), &hs
,
193 sizeof(HostState
), 0);
195 // update event start time (in nano-seconds)
196 uint64_t start
= curTick() / 1000;
198 shader
->WriteMem((uint64_t)(&((_cl_event
*)hs
.event
)->start
),
199 &start
, sizeof(uint64_t), 0);
203 ++num_kernelLaunched
;
205 NDRange
*ndr
= &(ndRangeMap
[nextId
]);
206 // copy dispatch info
209 // update the numDispTask polled by the runtime
210 accessUserVar(cpu
, (uint64_t)(curTask
.numDispLeft
), 0, 1);
214 for (int i
= 0; i
< 3; ++i
) {
216 ndr
->numWg
[i
] = divCeil(curTask
.gdSize
[i
], curTask
.wgSize
[i
]);
217 ndr
->numWgTotal
*= ndr
->numWg
[i
];
220 ndr
->numWgCompleted
= 0;
222 ndr
->wg_disp_rem
= true;
223 ndr
->execDone
= false;
224 ndr
->addrToNotify
= (volatile bool*)curTask
.addrToNotify
;
225 ndr
->numDispLeft
= (volatile uint32_t*)curTask
.numDispLeft
;
226 ndr
->dispatchId
= nextId
;
227 ndr
->curCid
= pkt
->req
->contextId();
228 DPRINTF(GPUDisp
, "launching kernel %d\n",nextId
);
229 execIds
.push(nextId
);
232 dispatchActive
= true;
234 if (!tickEvent
.scheduled()) {
235 schedule(&tickEvent
, curTick() + shader
->ticks(1));
238 // populate current task struct
239 // first 64 bits are launch reg
241 assert(offset
< sizeof(HsaQueueEntry
));
242 char *curTaskPtr
= (char*)&curTask
;
243 memcpy(curTaskPtr
+ offset
, pkt
->getPtr
<const void*>(), pkt
->getSize());
246 pkt
->makeAtomicResponse();
/**
 * Resolve a master-port name: "translation_port" maps to the TLB port
 * created in the constructor; everything else is delegated to DmaDevice.
 */
BaseMasterPort&
GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)
{
    if (if_name == "translation_port") {
        return *tlbPort;
    }

    return DmaDevice::getMasterPort(if_name, idx);
}
263 GpuDispatcher::exec()
267 // There are potentially multiple outstanding kernel launches.
268 // It is possible that the workgroups in a different kernel
269 // can fit on the GPU even if another kernel's workgroups cannot
270 DPRINTF(GPUDisp
, "Launching %d Kernels\n", execIds
.size());
272 while (execIds
.size() > fail_count
) {
273 int execId
= execIds
.front();
275 while (ndRangeMap
[execId
].wg_disp_rem
) {
276 //update the thread context
277 shader
->updateContext(ndRangeMap
[execId
].curCid
);
279 // attempt to dispatch_workgroup
280 if (!shader
->dispatch_workgroups(&ndRangeMap
[execId
])) {
281 // if we failed try the next kernel,
282 // it may have smaller workgroups.
283 // put it on the queue to rety latter
284 DPRINTF(GPUDisp
, "kernel %d failed to launch\n", execId
);
285 execIds
.push(execId
);
290 // let's try the next kernel_id
294 DPRINTF(GPUDisp
, "Returning %d Kernels\n", doneIds
.size());
296 if (doneIds
.size() && cpu
) {
297 shader
->hostWakeUp(cpu
);
300 while (doneIds
.size()) {
301 // wakeup the CPU if any Kernels completed this cycle
302 DPRINTF(GPUDisp
, "WorkGroup %d completed\n", doneIds
.front());
/**
 * Called by a wavefront when its workgroup finishes.
 *
 * Bumps the completed-workgroup count for the owning kernel; when the
 * last workgroup completes, marks the kernel done, notifies the host
 * through addrToNotify / numDispLeft, stamps the CL event end time,
 * and re-arms the tick event so exec() can report completion.
 */
void
GpuDispatcher::notifyWgCompl(Wavefront *w)
{
    int kern_id = w->kernId;
    DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
    assert(ndRangeMap[kern_id].dispatchId == kern_id);
    ndRangeMap[kern_id].numWgCompleted++;

    if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
        ndRangeMap[kern_id].execDone = true;
        doneIds.push(kern_id);

        // signal the host-side completion flag, if one was registered
        if (ndRangeMap[kern_id].addrToNotify) {
            accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
                          0);
        }

        // decrement the runtime's outstanding-dispatch counter
        accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);

        // update event end time (in nano-seconds)
        if (ndRangeMap[kern_id].q.depends) {
            HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
            uint64_t event;
            shader->ReadMem((uint64_t)(&host_state->event), &event,
                            sizeof(uint64_t), 0);

            uint64_t end = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
                             sizeof(uint64_t), 0);
        }
    }

    if (!tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->ticks(1));
    }
}
346 GpuDispatcher::scheduleDispatch()
348 if (!tickEvent
.scheduled())
349 schedule(&tickEvent
, curTick() + shader
->ticks(1));
/**
 * Read-modify-write (or plain write) of a host-visible integer variable
 * through the shader's memory path.
 *
 * @param cpu   host CPU; panics if null since there is no one to access
 * @param addr  guest address of the int variable
 * @param val   value to write when off == 0
 * @param off   when nonzero, the variable is first read and `off` is
 *              added to it before writing back (atomic-style increment)
 */
void
GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
{
    if (cpu) {
        if (off) {
            shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
                              true);
            val += off;
        }

        shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
    } else {
        panic("Cannot find host");
    }
}
368 // helper functions for driver to retrieve GPU attributes
370 GpuDispatcher::getNumCUs()
372 return shader
->cuList
.size();
376 GpuDispatcher::wfSize() const
378 return shader
->cuList
[0]->wfSize();
382 GpuDispatcher::setFuncargsSize(int funcargs_size
)
384 shader
->funcargs_size
= funcargs_size
;
/// Static context size of a wavefront, queried from the first wavefront
/// of the first compute unit.
uint32_t
GpuDispatcher::getStaticContextSize() const
{
    return shader->cuList[0]->wfList[0][0]->getStaticContextSize();
}